unique-search-proxy 2026.26.0.dev9__tar.gz → 2026.26.0.dev11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/PKG-INFO +2 -2
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/pyproject.toml +2 -2
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/api/v1/crawl.py +46 -4
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/app.py +18 -15
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/vertexai/client.py +4 -3
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/client/service.py +6 -3
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/service.py +43 -14
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/firecrawl/service.py +4 -3
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/jina/service.py +2 -1
- unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/core/crawlers/pinned_egress.py +22 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/tavily/service.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/provider_response.py +10 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/brave/service.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/google/service.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/perplexity/service.py +2 -1
- unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/core/url_safety/__init__.py +13 -0
- unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/core/url_safety/gate.py +76 -0
- unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/logging_config.py +39 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/base.py +2 -3
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/client.py +5 -4
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/base.py +20 -5
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/brave.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/firecrawl.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/google.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/jina.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/perplexity.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/tavily.py +2 -1
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/vertexai_agent.py +2 -1
- unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/settings/secret_str.py +41 -0
- unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/settings/startup_log.py +29 -0
- unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/settings/startup_report.py +138 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/README.md +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/api/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/api/health.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/api/v1/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/api/v1/agent_search.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/api/v1/configuration.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/api/v1/openapi_examples.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/api/v1/search.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/bing/client.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/bing/runner.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/bing/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/factory.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/serialization.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/service_base.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/structured_output.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/vertexai/gemini.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/agent_engines/vertexai/service.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/client/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/processing/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/processing/errors.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/processing/html_markdown.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/html.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/pdf.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/plain_text.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/processing/registry.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/settings.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/basic/user_agent.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/factory.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/firecrawl/polling.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/firecrawl/request_body.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/jina/request_body.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/crawlers/tavily/request_body.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/providers.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/registry.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/brave/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/brave/pagination.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/brave/query_params.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/descriptor.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/factory.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/google/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/google/pagination.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/google/query_params.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/pagination.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/perplexity/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/perplexity/request_body.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/core/search_engines/service_base.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/error_handlers.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/monitoring/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/monitoring/metrics.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/monitoring/setup.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/presets/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/presets/common.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/presets/crawl.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/presets/search.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/presets/types.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/monitoring.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/settings/providers/bing_agent.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/utils/__init__.py +0 -0
- {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev11}/unique_search_proxy_client/web/utils/url.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: unique-search-proxy
|
|
3
|
-
Version: 2026.26.0.
|
|
3
|
+
Version: 2026.26.0.dev11
|
|
4
4
|
Summary: Web Search Proxy implementation
|
|
5
5
|
Author: ThePhilAz
|
|
6
6
|
Author-email: ThePhilAz <rami.azouz@philico.com>
|
|
@@ -19,7 +19,7 @@ Requires-Dist: certifi>=2025.11.12,<2027
|
|
|
19
19
|
Requires-Dist: google-genai>=1.73.0,<2
|
|
20
20
|
Requires-Dist: google-auth>=2.43.0,<3
|
|
21
21
|
Requires-Dist: unique-toolkit[monitoring]>=2026.26.0.dev11,<2026.26.0rc0
|
|
22
|
-
Requires-Dist: unique-search-proxy-core>=2026.26.0.
|
|
22
|
+
Requires-Dist: unique-search-proxy-core>=2026.26.0.dev7,<2026.26.0rc0
|
|
23
23
|
Requires-Python: >=3.12
|
|
24
24
|
Description-Content-Type: text/markdown
|
|
25
25
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "unique-search-proxy"
|
|
3
|
-
version = "2026.26.0.
|
|
3
|
+
version = "2026.26.0.dev11"
|
|
4
4
|
description = "Web Search Proxy implementation"
|
|
5
5
|
authors = [{ name = "ThePhilAz", email = "rami.azouz@philico.com" }]
|
|
6
6
|
readme = "README.md"
|
|
@@ -21,7 +21,7 @@ dependencies = [
|
|
|
21
21
|
"google-genai>=1.73.0,<2",
|
|
22
22
|
"google-auth>=2.43.0,<3",
|
|
23
23
|
"unique-toolkit[monitoring]>=2026.26.0.dev11,<2026.26.0rc0",
|
|
24
|
-
"unique-search-proxy-core>=2026.26.0.
|
|
24
|
+
"unique-search-proxy-core>=2026.26.0.dev7,<2026.26.0rc0",
|
|
25
25
|
]
|
|
26
26
|
|
|
27
27
|
[dependency-groups]
|
|
@@ -18,6 +18,13 @@ from unique_search_proxy_client.web.api.v1.openapi_examples import (
|
|
|
18
18
|
)
|
|
19
19
|
from unique_search_proxy_client.web.core.client import get_http_client_pool
|
|
20
20
|
from unique_search_proxy_client.web.core.crawlers.factory import get_crawler_service
|
|
21
|
+
from unique_search_proxy_client.web.core.crawlers.pinned_egress import (
|
|
22
|
+
PinnedEgressCrawler,
|
|
23
|
+
)
|
|
24
|
+
from unique_search_proxy_client.web.core.url_safety.gate import (
|
|
25
|
+
apply_url_safety_gate,
|
|
26
|
+
merge_crawl_results,
|
|
27
|
+
)
|
|
21
28
|
from unique_search_proxy_client.web.monitoring.metrics import (
|
|
22
29
|
record_crawl_error,
|
|
23
30
|
record_crawl_success,
|
|
@@ -49,10 +56,38 @@ async def crawl(
|
|
|
49
56
|
started = time.perf_counter()
|
|
50
57
|
|
|
51
58
|
try:
|
|
52
|
-
pool = get_http_client_pool(request.app)
|
|
53
|
-
crawler = get_crawler_service(crawler_id, http_client=pool.client)
|
|
54
59
|
async with asyncio.timeout(timeout):
|
|
55
|
-
|
|
60
|
+
gate = await apply_url_safety_gate(body.urls)
|
|
61
|
+
if not gate.allowed_targets:
|
|
62
|
+
record_crawl_success(
|
|
63
|
+
crawler_id,
|
|
64
|
+
len(body.urls),
|
|
65
|
+
time.perf_counter() - started,
|
|
66
|
+
)
|
|
67
|
+
return CrawlResponse(
|
|
68
|
+
crawler=crawler_id,
|
|
69
|
+
results=merge_crawl_results(
|
|
70
|
+
body.urls,
|
|
71
|
+
blocked_by_index=gate.blocked_by_index,
|
|
72
|
+
crawler_results=[],
|
|
73
|
+
),
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
crawl_body = body.model_copy(
|
|
77
|
+
update={
|
|
78
|
+
"urls": [target.display_url for target in gate.allowed_targets],
|
|
79
|
+
},
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
pool = get_http_client_pool(request.app)
|
|
83
|
+
crawler = get_crawler_service(crawler_id, http_client=pool.client)
|
|
84
|
+
if isinstance(crawler, PinnedEgressCrawler):
|
|
85
|
+
crawler_results = await crawler.crawl_pinned(
|
|
86
|
+
crawl_body,
|
|
87
|
+
gate.allowed_targets,
|
|
88
|
+
)
|
|
89
|
+
else:
|
|
90
|
+
crawler_results = await crawler.crawl(crawl_body)
|
|
56
91
|
except TimeoutError as exc:
|
|
57
92
|
record_crawl_error(
|
|
58
93
|
crawler_id,
|
|
@@ -76,4 +111,11 @@ async def crawl(
|
|
|
76
111
|
raise
|
|
77
112
|
|
|
78
113
|
record_crawl_success(crawler_id, len(body.urls), time.perf_counter() - started)
|
|
79
|
-
return CrawlResponse(
|
|
114
|
+
return CrawlResponse(
|
|
115
|
+
crawler=crawler_id,
|
|
116
|
+
results=merge_crawl_results(
|
|
117
|
+
body.urls,
|
|
118
|
+
blocked_by_index=gate.blocked_by_index,
|
|
119
|
+
crawler_results=crawler_results,
|
|
120
|
+
),
|
|
121
|
+
)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
import
|
|
4
|
+
import sys
|
|
5
5
|
from contextlib import asynccontextmanager
|
|
6
6
|
|
|
7
7
|
from dotenv import load_dotenv
|
|
@@ -13,22 +13,19 @@ from unique_search_proxy_client.web.api import health_router, v1_router
|
|
|
13
13
|
from unique_search_proxy_client.web.core.client.service import create_http_client_pool
|
|
14
14
|
from unique_search_proxy_client.web.core.providers import register_builtin_providers
|
|
15
15
|
from unique_search_proxy_client.web.error_handlers import register_exception_handlers
|
|
16
|
+
from unique_search_proxy_client.web.logging_config import (
|
|
17
|
+
build_logging_config,
|
|
18
|
+
configure_logging,
|
|
19
|
+
)
|
|
16
20
|
from unique_search_proxy_client.web.monitoring import setup_prometheus
|
|
21
|
+
from unique_search_proxy_client.web.settings.startup_report import (
|
|
22
|
+
log_startup_settings_report,
|
|
23
|
+
)
|
|
17
24
|
|
|
18
|
-
|
|
25
|
+
if "pytest" not in sys.modules:
|
|
26
|
+
load_dotenv()
|
|
19
27
|
|
|
20
|
-
|
|
21
|
-
def _configure_logging() -> None:
|
|
22
|
-
level_name = os.getenv("LOG_LEVEL", "INFO").upper()
|
|
23
|
-
level = getattr(logging, level_name, logging.INFO)
|
|
24
|
-
logging.basicConfig(
|
|
25
|
-
level=level,
|
|
26
|
-
format="%(levelname)s %(name)s: %(message)s",
|
|
27
|
-
force=True,
|
|
28
|
-
)
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
_configure_logging()
|
|
28
|
+
configure_logging()
|
|
32
29
|
suppress_httpx_request_logs()
|
|
33
30
|
|
|
34
31
|
_LOGGER = logging.getLogger(__name__)
|
|
@@ -52,6 +49,7 @@ logging.getLogger("uvicorn.access").addFilter(HealthCheckFilter())
|
|
|
52
49
|
@asynccontextmanager
|
|
53
50
|
async def lifespan(app: FastAPI):
|
|
54
51
|
_LOGGER.info("Starting Unique Search Proxy...")
|
|
52
|
+
log_startup_settings_report(_LOGGER)
|
|
55
53
|
pool = await create_http_client_pool()
|
|
56
54
|
app.state.http_client_pool = pool
|
|
57
55
|
try:
|
|
@@ -95,4 +93,9 @@ app = create_app()
|
|
|
95
93
|
if __name__ == "__main__":
|
|
96
94
|
import uvicorn
|
|
97
95
|
|
|
98
|
-
uvicorn.run(
|
|
96
|
+
uvicorn.run(
|
|
97
|
+
app,
|
|
98
|
+
host="0.0.0.0",
|
|
99
|
+
port=2349,
|
|
100
|
+
log_config=build_logging_config(),
|
|
101
|
+
)
|
|
@@ -8,6 +8,7 @@ from google.auth import load_credentials_from_dict
|
|
|
8
8
|
from google.genai._api_client import BaseApiClient
|
|
9
9
|
from google.genai.client import AsyncClient
|
|
10
10
|
|
|
11
|
+
from unique_search_proxy_client.web.settings.providers.base import read_secret
|
|
11
12
|
from unique_search_proxy_client.web.settings.providers.vertexai_agent import (
|
|
12
13
|
vertexai_agent_credentials,
|
|
13
14
|
)
|
|
@@ -22,9 +23,9 @@ def _get_base_api_client_from_service_account() -> BaseApiClient:
|
|
|
22
23
|
"https://www.googleapis.com/auth/cloud-platform",
|
|
23
24
|
]
|
|
24
25
|
service_account_info = json.loads(
|
|
25
|
-
b64decode(
|
|
26
|
-
|
|
27
|
-
),
|
|
26
|
+
b64decode(
|
|
27
|
+
read_secret(vertexai_agent_credentials.service_account_credentials)
|
|
28
|
+
).decode("utf-8"),
|
|
28
29
|
)
|
|
29
30
|
credentials, project_id = load_credentials_from_dict(
|
|
30
31
|
service_account_info,
|
|
@@ -14,6 +14,8 @@ from unique_search_proxy_client.web.settings.client import (
|
|
|
14
14
|
ProxyConfig,
|
|
15
15
|
http_client_settings,
|
|
16
16
|
)
|
|
17
|
+
from unique_search_proxy_client.web.settings.providers.base import read_secret
|
|
18
|
+
from unique_search_proxy_client.web.settings.secret_str import read_secret_headers
|
|
17
19
|
|
|
18
20
|
if TYPE_CHECKING:
|
|
19
21
|
from fastapi import FastAPI
|
|
@@ -51,7 +53,8 @@ def _build_proxy_url_with_username_password(settings: HttpClientSettings) -> str
|
|
|
51
53
|
raise ValueError("Proxy username and password are required")
|
|
52
54
|
return (
|
|
53
55
|
f"{settings.proxy_protocol}://"
|
|
54
|
-
f"{proxy_username}:{proxy_password}
|
|
56
|
+
f"{read_secret(proxy_username)}:{read_secret(proxy_password)}"
|
|
57
|
+
f"@{proxy_host}:{proxy_port}"
|
|
55
58
|
)
|
|
56
59
|
|
|
57
60
|
|
|
@@ -92,7 +95,7 @@ def _get_username_password_proxy_kwargs(settings: HttpClientSettings) -> ProxyCo
|
|
|
92
95
|
)
|
|
93
96
|
return ProxyConfig(
|
|
94
97
|
proxy=proxy_url,
|
|
95
|
-
headers=settings.proxy_headers,
|
|
98
|
+
headers=read_secret_headers(settings.proxy_headers) or None,
|
|
96
99
|
verify=settings.proxy_ssl_ca_bundle_path or True,
|
|
97
100
|
)
|
|
98
101
|
|
|
@@ -104,7 +107,7 @@ def _get_ssl_tls_proxy_kwargs(settings: HttpClientSettings) -> ProxyConfig:
|
|
|
104
107
|
return ProxyConfig(
|
|
105
108
|
proxy=proxy_url,
|
|
106
109
|
cert=cert_args,
|
|
107
|
-
headers=settings.proxy_headers,
|
|
110
|
+
headers=read_secret_headers(settings.proxy_headers) or None,
|
|
108
111
|
verify=settings.proxy_ssl_ca_bundle_path or True,
|
|
109
112
|
)
|
|
110
113
|
|
|
@@ -14,6 +14,11 @@ from unique_search_proxy_core.schema import (
|
|
|
14
14
|
CrawlUrlResult,
|
|
15
15
|
ProxyErrorCode,
|
|
16
16
|
)
|
|
17
|
+
from unique_search_proxy_core.url_safety import (
|
|
18
|
+
ResolvedCrawlTarget,
|
|
19
|
+
bypass_crawl_target,
|
|
20
|
+
pinned_httpx_get_args,
|
|
21
|
+
)
|
|
17
22
|
|
|
18
23
|
from unique_search_proxy_client.web.core.crawlers.basic.processing import (
|
|
19
24
|
ContentProcessingError,
|
|
@@ -27,6 +32,7 @@ from unique_search_proxy_client.web.core.provider_response import (
|
|
|
27
32
|
crawl_upstream_error,
|
|
28
33
|
transport_error_raw,
|
|
29
34
|
)
|
|
35
|
+
from unique_search_proxy_client.web.core.url_safety.gate import AllowedCrawlTarget
|
|
30
36
|
|
|
31
37
|
_LOGGER = logging.getLogger(__name__)
|
|
32
38
|
|
|
@@ -43,11 +49,30 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
43
49
|
|
|
44
50
|
crawler_id = CrawlerType.BASIC.value
|
|
45
51
|
|
|
46
|
-
async def crawl(self, request: BasicCrawlRequest) -> list[CrawlUrlResult]: # type: ignore
|
|
52
|
+
async def crawl(self, request: BasicCrawlRequest) -> list[CrawlUrlResult]: # type: ignore[override]
|
|
53
|
+
bypass_targets = [
|
|
54
|
+
AllowedCrawlTarget(
|
|
55
|
+
display_url=url.strip(),
|
|
56
|
+
resolved=bypass_crawl_target(url),
|
|
57
|
+
)
|
|
58
|
+
for url in request.urls
|
|
59
|
+
]
|
|
60
|
+
return await self.crawl_pinned(request, bypass_targets)
|
|
61
|
+
|
|
62
|
+
async def crawl_pinned(
|
|
63
|
+
self,
|
|
64
|
+
request: BasicCrawlRequest, # type: ignore[valid-type]
|
|
65
|
+
allowed_targets: list[AllowedCrawlTarget],
|
|
66
|
+
) -> list[CrawlUrlResult]:
|
|
47
67
|
client = self._http_client
|
|
48
68
|
if client is None:
|
|
49
69
|
raise RuntimeError("HTTP client is required for Basic crawler")
|
|
50
70
|
|
|
71
|
+
display_urls = list(request.urls)
|
|
72
|
+
if len(allowed_targets) != len(display_urls):
|
|
73
|
+
msg = "allowed_targets length must match request.urls length"
|
|
74
|
+
raise ValueError(msg)
|
|
75
|
+
|
|
51
76
|
timeout = request.timeout
|
|
52
77
|
semaphore = asyncio.Semaphore(request.max_concurrent_requests)
|
|
53
78
|
return list(
|
|
@@ -55,12 +80,13 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
55
80
|
*[
|
|
56
81
|
self._crawl_one(
|
|
57
82
|
client,
|
|
58
|
-
|
|
83
|
+
allowed_target.display_url,
|
|
84
|
+
resolved_target=allowed_target.resolved,
|
|
59
85
|
timeout=timeout,
|
|
60
86
|
semaphore=semaphore,
|
|
61
87
|
content_type_handlers=request.content_types.to_handlers(),
|
|
62
88
|
)
|
|
63
|
-
for
|
|
89
|
+
for allowed_target in allowed_targets
|
|
64
90
|
],
|
|
65
91
|
),
|
|
66
92
|
)
|
|
@@ -68,35 +94,38 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
68
94
|
async def _crawl_one(
|
|
69
95
|
self,
|
|
70
96
|
client: AsyncClient,
|
|
71
|
-
|
|
97
|
+
display_url: str,
|
|
72
98
|
*,
|
|
99
|
+
resolved_target: ResolvedCrawlTarget,
|
|
73
100
|
timeout: int,
|
|
74
101
|
semaphore: asyncio.Semaphore,
|
|
75
102
|
content_type_handlers: dict[str, ContentTypeHandlerPolicy],
|
|
76
103
|
) -> CrawlUrlResult:
|
|
77
|
-
request_url =
|
|
104
|
+
request_url, pin_headers, extensions = pinned_httpx_get_args(resolved_target)
|
|
78
105
|
async with semaphore:
|
|
79
|
-
headers = {"User-Agent": random_user_agent()}
|
|
106
|
+
headers = {"User-Agent": random_user_agent(), **pin_headers}
|
|
107
|
+
|
|
80
108
|
try:
|
|
81
109
|
response = await client.get(
|
|
82
110
|
request_url,
|
|
83
111
|
headers=headers,
|
|
112
|
+
extensions=extensions or None,
|
|
84
113
|
timeout=Timeout(timeout),
|
|
85
114
|
follow_redirects=True,
|
|
86
115
|
)
|
|
87
116
|
except httpx.TimeoutException as exc:
|
|
88
|
-
_LOGGER.warning("Basic crawl timed out for %s: %s",
|
|
117
|
+
_LOGGER.warning("Basic crawl timed out for %s: %s", display_url, exc)
|
|
89
118
|
return crawl_upstream_error(
|
|
90
|
-
|
|
119
|
+
display_url,
|
|
91
120
|
f"Crawl timed out after {timeout}s",
|
|
92
121
|
content_type=None,
|
|
93
122
|
code=ProxyErrorCode.UPSTREAM_TIMEOUT.value,
|
|
94
123
|
raw=transport_error_raw(exc),
|
|
95
124
|
)
|
|
96
125
|
except httpx.HTTPError as exc:
|
|
97
|
-
_LOGGER.warning("Basic crawl failed for %s: %s",
|
|
126
|
+
_LOGGER.warning("Basic crawl failed for %s: %s", display_url, exc)
|
|
98
127
|
return crawl_upstream_error(
|
|
99
|
-
|
|
128
|
+
display_url,
|
|
100
129
|
str(exc),
|
|
101
130
|
content_type=None,
|
|
102
131
|
raw=transport_error_raw(exc),
|
|
@@ -107,11 +136,11 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
107
136
|
if response.is_error:
|
|
108
137
|
_LOGGER.warning(
|
|
109
138
|
"Basic crawl HTTP error for %s: %s",
|
|
110
|
-
|
|
139
|
+
display_url,
|
|
111
140
|
response.status_code,
|
|
112
141
|
)
|
|
113
142
|
return crawl_upstream_error(
|
|
114
|
-
|
|
143
|
+
display_url,
|
|
115
144
|
f"HTTP {response.status_code} while fetching URL",
|
|
116
145
|
content_type=content_type,
|
|
117
146
|
raw=raw_body,
|
|
@@ -120,7 +149,7 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
120
149
|
content = await self._maybe_process_content(
|
|
121
150
|
raw_body,
|
|
122
151
|
content_type,
|
|
123
|
-
request_url=
|
|
152
|
+
request_url=display_url,
|
|
124
153
|
timeout=timeout,
|
|
125
154
|
content_type_handlers=content_type_handlers,
|
|
126
155
|
)
|
|
@@ -128,7 +157,7 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
|
|
|
128
157
|
return content
|
|
129
158
|
|
|
130
159
|
return CrawlUrlResult(
|
|
131
|
-
url=
|
|
160
|
+
url=display_url,
|
|
132
161
|
content=content,
|
|
133
162
|
raw=raw_body,
|
|
134
163
|
content_type=content_type,
|
|
@@ -25,6 +25,7 @@ from unique_search_proxy_client.web.core.provider_response import (
|
|
|
25
25
|
upstream_error_message,
|
|
26
26
|
upstream_response_raw,
|
|
27
27
|
)
|
|
28
|
+
from unique_search_proxy_client.web.settings.providers.base import read_secret
|
|
28
29
|
from unique_search_proxy_client.web.settings.providers.firecrawl import (
|
|
29
30
|
firecrawl_crawl_credentials as credentials,
|
|
30
31
|
)
|
|
@@ -90,7 +91,7 @@ class FirecrawlCrawlerService(BaseCrawler[FirecrawlCrawlRequest]):
|
|
|
90
91
|
response = await client.post(
|
|
91
92
|
credentials.scrape_endpoint,
|
|
92
93
|
json=body,
|
|
93
|
-
headers=_firecrawl_headers(credentials.api_key),
|
|
94
|
+
headers=_firecrawl_headers(read_secret(credentials.api_key)),
|
|
94
95
|
timeout=timeout,
|
|
95
96
|
)
|
|
96
97
|
except httpx.TimeoutException as exc:
|
|
@@ -149,7 +150,7 @@ class FirecrawlCrawlerService(BaseCrawler[FirecrawlCrawlRequest]):
|
|
|
149
150
|
start_response = await client.post(
|
|
150
151
|
credentials.batch_scrape_endpoint,
|
|
151
152
|
json=body,
|
|
152
|
-
headers=_firecrawl_headers(credentials.api_key),
|
|
153
|
+
headers=_firecrawl_headers(read_secret(credentials.api_key)),
|
|
153
154
|
timeout=timeout,
|
|
154
155
|
)
|
|
155
156
|
except httpx.TimeoutException as exc:
|
|
@@ -200,7 +201,7 @@ class FirecrawlCrawlerService(BaseCrawler[FirecrawlCrawlRequest]):
|
|
|
200
201
|
final_payload = await poll_batch_scrape(
|
|
201
202
|
client,
|
|
202
203
|
status_url=status_url,
|
|
203
|
-
api_key=credentials.api_key,
|
|
204
|
+
api_key=read_secret(credentials.api_key),
|
|
204
205
|
deadline=deadline,
|
|
205
206
|
)
|
|
206
207
|
except TimeoutError as exc:
|
|
@@ -18,6 +18,7 @@ from unique_search_proxy_client.web.core.provider_response import (
|
|
|
18
18
|
transport_error_raw,
|
|
19
19
|
upstream_response_raw,
|
|
20
20
|
)
|
|
21
|
+
from unique_search_proxy_client.web.settings.providers.base import read_secret
|
|
21
22
|
from unique_search_proxy_client.web.settings.providers.jina import (
|
|
22
23
|
jina_crawl_credentials as credentials,
|
|
23
24
|
)
|
|
@@ -70,7 +71,7 @@ class JinaCrawlerService(BaseCrawler[JinaCrawlRequest]):
|
|
|
70
71
|
|
|
71
72
|
urls = list(request.urls)
|
|
72
73
|
timeout = request.timeout
|
|
73
|
-
headers = _jina_headers(credentials.api_key)
|
|
74
|
+
headers = _jina_headers(read_secret(credentials.api_key))
|
|
74
75
|
semaphore = asyncio.Semaphore(request.max_concurrent_requests)
|
|
75
76
|
|
|
76
77
|
async def crawl_one(url: str) -> CrawlUrlResult:
|
unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/core/crawlers/pinned_egress.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Protocol, runtime_checkable
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
from unique_search_proxy_core.schema import CrawlUrlResult
|
|
7
|
+
|
|
8
|
+
from unique_search_proxy_client.web.core.url_safety.gate import AllowedCrawlTarget
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@runtime_checkable
class PinnedEgressCrawler(Protocol):
    """Structural interface for crawlers that fetch target URLs themselves.

    Implementers receive the targets already validated by the URL safety gate
    so they can reuse the gate's DNS resolution rather than resolving again.
    Because the protocol is ``runtime_checkable``, ``isinstance`` only checks
    that a ``crawl_pinned`` attribute exists, not its signature.
    """

    async def crawl_pinned(
        self,
        request: BaseModel,
        allowed_targets: list[AllowedCrawlTarget],
    ) -> list[CrawlUrlResult]:
        """Crawl the gate-approved targets and return their per-URL results.

        Args:
            request: The provider-specific crawl request model.
            allowed_targets: Targets that passed URL safety validation, each
                carrying its validated resolution.

        Returns:
            Crawl results for the allowed targets.
            NOTE(review): presumably one result per target, in the same
            order — confirm against the implementations.
        """
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
__all__ = ["PinnedEgressCrawler"]
|
|
@@ -19,6 +19,7 @@ from unique_search_proxy_client.web.core.provider_response import (
|
|
|
19
19
|
upstream_error_message,
|
|
20
20
|
upstream_response_raw,
|
|
21
21
|
)
|
|
22
|
+
from unique_search_proxy_client.web.settings.providers.base import read_secret
|
|
22
23
|
from unique_search_proxy_client.web.settings.providers.tavily import (
|
|
23
24
|
tavily_crawl_credentials as credentials,
|
|
24
25
|
)
|
|
@@ -132,7 +133,7 @@ class TavilyCrawlerService(BaseCrawler[TavilyCrawlRequest]):
|
|
|
132
133
|
response = await client.post(
|
|
133
134
|
credentials.extract_endpoint,
|
|
134
135
|
json=body,
|
|
135
|
-
headers=_tavily_headers(credentials.api_key),
|
|
136
|
+
headers=_tavily_headers(read_secret(credentials.api_key)),
|
|
136
137
|
timeout=timeout,
|
|
137
138
|
)
|
|
138
139
|
except httpx.TimeoutException as exc:
|
|
@@ -80,6 +80,16 @@ def crawl_upstream_error(
|
|
|
80
80
|
)
|
|
81
81
|
|
|
82
82
|
|
|
83
|
+
def crawl_forbidden_target(url: str, message: str) -> CrawlUrlResult:
    """Return the per-URL failure used when a URL violates the safety policy.

    Args:
        url: The URL that was rejected.
        message: Human-readable reason for the rejection.

    Returns:
        The result built by ``crawl_upstream_error`` with the
        ``FORBIDDEN_TARGET`` error code and no content type.
    """
    forbidden_code = ProxyErrorCode.FORBIDDEN_TARGET.value
    return crawl_upstream_error(url, message, code=forbidden_code, content_type=None)
|
|
91
|
+
|
|
92
|
+
|
|
83
93
|
def raise_for_upstream_response(
|
|
84
94
|
response: httpx.Response,
|
|
85
95
|
*,
|
|
@@ -37,6 +37,7 @@ from unique_search_proxy_client.web.core.search_engines.pagination import PageRe
|
|
|
37
37
|
from unique_search_proxy_client.web.core.search_engines.service_base import (
|
|
38
38
|
SearchEngineService,
|
|
39
39
|
)
|
|
40
|
+
from unique_search_proxy_client.web.settings.providers.base import read_secret
|
|
40
41
|
from unique_search_proxy_client.web.settings.providers.brave import (
|
|
41
42
|
brave_search_credentials as credentials,
|
|
42
43
|
)
|
|
@@ -71,7 +72,7 @@ class BraveSearchService(SearchEngineService[BraveSearchRequest]):
|
|
|
71
72
|
break
|
|
72
73
|
page = await self._fetch_page(
|
|
73
74
|
request=request,
|
|
74
|
-
api_key=credentials.api_key,
|
|
75
|
+
api_key=read_secret(credentials.api_key),
|
|
75
76
|
api_endpoint=credentials.api_endpoint,
|
|
76
77
|
page=page_request,
|
|
77
78
|
timeout=timeout,
|
|
@@ -37,6 +37,7 @@ from unique_search_proxy_client.web.core.search_engines.pagination import PageRe
|
|
|
37
37
|
from unique_search_proxy_client.web.core.search_engines.service_base import (
|
|
38
38
|
SearchEngineService,
|
|
39
39
|
)
|
|
40
|
+
from unique_search_proxy_client.web.settings.providers.base import read_secret
|
|
40
41
|
from unique_search_proxy_client.web.settings.providers.google import (
|
|
41
42
|
google_search_credentials as credentials,
|
|
42
43
|
)
|
|
@@ -71,7 +72,7 @@ class GoogleSearchService(SearchEngineService[GoogleSearchRequest]):
|
|
|
71
72
|
for page_request in iter_google_page_requests(fetch_size):
|
|
72
73
|
page = await self._fetch_page(
|
|
73
74
|
request=request,
|
|
74
|
-
api_key=credentials.api_key,
|
|
75
|
+
api_key=read_secret(credentials.api_key),
|
|
75
76
|
search_engine_id=search_engine_id,
|
|
76
77
|
api_endpoint=credentials.api_endpoint,
|
|
77
78
|
page=page_request,
|
|
@@ -36,6 +36,7 @@ from unique_search_proxy_client.web.core.search_engines.service_base import (
|
|
|
36
36
|
from unique_search_proxy_client.web.settings.providers import (
|
|
37
37
|
perplexity_search_credentials as credentials,
|
|
38
38
|
)
|
|
39
|
+
from unique_search_proxy_client.web.settings.providers.base import read_secret
|
|
39
40
|
|
|
40
41
|
_LOGGER = logging.getLogger(__name__)
|
|
41
42
|
_PERPLEXITY_PROVIDER_LABEL = "Perplexity Search API"
|
|
@@ -67,7 +68,7 @@ class PerplexitySearchService(SearchEngineService[PerplexitySearchRequest]):
|
|
|
67
68
|
response = await client.post(
|
|
68
69
|
credentials.api_endpoint,
|
|
69
70
|
json=body,
|
|
70
|
-
headers=_perplexity_headers(credentials.api_key),
|
|
71
|
+
headers=_perplexity_headers(read_secret(credentials.api_key)),
|
|
71
72
|
timeout=timeout,
|
|
72
73
|
)
|
|
73
74
|
except httpx.TimeoutException as exc:
|
unique_search_proxy-2026.26.0.dev11/unique_search_proxy_client/web/core/url_safety/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from unique_search_proxy_client.web.core.url_safety.gate import (
|
|
2
|
+
AllowedCrawlTarget,
|
|
3
|
+
UrlSafetyGateResult,
|
|
4
|
+
apply_url_safety_gate,
|
|
5
|
+
merge_crawl_results,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"AllowedCrawlTarget",
|
|
10
|
+
"UrlSafetyGateResult",
|
|
11
|
+
"apply_url_safety_gate",
|
|
12
|
+
"merge_crawl_results",
|
|
13
|
+
]
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
|
|
5
|
+
from unique_search_proxy_core.schema import CrawlUrlResult
|
|
6
|
+
from unique_search_proxy_core.url_safety import ResolvedCrawlTarget, UrlSafetyService
|
|
7
|
+
|
|
8
|
+
from unique_search_proxy_client.web.core.provider_response import crawl_forbidden_target
|
|
9
|
+
from unique_search_proxy_client.web.monitoring.metrics import record_crawl_blocked
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class AllowedCrawlTarget:
    """A crawl URL that passed the safety gate, with its validated resolution.

    Instances are immutable so the resolution validated by the gate cannot be
    swapped out before the crawler uses it for pinned egress.
    """

    # URL as reported back to the caller (the gate stores it stripped).
    display_url: str
    # Resolution metadata produced by URL safety validation for this URL.
    resolved: ResolvedCrawlTarget
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class UrlSafetyGateResult:
    """Outcome of running a batch of crawl URLs through the URL safety gate."""

    # Targets that passed validation, in the order they were requested.
    allowed_targets: list[AllowedCrawlTarget]
    # Failure results for rejected URLs, keyed by position in the request list.
    blocked_by_index: dict[int, CrawlUrlResult]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
async def apply_url_safety_gate(urls: list[str]) -> UrlSafetyGateResult:
    """Run every crawl URL through URL safety validation and split the outcome.

    Args:
        urls: Crawl URLs in request order.

    Returns:
        A ``UrlSafetyGateResult`` holding the allowed targets (in request
        order) and, for each rejected URL, a forbidden-target failure keyed by
        the URL's position in ``urls``.

    Raises:
        RuntimeError: If validation allows a URL but supplies no resolved
            metadata for it.
    """
    permitted: list[AllowedCrawlTarget] = []
    rejected: dict[int, CrawlUrlResult] = {}

    verdicts = await UrlSafetyService.validate_urls_individually(urls)
    for position, verdict in enumerate(verdicts):
        violation = verdict.blocked

        if violation is None:
            resolution = verdict.resolved
            if resolution is None:
                # An allowed URL must carry its resolution for pinned egress.
                msg = "URL safety allowed a crawl target without resolved metadata"
                raise RuntimeError(msg)
            permitted.append(
                AllowedCrawlTarget(
                    display_url=verdict.url.strip(),
                    resolved=resolution,
                ),
            )
            continue

        # Record the metric first, then build the per-URL failure.
        record_crawl_blocked(violation.category)
        rejected[position] = crawl_forbidden_target(
            verdict.url.strip(),
            violation.reason,
        )

    return UrlSafetyGateResult(
        allowed_targets=permitted,
        blocked_by_index=rejected,
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def merge_crawl_results(
    urls: list[str],
    *,
    blocked_by_index: dict[int, CrawlUrlResult],
    crawler_results: list[CrawlUrlResult],
) -> list[CrawlUrlResult]:
    """Interleave blocked results and crawler results back into request order.

    Args:
        urls: The originally requested URLs; only their positions are used.
        blocked_by_index: Failure results keyed by position in ``urls``.
        crawler_results: Results for the non-blocked URLs, in request order.

    Returns:
        One result per entry of ``urls``: the blocked result where one exists
        for that position, otherwise the next unused crawler result.

    Raises:
        IndexError: If ``crawler_results`` has fewer entries than there are
            non-blocked positions.
    """
    ordered: list[CrawlUrlResult] = []
    next_crawled = 0  # position of the next crawler result not yet consumed

    for position, _ in enumerate(urls):
        rejection = blocked_by_index.get(position)
        if rejection is None:
            ordered.append(crawler_results[next_crawled])
            next_crawled += 1
        else:
            ordered.append(rejection)

    return ordered
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
import logging
|
|
5
|
+
import logging.config
|
|
6
|
+
import os
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from uvicorn.config import LOG_LEVELS, LOGGING_CONFIG
|
|
10
|
+
|
|
11
|
+
# Application logger namespaces that get the same handler as Uvicorn's loggers.
_APP_LOGGER_NAMES = (
    "unique_search_proxy_client",
    "unique_search_proxy_core",
)


def build_logging_config(log_level: str | None = None) -> dict[str, Any]:
    """Return a copy of Uvicorn's logging config extended with the app loggers.

    Args:
        log_level: Desired level name in any case. When falsy, the
            ``LOG_LEVEL`` environment variable is used, defaulting to
            ``"info"``. Names not present in Uvicorn's ``LOG_LEVELS`` fall
            back to ``"info"``.

    Returns:
        A deep copy of Uvicorn's ``LOGGING_CONFIG`` in which each application
        logger uses the ``default`` handler without propagation, and the app
        and Uvicorn loggers all share the selected level.
    """
    requested = log_level or os.getenv("LOG_LEVEL", "info")
    normalized = requested.lower()
    effective_level = (normalized if normalized in LOG_LEVELS else "info").upper()

    # Deep-copy so Uvicorn's module-level default config is never mutated.
    logging_config = copy.deepcopy(LOGGING_CONFIG)
    loggers = logging_config["loggers"]

    loggers.update(
        {
            app_logger: {
                "handlers": ["default"],
                "level": effective_level,
                "propagate": False,
            }
            for app_logger in _APP_LOGGER_NAMES
        }
    )
    for uvicorn_logger in ("uvicorn", "uvicorn.error", "uvicorn.access"):
        loggers[uvicorn_logger]["level"] = effective_level

    return logging_config
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def configure_logging(log_level: str | None = None) -> None:
    """Configure app and Uvicorn loggers with Uvicorn's colored formatter.

    Args:
        log_level: Level name forwarded to ``build_logging_config``; when
            falsy, that function falls back to the ``LOG_LEVEL`` environment
            variable.
    """
    # "TRACE" is a Uvicorn-specific level name that the standard library does
    # not know until it is registered. Uvicorn registers it when its own
    # Config sets up logging; do the same here so a standalone call with
    # LOG_LEVEL=trace does not make dictConfig fail with "Unknown level".
    logging.addLevelName(LOG_LEVELS["trace"], "TRACE")
    logging.config.dictConfig(build_logging_config(log_level))
|