unique-search-proxy 2026.26.0.dev9__tar.gz → 2026.26.0.dev10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/PKG-INFO +2 -2
  2. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/pyproject.toml +2 -2
  3. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/crawl.py +46 -4
  4. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/service.py +43 -14
  5. unique_search_proxy-2026.26.0.dev10/unique_search_proxy_client/web/core/crawlers/pinned_egress.py +22 -0
  6. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/provider_response.py +10 -0
  7. unique_search_proxy-2026.26.0.dev10/unique_search_proxy_client/web/core/url_safety/__init__.py +13 -0
  8. unique_search_proxy-2026.26.0.dev10/unique_search_proxy_client/web/core/url_safety/gate.py +76 -0
  9. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/README.md +0 -0
  10. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/__init__.py +0 -0
  11. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/__init__.py +0 -0
  12. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/__init__.py +0 -0
  13. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/health.py +0 -0
  14. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/__init__.py +0 -0
  15. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/agent_search.py +0 -0
  16. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/configuration.py +0 -0
  17. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/openapi_examples.py +0 -0
  18. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/api/v1/search.py +0 -0
  19. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/app.py +0 -0
  20. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/__init__.py +0 -0
  21. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/__init__.py +0 -0
  22. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/bing/client.py +0 -0
  23. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/bing/runner.py +0 -0
  24. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/bing/service.py +0 -0
  25. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/factory.py +0 -0
  26. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/serialization.py +0 -0
  27. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/service_base.py +0 -0
  28. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/structured_output.py +0 -0
  29. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/vertexai/client.py +0 -0
  30. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/vertexai/gemini.py +0 -0
  31. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/agent_engines/vertexai/service.py +0 -0
  32. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/client/__init__.py +0 -0
  33. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/client/service.py +0 -0
  34. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/__init__.py +0 -0
  35. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/__init__.py +0 -0
  36. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/__init__.py +0 -0
  37. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/errors.py +0 -0
  38. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/html_markdown.py +0 -0
  39. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/__init__.py +0 -0
  40. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/html.py +0 -0
  41. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/pdf.py +0 -0
  42. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/processors/plain_text.py +0 -0
  43. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/processing/registry.py +0 -0
  44. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/settings.py +0 -0
  45. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/basic/user_agent.py +0 -0
  46. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/factory.py +0 -0
  47. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/firecrawl/polling.py +0 -0
  48. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/firecrawl/request_body.py +0 -0
  49. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/firecrawl/service.py +0 -0
  50. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/jina/request_body.py +0 -0
  51. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/jina/service.py +0 -0
  52. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/tavily/request_body.py +0 -0
  53. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/crawlers/tavily/service.py +0 -0
  54. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/providers.py +0 -0
  55. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/registry.py +0 -0
  56. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/__init__.py +0 -0
  57. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/brave/__init__.py +0 -0
  58. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/brave/pagination.py +0 -0
  59. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/brave/query_params.py +0 -0
  60. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/brave/service.py +0 -0
  61. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/descriptor.py +0 -0
  62. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/factory.py +0 -0
  63. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/google/__init__.py +0 -0
  64. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/google/pagination.py +0 -0
  65. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/google/query_params.py +0 -0
  66. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/google/service.py +0 -0
  67. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/pagination.py +0 -0
  68. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/perplexity/__init__.py +0 -0
  69. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/perplexity/request_body.py +0 -0
  70. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/perplexity/service.py +0 -0
  71. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/core/search_engines/service_base.py +0 -0
  72. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/error_handlers.py +0 -0
  73. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/monitoring/__init__.py +0 -0
  74. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/monitoring/metrics.py +0 -0
  75. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/monitoring/setup.py +0 -0
  76. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/__init__.py +0 -0
  77. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/common.py +0 -0
  78. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/crawl.py +0 -0
  79. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/search.py +0 -0
  80. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/presets/types.py +0 -0
  81. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/__init__.py +0 -0
  82. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/base.py +0 -0
  83. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/client.py +0 -0
  84. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/monitoring.py +0 -0
  85. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/__init__.py +0 -0
  86. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/base.py +0 -0
  87. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/bing_agent.py +0 -0
  88. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/brave.py +0 -0
  89. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/firecrawl.py +0 -0
  90. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/google.py +0 -0
  91. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/jina.py +0 -0
  92. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/perplexity.py +0 -0
  93. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/tavily.py +0 -0
  94. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/settings/providers/vertexai_agent.py +0 -0
  95. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/utils/__init__.py +0 -0
  96. {unique_search_proxy-2026.26.0.dev9 → unique_search_proxy-2026.26.0.dev10}/unique_search_proxy_client/web/utils/url.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: unique-search-proxy
3
- Version: 2026.26.0.dev9
3
+ Version: 2026.26.0.dev10
4
4
  Summary: Web Search Proxy implementation
5
5
  Author: ThePhilAz
6
6
  Author-email: ThePhilAz <rami.azouz@philico.com>
@@ -19,7 +19,7 @@ Requires-Dist: certifi>=2025.11.12,<2027
19
19
  Requires-Dist: google-genai>=1.73.0,<2
20
20
  Requires-Dist: google-auth>=2.43.0,<3
21
21
  Requires-Dist: unique-toolkit[monitoring]>=2026.26.0.dev11,<2026.26.0rc0
22
- Requires-Dist: unique-search-proxy-core>=2026.26.0.dev5,<2026.26.0rc0
22
+ Requires-Dist: unique-search-proxy-core>=2026.26.0.dev6,<2026.26.0rc0
23
23
  Requires-Python: >=3.12
24
24
  Description-Content-Type: text/markdown
25
25
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "unique-search-proxy"
3
- version = "2026.26.0.dev9"
3
+ version = "2026.26.0.dev10"
4
4
  description = "Web Search Proxy implementation"
5
5
  authors = [{ name = "ThePhilAz", email = "rami.azouz@philico.com" }]
6
6
  readme = "README.md"
@@ -21,7 +21,7 @@ dependencies = [
21
21
  "google-genai>=1.73.0,<2",
22
22
  "google-auth>=2.43.0,<3",
23
23
  "unique-toolkit[monitoring]>=2026.26.0.dev11,<2026.26.0rc0",
24
- "unique-search-proxy-core>=2026.26.0.dev5,<2026.26.0rc0",
24
+ "unique-search-proxy-core>=2026.26.0.dev6,<2026.26.0rc0",
25
25
  ]
26
26
 
27
27
  [dependency-groups]
@@ -18,6 +18,13 @@ from unique_search_proxy_client.web.api.v1.openapi_examples import (
18
18
  )
19
19
  from unique_search_proxy_client.web.core.client import get_http_client_pool
20
20
  from unique_search_proxy_client.web.core.crawlers.factory import get_crawler_service
21
+ from unique_search_proxy_client.web.core.crawlers.pinned_egress import (
22
+ PinnedEgressCrawler,
23
+ )
24
+ from unique_search_proxy_client.web.core.url_safety.gate import (
25
+ apply_url_safety_gate,
26
+ merge_crawl_results,
27
+ )
21
28
  from unique_search_proxy_client.web.monitoring.metrics import (
22
29
  record_crawl_error,
23
30
  record_crawl_success,
@@ -49,10 +56,38 @@ async def crawl(
49
56
  started = time.perf_counter()
50
57
 
51
58
  try:
52
- pool = get_http_client_pool(request.app)
53
- crawler = get_crawler_service(crawler_id, http_client=pool.client)
54
59
  async with asyncio.timeout(timeout):
55
- results = await crawler.crawl(body)
60
+ gate = await apply_url_safety_gate(body.urls)
61
+ if not gate.allowed_targets:
62
+ record_crawl_success(
63
+ crawler_id,
64
+ len(body.urls),
65
+ time.perf_counter() - started,
66
+ )
67
+ return CrawlResponse(
68
+ crawler=crawler_id,
69
+ results=merge_crawl_results(
70
+ body.urls,
71
+ blocked_by_index=gate.blocked_by_index,
72
+ crawler_results=[],
73
+ ),
74
+ )
75
+
76
+ crawl_body = body.model_copy(
77
+ update={
78
+ "urls": [target.display_url for target in gate.allowed_targets],
79
+ },
80
+ )
81
+
82
+ pool = get_http_client_pool(request.app)
83
+ crawler = get_crawler_service(crawler_id, http_client=pool.client)
84
+ if isinstance(crawler, PinnedEgressCrawler):
85
+ crawler_results = await crawler.crawl_pinned(
86
+ crawl_body,
87
+ gate.allowed_targets,
88
+ )
89
+ else:
90
+ crawler_results = await crawler.crawl(crawl_body)
56
91
  except TimeoutError as exc:
57
92
  record_crawl_error(
58
93
  crawler_id,
@@ -76,4 +111,11 @@ async def crawl(
76
111
  raise
77
112
 
78
113
  record_crawl_success(crawler_id, len(body.urls), time.perf_counter() - started)
79
- return CrawlResponse(crawler=crawler_id, results=results)
114
+ return CrawlResponse(
115
+ crawler=crawler_id,
116
+ results=merge_crawl_results(
117
+ body.urls,
118
+ blocked_by_index=gate.blocked_by_index,
119
+ crawler_results=crawler_results,
120
+ ),
121
+ )
@@ -14,6 +14,11 @@ from unique_search_proxy_core.schema import (
14
14
  CrawlUrlResult,
15
15
  ProxyErrorCode,
16
16
  )
17
+ from unique_search_proxy_core.url_safety import (
18
+ ResolvedCrawlTarget,
19
+ bypass_crawl_target,
20
+ pinned_httpx_get_args,
21
+ )
17
22
 
18
23
  from unique_search_proxy_client.web.core.crawlers.basic.processing import (
19
24
  ContentProcessingError,
@@ -27,6 +32,7 @@ from unique_search_proxy_client.web.core.provider_response import (
27
32
  crawl_upstream_error,
28
33
  transport_error_raw,
29
34
  )
35
+ from unique_search_proxy_client.web.core.url_safety.gate import AllowedCrawlTarget
30
36
 
31
37
  _LOGGER = logging.getLogger(__name__)
32
38
 
@@ -43,11 +49,30 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
43
49
 
44
50
  crawler_id = CrawlerType.BASIC.value
45
51
 
46
- async def crawl(self, request: BasicCrawlRequest) -> list[CrawlUrlResult]: # type: ignore
52
+ async def crawl(self, request: BasicCrawlRequest) -> list[CrawlUrlResult]: # type: ignore[override]
53
+ bypass_targets = [
54
+ AllowedCrawlTarget(
55
+ display_url=url.strip(),
56
+ resolved=bypass_crawl_target(url),
57
+ )
58
+ for url in request.urls
59
+ ]
60
+ return await self.crawl_pinned(request, bypass_targets)
61
+
62
+ async def crawl_pinned(
63
+ self,
64
+ request: BasicCrawlRequest, # type: ignore[valid-type]
65
+ allowed_targets: list[AllowedCrawlTarget],
66
+ ) -> list[CrawlUrlResult]:
47
67
  client = self._http_client
48
68
  if client is None:
49
69
  raise RuntimeError("HTTP client is required for Basic crawler")
50
70
 
71
+ display_urls = list(request.urls)
72
+ if len(allowed_targets) != len(display_urls):
73
+ msg = "allowed_targets length must match request.urls length"
74
+ raise ValueError(msg)
75
+
51
76
  timeout = request.timeout
52
77
  semaphore = asyncio.Semaphore(request.max_concurrent_requests)
53
78
  return list(
@@ -55,12 +80,13 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
55
80
  *[
56
81
  self._crawl_one(
57
82
  client,
58
- url,
83
+ allowed_target.display_url,
84
+ resolved_target=allowed_target.resolved,
59
85
  timeout=timeout,
60
86
  semaphore=semaphore,
61
87
  content_type_handlers=request.content_types.to_handlers(),
62
88
  )
63
- for url in request.urls
89
+ for allowed_target in allowed_targets
64
90
  ],
65
91
  ),
66
92
  )
@@ -68,35 +94,38 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
68
94
  async def _crawl_one(
69
95
  self,
70
96
  client: AsyncClient,
71
- url: str,
97
+ display_url: str,
72
98
  *,
99
+ resolved_target: ResolvedCrawlTarget,
73
100
  timeout: int,
74
101
  semaphore: asyncio.Semaphore,
75
102
  content_type_handlers: dict[str, ContentTypeHandlerPolicy],
76
103
  ) -> CrawlUrlResult:
77
- request_url = url.strip()
104
+ request_url, pin_headers, extensions = pinned_httpx_get_args(resolved_target)
78
105
  async with semaphore:
79
- headers = {"User-Agent": random_user_agent()}
106
+ headers = {"User-Agent": random_user_agent(), **pin_headers}
107
+
80
108
  try:
81
109
  response = await client.get(
82
110
  request_url,
83
111
  headers=headers,
112
+ extensions=extensions or None,
84
113
  timeout=Timeout(timeout),
85
114
  follow_redirects=True,
86
115
  )
87
116
  except httpx.TimeoutException as exc:
88
- _LOGGER.warning("Basic crawl timed out for %s: %s", request_url, exc)
117
+ _LOGGER.warning("Basic crawl timed out for %s: %s", display_url, exc)
89
118
  return crawl_upstream_error(
90
- request_url,
119
+ display_url,
91
120
  f"Crawl timed out after {timeout}s",
92
121
  content_type=None,
93
122
  code=ProxyErrorCode.UPSTREAM_TIMEOUT.value,
94
123
  raw=transport_error_raw(exc),
95
124
  )
96
125
  except httpx.HTTPError as exc:
97
- _LOGGER.warning("Basic crawl failed for %s: %s", request_url, exc)
126
+ _LOGGER.warning("Basic crawl failed for %s: %s", display_url, exc)
98
127
  return crawl_upstream_error(
99
- request_url,
128
+ display_url,
100
129
  str(exc),
101
130
  content_type=None,
102
131
  raw=transport_error_raw(exc),
@@ -107,11 +136,11 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
107
136
  if response.is_error:
108
137
  _LOGGER.warning(
109
138
  "Basic crawl HTTP error for %s: %s",
110
- request_url,
139
+ display_url,
111
140
  response.status_code,
112
141
  )
113
142
  return crawl_upstream_error(
114
- request_url,
143
+ display_url,
115
144
  f"HTTP {response.status_code} while fetching URL",
116
145
  content_type=content_type,
117
146
  raw=raw_body,
@@ -120,7 +149,7 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
120
149
  content = await self._maybe_process_content(
121
150
  raw_body,
122
151
  content_type,
123
- request_url=request_url,
152
+ request_url=display_url,
124
153
  timeout=timeout,
125
154
  content_type_handlers=content_type_handlers,
126
155
  )
@@ -128,7 +157,7 @@ class BasicCrawlerService(BaseCrawler[BasicCrawlRequest]):
128
157
  return content
129
158
 
130
159
  return CrawlUrlResult(
131
- url=request_url,
160
+ url=display_url,
132
161
  content=content,
133
162
  raw=raw_body,
134
163
  content_type=content_type,
@@ -0,0 +1,22 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Protocol, runtime_checkable
4
+
5
+ from pydantic import BaseModel
6
+ from unique_search_proxy_core.schema import CrawlUrlResult
7
+
8
+ from unique_search_proxy_client.web.core.url_safety.gate import AllowedCrawlTarget
9
+
10
+
11
+ @runtime_checkable
12
+ class PinnedEgressCrawler(Protocol):
13
+ """Crawlers that fetch directly and must reuse gate DNS resolution."""
14
+
15
+ async def crawl_pinned(
16
+ self,
17
+ request: BaseModel,
18
+ allowed_targets: list[AllowedCrawlTarget],
19
+ ) -> list[CrawlUrlResult]: ...
20
+
21
+
22
+ __all__ = ["PinnedEgressCrawler"]
@@ -80,6 +80,16 @@ def crawl_upstream_error(
80
80
  )
81
81
 
82
82
 
83
+ def crawl_forbidden_target(url: str, message: str) -> CrawlUrlResult:
84
+ """Build a per-URL crawl failure for a URL safety policy violation."""
85
+ return crawl_upstream_error(
86
+ url,
87
+ message,
88
+ code=ProxyErrorCode.FORBIDDEN_TARGET.value,
89
+ content_type=None,
90
+ )
91
+
92
+
83
93
  def raise_for_upstream_response(
84
94
  response: httpx.Response,
85
95
  *,
@@ -0,0 +1,13 @@
1
+ from unique_search_proxy_client.web.core.url_safety.gate import (
2
+ AllowedCrawlTarget,
3
+ UrlSafetyGateResult,
4
+ apply_url_safety_gate,
5
+ merge_crawl_results,
6
+ )
7
+
8
+ __all__ = [
9
+ "AllowedCrawlTarget",
10
+ "UrlSafetyGateResult",
11
+ "apply_url_safety_gate",
12
+ "merge_crawl_results",
13
+ ]
@@ -0,0 +1,76 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+
5
+ from unique_search_proxy_core.schema import CrawlUrlResult
6
+ from unique_search_proxy_core.url_safety import ResolvedCrawlTarget, UrlSafetyService
7
+
8
+ from unique_search_proxy_client.web.core.provider_response import crawl_forbidden_target
9
+ from unique_search_proxy_client.web.monitoring.metrics import record_crawl_blocked
10
+
11
+
12
+ @dataclass(frozen=True)
13
+ class AllowedCrawlTarget:
14
+ """User-facing URL paired with the validated resolution for pinned egress."""
15
+
16
+ display_url: str
17
+ resolved: ResolvedCrawlTarget
18
+
19
+
20
+ @dataclass(frozen=True)
21
+ class UrlSafetyGateResult:
22
+ allowed_targets: list[AllowedCrawlTarget]
23
+ blocked_by_index: dict[int, CrawlUrlResult]
24
+
25
+
26
+ async def apply_url_safety_gate(urls: list[str]) -> UrlSafetyGateResult:
27
+ """Validate crawl URLs and partition them into allowed vs blocked targets."""
28
+ outcomes = await UrlSafetyService.validate_urls_individually(urls)
29
+ allowed_targets: list[AllowedCrawlTarget] = []
30
+ blocked_by_index: dict[int, CrawlUrlResult] = {}
31
+
32
+ for index, outcome in enumerate(outcomes):
33
+ if outcome.blocked is not None:
34
+ record_crawl_blocked(outcome.blocked.category)
35
+ blocked_by_index[index] = crawl_forbidden_target(
36
+ outcome.url.strip(),
37
+ outcome.blocked.reason,
38
+ )
39
+ continue
40
+
41
+ if outcome.resolved is None:
42
+ msg = "URL safety allowed a crawl target without resolved metadata"
43
+ raise RuntimeError(msg)
44
+
45
+ allowed_targets.append(
46
+ AllowedCrawlTarget(
47
+ display_url=outcome.url.strip(),
48
+ resolved=outcome.resolved,
49
+ ),
50
+ )
51
+
52
+ return UrlSafetyGateResult(
53
+ allowed_targets=allowed_targets,
54
+ blocked_by_index=blocked_by_index,
55
+ )
56
+
57
+
58
+ def merge_crawl_results(
59
+ urls: list[str],
60
+ *,
61
+ blocked_by_index: dict[int, CrawlUrlResult],
62
+ crawler_results: list[CrawlUrlResult],
63
+ ) -> list[CrawlUrlResult]:
64
+ """Merge per-URL blocked results with crawler outcomes in request order."""
65
+ merged: list[CrawlUrlResult] = []
66
+ crawler_index = 0
67
+ for index, _url in enumerate(urls):
68
+ blocked = blocked_by_index.get(index)
69
+ if blocked is not None:
70
+ merged.append(blocked)
71
+ continue
72
+
73
+ merged.append(crawler_results[crawler_index])
74
+ crawler_index += 1
75
+
76
+ return merged