unique-search-proxy-core 2026.24.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- unique_search_proxy_core-2026.24.1/PKG-INFO +16 -0
- unique_search_proxy_core-2026.24.1/README.md +5 -0
- unique_search_proxy_core-2026.24.1/pyproject.toml +59 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/__init__.py +41 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/__init__.py +37 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/base.py +52 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/basic/__init__.py +13 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/basic/content_types.py +47 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/basic/processing/policy.py +8 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/basic/schema.py +60 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/call_schema.py +75 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/config_types.py +66 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/crawlers/params.py +45 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/errors.py +105 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/param_policy/__init__.py +21 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/param_policy/exposable_param.py +252 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/param_policy/policy.py +3 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/projection.py +313 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/providers/schema.py +64 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/schema.py +191 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/search_engines/__init__.py +43 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/search_engines/base.py +111 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/search_engines/call_schema.py +84 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/search_engines/config_types.py +74 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/search_engines/google/__init__.py +11 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/search_engines/google/schema.py +163 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/search_engines/pagination.py +36 -0
- unique_search_proxy_core-2026.24.1/unique_search_proxy_core/search_engines/params.py +98 -0
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: unique-search-proxy-core
|
|
3
|
+
Version: 2026.24.1
|
|
4
|
+
Summary: Shared Pydantic types for the Unique Search Proxy API
|
|
5
|
+
Author: ThePhilAz
|
|
6
|
+
Author-email: ThePhilAz <rami.azouz@philico.com>
|
|
7
|
+
Requires-Dist: pydantic>=2.12.5,<3.0.0
|
|
8
|
+
Requires-Dist: pyhumps>=3.8.0,<4
|
|
9
|
+
Requires-Python: >=3.12
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# unique-search-proxy-core
|
|
13
|
+
|
|
14
|
+
Server-free Pydantic models and helpers shared by the Unique Search Proxy HTTP API and SDK.
|
|
15
|
+
|
|
16
|
+
Install via PyPI as `unique-search-proxy-core`. Consumers of the HTTP client should prefer `unique-search-proxy-sdk`.
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "unique-search-proxy-core"
|
|
3
|
+
version = "2026.24.1"
|
|
4
|
+
description = "Shared Pydantic types for the Unique Search Proxy API"
|
|
5
|
+
authors = [{ name = "ThePhilAz", email = "rami.azouz@philico.com" }]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
requires-python = ">=3.12"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"pydantic>=2.12.5,<3.0.0",
|
|
10
|
+
"pyhumps>=3.8.0,<4",
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[dependency-groups]
|
|
14
|
+
dev = [
|
|
15
|
+
"basedpyright>=1.39.1",
|
|
16
|
+
"pytest>=9.0.3",
|
|
17
|
+
"pytest-asyncio>=1.3.0",
|
|
18
|
+
"ruff>=0.15.10",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[build-system]
|
|
22
|
+
requires = ["uv_build>=0.7.19,<0.8"]
|
|
23
|
+
build-backend = "uv_build"
|
|
24
|
+
|
|
25
|
+
[tool.uv.build-backend]
|
|
26
|
+
module-root = "."
|
|
27
|
+
|
|
28
|
+
[tool.uv]
|
|
29
|
+
exclude-newer = "2 weeks"
|
|
30
|
+
|
|
31
|
+
[tool.ruff]
|
|
32
|
+
target-version = "py312"
|
|
33
|
+
|
|
34
|
+
[tool.ruff.lint]
|
|
35
|
+
extend-select = ["I"]
|
|
36
|
+
|
|
37
|
+
[tool.basedpyright]
|
|
38
|
+
typeCheckingMode = "standard"
|
|
39
|
+
include = ["unique_search_proxy_core"]
|
|
40
|
+
|
|
41
|
+
[tool.deptry]
|
|
42
|
+
known_first_party = ["unique_search_proxy_core"]
|
|
43
|
+
|
|
44
|
+
[tool.deptry.per_rule_ignores]
|
|
45
|
+
# pydantic_core ships with (and is version-pinned by) pydantic; importing
|
|
46
|
+
# CoreSchema from it is the canonical way to type __get_pydantic_core_schema__.
|
|
47
|
+
DEP003 = ["pydantic_core"]
|
|
48
|
+
|
|
49
|
+
[tool.poe.tasks]
|
|
50
|
+
lint = "ruff check ."
|
|
51
|
+
lint-fix = "ruff check . --fix"
|
|
52
|
+
format = "ruff format ."
|
|
53
|
+
test = "pytest"
|
|
54
|
+
typecheck = "basedpyright"
|
|
55
|
+
depcheck = "deptry ."
|
|
56
|
+
|
|
57
|
+
[tool.pytest.ini_options]
|
|
58
|
+
addopts = "--strict-markers --import-mode=importlib"
|
|
59
|
+
asyncio_mode = "auto"
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Shared types for Unique Search Proxy (no FastAPI / server dependencies)."""
|
|
2
|
+
|
|
3
|
+
from unique_search_proxy_core.errors import (
|
|
4
|
+
BadRequestProxyError,
|
|
5
|
+
EmptySearchResultsError,
|
|
6
|
+
EngineNotConfiguredError,
|
|
7
|
+
ForbiddenTargetError,
|
|
8
|
+
ProxyError,
|
|
9
|
+
RateLimitedError,
|
|
10
|
+
UpstreamError,
|
|
11
|
+
UpstreamTimeoutError,
|
|
12
|
+
ValidationProxyError,
|
|
13
|
+
)
|
|
14
|
+
from unique_search_proxy_core.schema import (
|
|
15
|
+
CrawlResponse,
|
|
16
|
+
ErrorDetail,
|
|
17
|
+
ErrorResponse,
|
|
18
|
+
ProvidersListResponse,
|
|
19
|
+
ProxyErrorCode,
|
|
20
|
+
SearchResponse,
|
|
21
|
+
WebSearchResult,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"BadRequestProxyError",
|
|
26
|
+
"CrawlResponse",
|
|
27
|
+
"EmptySearchResultsError",
|
|
28
|
+
"EngineNotConfiguredError",
|
|
29
|
+
"ErrorDetail",
|
|
30
|
+
"ErrorResponse",
|
|
31
|
+
"ForbiddenTargetError",
|
|
32
|
+
"ProvidersListResponse",
|
|
33
|
+
"ProxyError",
|
|
34
|
+
"ProxyErrorCode",
|
|
35
|
+
"RateLimitedError",
|
|
36
|
+
"SearchResponse",
|
|
37
|
+
"UpstreamError",
|
|
38
|
+
"UpstreamTimeoutError",
|
|
39
|
+
"ValidationProxyError",
|
|
40
|
+
"WebSearchResult",
|
|
41
|
+
]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from unique_search_proxy_core.crawlers.base import (
|
|
2
|
+
BaseCrawler,
|
|
3
|
+
BaseCrawlerConfig,
|
|
4
|
+
CrawlerRequestT,
|
|
5
|
+
CrawlerType,
|
|
6
|
+
)
|
|
7
|
+
from unique_search_proxy_core.crawlers.basic.schema import (
|
|
8
|
+
BasicCrawlerConfig,
|
|
9
|
+
BasicCrawlerRequest,
|
|
10
|
+
)
|
|
11
|
+
from unique_search_proxy_core.crawlers.config_types import (
|
|
12
|
+
CrawlerConfigTypes,
|
|
13
|
+
CrawlRequest,
|
|
14
|
+
CrawlRequestTypes,
|
|
15
|
+
build_crawl_request_union,
|
|
16
|
+
crawler_config_from_request,
|
|
17
|
+
parse_crawl_request,
|
|
18
|
+
parse_crawler_config,
|
|
19
|
+
)
|
|
20
|
+
from unique_search_proxy_core.crawlers.params import merge_crawler_config_and_invocation
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"BaseCrawler",
|
|
24
|
+
"BaseCrawlerConfig",
|
|
25
|
+
"BasicCrawlerConfig",
|
|
26
|
+
"BasicCrawlerRequest",
|
|
27
|
+
"CrawlerConfigTypes",
|
|
28
|
+
"CrawlerRequestT",
|
|
29
|
+
"CrawlRequest",
|
|
30
|
+
"CrawlRequestTypes",
|
|
31
|
+
"CrawlerType",
|
|
32
|
+
"build_crawl_request_union",
|
|
33
|
+
"crawler_config_from_request",
|
|
34
|
+
"merge_crawler_config_and_invocation",
|
|
35
|
+
"parse_crawl_request",
|
|
36
|
+
"parse_crawler_config",
|
|
37
|
+
]
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from enum import StrEnum
|
|
5
|
+
from typing import TYPE_CHECKING, Generic, TypeVar
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, Field
|
|
8
|
+
|
|
9
|
+
from unique_search_proxy_core.schema import CrawlUrlResult, camelized_model_config
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from httpx import AsyncClient
|
|
13
|
+
|
|
14
|
+
# Type parameter of ``BaseCrawlerConfig``; bounded by the ``CrawlerType`` enum.
CrawlerTypeT = TypeVar("CrawlerTypeT", bound="CrawlerType")
# Type parameter of ``BaseCrawler``: the pydantic model passed to ``crawl``.
CrawlerRequestT = TypeVar("CrawlerRequestT", bound=BaseModel)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CrawlerType(StrEnum):
    """Registered crawler ids (JSON discriminator values).

    Members are strings (``StrEnum``), so they compare equal to their raw value.
    """

    # NOTE: the value is mixed-case; comparing against a lowercased id will not match.
    BASIC = "BasicProxyCrawler"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class BaseCrawlerConfig(BaseModel, Generic[CrawlerTypeT]):
    """Base deployment config common to every crawler.

    Concrete crawlers subclass this model and narrow ``crawler_type`` to a
    single ``Literal`` value.
    """

    model_config = camelized_model_config

    # Narrowed by each concrete crawler config.
    crawler_type: CrawlerTypeT
    # Bounded to 1..600 seconds, defaulting to 30.
    timeout: int = Field(default=30, ge=1, le=600, description="Request timeout in seconds")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class BaseCrawler(ABC, Generic[CrawlerRequestT]):
    """Abstract crawler interface producing per-URL outcomes.

    The original contract also mentions optional url-safety enforcement; that
    is left to implementations, since nothing in this base class performs it.
    """

    # Declared for subclasses; no value is assigned here.
    crawler_id: str

    def __init__(self, *, http_client: AsyncClient | None = None) -> None:
        """Store the optional, caller-supplied HTTP client (keyword-only)."""
        self._http_client = http_client

    @abstractmethod
    async def crawl(self, request: CrawlerRequestT) -> list[CrawlUrlResult]:
        """Crawl URLs from a flat request model (``BasicCrawlerRequest``, …)."""
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from unique_search_proxy_core.crawlers.basic.content_types import ContentTypeToggles
|
|
2
|
+
from unique_search_proxy_core.crawlers.basic.schema import (
|
|
3
|
+
BasicCrawlerCall,
|
|
4
|
+
BasicCrawlerConfig,
|
|
5
|
+
BasicCrawlerRequest,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"BasicCrawlerCall",
|
|
10
|
+
"BasicCrawlerConfig",
|
|
11
|
+
"BasicCrawlerRequest",
|
|
12
|
+
"ContentTypeToggles",
|
|
13
|
+
]
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pydantic import BaseModel, Field
|
|
4
|
+
|
|
5
|
+
from unique_search_proxy_core.crawlers.basic.processing.policy import (
|
|
6
|
+
ContentTypeHandlerPolicy,
|
|
7
|
+
)
|
|
8
|
+
from unique_search_proxy_core.schema import get_model_config
|
|
9
|
+
|
|
10
|
+
# Supported media types for the basic crawler (must match registered processors).
# Keys are ``ContentTypeToggles`` field names (read via ``getattr`` in
# ``ContentTypeToggles.to_handlers``); values are the corresponding MIME types.
CONTENT_TYPE_TOGGLE_TO_MIME: dict[str, str] = {
    "html": "text/html",
    "xhtml": "application/xhtml+xml",
    "plain_text": "text/plain",
    "markdown": "text/markdown",
    "pdf": "application/pdf",
}
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ContentTypeToggles(BaseModel):
    """On/off switches selecting which media types the basic crawler processes."""

    model_config = get_model_config(title="Content Types")

    # Every type is enabled by default except PDF.
    html: bool = Field(default=True, title="HTML", description="text/html")
    xhtml: bool = Field(default=True, title="XHTML", description="application/xhtml+xml")
    plain_text: bool = Field(default=True, title="Plain text", description="text/plain")
    markdown: bool = Field(default=True, title="Markdown", description="text/markdown")
    pdf: bool = Field(default=False, title="PDF", description="application/pdf")

    def to_handlers(self) -> dict[str, ContentTypeHandlerPolicy]:
        """Return an ``ALLOW`` policy keyed by MIME type for each enabled toggle.

        Disabled toggles are simply absent from the returned mapping.
        """
        handlers: dict[str, ContentTypeHandlerPolicy] = {}
        for toggle_name, mime in CONTENT_TYPE_TOGGLE_TO_MIME.items():
            if getattr(self, toggle_name):
                handlers[mime] = ContentTypeHandlerPolicy.ALLOW
        return handlers
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
__all__ = [
|
|
45
|
+
"CONTENT_TYPE_TOGGLE_TO_MIME",
|
|
46
|
+
"ContentTypeToggles",
|
|
47
|
+
]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
|
|
7
|
+
from unique_search_proxy_core.crawlers.base import BaseCrawlerConfig, CrawlerType
|
|
8
|
+
from unique_search_proxy_core.crawlers.basic.content_types import ContentTypeToggles
|
|
9
|
+
from unique_search_proxy_core.projection import build_crawl_request_model
|
|
10
|
+
from unique_search_proxy_core.schema import get_model_config
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BasicCrawlerCall(BaseModel):
    """Call surface shown to the LLM for the basic crawler.

    The URLs are provided on every invocation; at least one is required.
    """

    urls: list[str] = Field(
        ..., min_length=1, description="URLs to fetch and convert to markdown"
    )
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class BasicCrawlerConfig(BaseCrawlerConfig[CrawlerType.BASIC]):
    """Deployment-level settings for the HTTP basic crawler."""

    model_config = get_model_config(title="Basic Proxy Crawler ")

    # Discriminator pinned to the basic crawler id.
    crawler_type: Literal[CrawlerType.BASIC] = CrawlerType.BASIC

    # Per-media-type processing switches; a fresh default instance per config.
    content_types: ContentTypeToggles = Field(
        title="Content types",
        description=(
            "Enable built-in processing per media type. "
            "Unchecked types return raw body only."
        ),
        default_factory=ContentTypeToggles,
    )
    # Bounded to 1..50 concurrent fetches, defaulting to 10.
    max_concurrent_requests: int = Field(
        default=10,
        ge=1,
        le=50,
        title="Maximum concurrent HTTP fetches",
        description="Maximum concurrent HTTP fetches",
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def basic_crawler_request_model() -> type[BaseModel]:
    """Return the derived ``POST /v1/crawl`` request model for the basic crawler.

    Delegates to ``build_crawl_request_model`` (which, per the original note,
    caches its result — not verifiable from this module).
    """
    request_model = build_crawl_request_model(BasicCrawlerConfig)
    return request_model


# Built once when the module is imported.
BasicCrawlerRequest = basic_crawler_request_model()
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
__all__ = [
|
|
56
|
+
"BasicCrawlerCall",
|
|
57
|
+
"BasicCrawlerConfig",
|
|
58
|
+
"BasicCrawlerRequest",
|
|
59
|
+
"basic_crawler_request_model",
|
|
60
|
+
]
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""LLM-facing call JSON Schema derived from crawler deployment config (no HTTP)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel
|
|
9
|
+
|
|
10
|
+
from unique_search_proxy_core.crawlers.basic.schema import (
|
|
11
|
+
BasicCrawlerCall,
|
|
12
|
+
BasicCrawlerConfig,
|
|
13
|
+
)
|
|
14
|
+
from unique_search_proxy_core.crawlers.config_types import (
|
|
15
|
+
CRAWLER_NAME_TO_CONFIG,
|
|
16
|
+
CrawlerConfigTypes,
|
|
17
|
+
parse_crawler_config,
|
|
18
|
+
)
|
|
19
|
+
from unique_search_proxy_core.projection import project_call_schema
|
|
20
|
+
from unique_search_proxy_core.providers.schema import provider_default_config
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True)
class CrawlCallSchemaDescriptor:
    """Metadata and JSON Schema for the crawler call model on ``POST /v1/crawl``.

    Instances are immutable (``frozen=True``).
    """

    # Crawler id; the resolver in this module stores it lowercased.
    crawler: str
    # JSON Schema dict produced by the projected call model's ``model_json_schema()``.
    call_schema: dict[str, Any]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _llm_call_schema_for_config(config: CrawlerConfigTypes) -> type[BaseModel]:
    """Select the LLM-facing call model matching a crawler config instance.

    Raises:
        ValueError: If ``config`` is not a ``BasicCrawlerConfig``, the only
            config type with a call schema here.
    """
    if not isinstance(config, BasicCrawlerConfig):
        raise ValueError(f"No LLM call schema for crawler config {type(config).__name__}")
    # The basic crawler exposes only ``urls`` to the caller.
    exposed_fields = ["urls"]
    return project_call_schema(BasicCrawlerCall, exposed_fields)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def resolve_crawl_call_schema_from_config(
    crawler_id: str,
    config: CrawlerConfigTypes,
) -> CrawlCallSchemaDescriptor:
    """Project the LLM-visible call surface from a parsed deployment config.

    Args:
        crawler_id: Crawler identifier; matched case-insensitively against the
            registered crawler names.
        config: Parsed deployment config; its exact type must be the class
            registered for ``crawler_id``.

    Returns:
        A descriptor holding the lowercased crawler id and the JSON Schema of
        the projected call model.

    Raises:
        KeyError: If no crawler is registered under ``crawler_id``.
        ValueError: If ``config`` is not of the registered config class, or no
            call schema exists for it.
    """
    normalized_id = crawler_id.lower()
    # Registry keys keep the enum's original casing (e.g. "BasicProxyCrawler"),
    # so indexing with the lowercased id would always raise KeyError. Compare
    # case-insensitively instead.
    config_cls = next(
        (
            registered_cls
            for registered_name, registered_cls in CRAWLER_NAME_TO_CONFIG.items()
            if registered_name.lower() == normalized_id
        ),
        None,
    )
    if config_cls is None:
        # Same exception type and argument as the original failed dict lookup.
        raise KeyError(normalized_id)
    if type(config) is not config_cls:
        raise ValueError(
            f"Config type {type(config).__name__} does not match crawler {crawler_id!r}",
        )
    projected = _llm_call_schema_for_config(config)
    return CrawlCallSchemaDescriptor(
        crawler=normalized_id,
        call_schema=projected.model_json_schema(),
    )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def resolve_crawl_call_schema(
    crawler_id: str,
    *,
    config: CrawlerConfigTypes | dict[str, Any] | None = None,
) -> CrawlCallSchemaDescriptor:
    """Resolve the call schema for a crawler.

    When ``config`` is omitted, the provider defaults for ``crawler_id`` are
    looked up and parsed. A ``BaseModel`` instance is used as-is; any other
    value is parsed into a crawler config first.
    """
    if config is None:
        raw_defaults = provider_default_config("crawler", crawler_id)
        parsed = parse_crawler_config(raw_defaults)
    elif isinstance(config, BaseModel):
        parsed = config
    else:
        parsed = parse_crawler_config(config)
    return resolve_crawl_call_schema_from_config(crawler_id, parsed)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
__all__ = [
|
|
72
|
+
"CrawlCallSchemaDescriptor",
|
|
73
|
+
"resolve_crawl_call_schema",
|
|
74
|
+
"resolve_crawl_call_schema_from_config",
|
|
75
|
+
]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Annotated, Any, TypeAlias, Union
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel, Field, TypeAdapter
|
|
6
|
+
|
|
7
|
+
from unique_search_proxy_core.crawlers.base import BaseCrawlerConfig, CrawlerType
|
|
8
|
+
from unique_search_proxy_core.crawlers.basic.schema import BasicCrawlerConfig
|
|
9
|
+
from unique_search_proxy_core.projection import URLS_FIELD, build_crawl_request_model
|
|
10
|
+
|
|
11
|
+
# Type of every supported crawler config; currently only the basic crawler.
CrawlerConfigTypes: TypeAlias = BasicCrawlerConfig

# Registry from crawler id to config class. Keys are the raw ``CrawlerType``
# values and therefore keep the enum's original (mixed) casing.
CRAWLER_NAME_TO_CONFIG: dict[str, type[BaseCrawlerConfig]] = {
    CrawlerType.BASIC.value: BasicCrawlerConfig,
}

# Validator used by ``parse_crawler_config``; built once at import time.
_crawler_config_adapter: TypeAdapter[CrawlerConfigTypes] = TypeAdapter(
    CrawlerConfigTypes,
)

# Fields dropped when a flat crawl request is dumped back into a config payload.
_CRAWL_REQUEST_EXCLUDED_FIELDS = {URLS_FIELD}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def parse_crawler_config(data: object) -> CrawlerConfigTypes:
    """Validate ``data`` with the crawler-config adapter and return the result."""
    parsed_config = _crawler_config_adapter.validate_python(data)
    return parsed_config
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def build_crawl_request_union() -> Any:
    """Assemble the type describing flat ``POST /v1/crawl`` bodies.

    One request model is built per registered crawler config. With exactly one
    registered crawler that model is returned directly; otherwise the models
    are combined into a union discriminated on ``crawler_type``.
    """
    request_models = tuple(
        build_crawl_request_model(config_cls)
        for config_cls in tuple(CRAWLER_NAME_TO_CONFIG.values())
    )
    if len(request_models) != 1:
        return Annotated[
            Union[request_models],  # type: ignore[valid-type]
            Field(discriminator="crawler_type"),
        ]
    # Single registered crawler: hand back its model rather than a union.
    return request_models[0]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Request type for flat crawl bodies, computed once at import time.
CrawlRequestTypes = build_crawl_request_union()
# Second name for the same object.
CrawlRequest = CrawlRequestTypes

# Validator used by ``parse_crawl_request``.
_crawl_request_adapter: TypeAdapter[BaseModel] = TypeAdapter(CrawlRequestTypes) # type: ignore[arg-type]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_crawl_request(data: object) -> BaseModel:
    """Validate ``data`` with the crawl-request adapter and return the parsed model."""
    parsed_request = _crawl_request_adapter.validate_python(data)
    return parsed_request
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def crawler_config_from_request(request: BaseModel) -> CrawlerConfigTypes:
    """Rebuild deployment config from a flat crawl request (excludes ``urls`` only).

    Args:
        request: Flat crawl request carrying a string ``crawler_type``
            discriminator.

    Returns:
        The crawler config parsed from the request's fields, minus the
        request-only ones.

    Raises:
        ValueError: If ``crawler_type`` is missing or not a string, or if no
            crawler config is registered under that id.
    """
    crawler_id = getattr(request, "crawler_type", None)
    if not isinstance(crawler_id, str):
        raise ValueError("Flat crawl request is missing crawler_type discriminator")

    # Registry keys keep the enum's original casing (e.g. "BasicProxyCrawler"),
    # so a lookup with the lowercased id never matched and every request was
    # rejected. Compare case-insensitively instead.
    wanted = crawler_id.lower()
    is_registered = any(name.lower() == wanted for name in CRAWLER_NAME_TO_CONFIG)
    if not is_registered:
        raise ValueError(f"No crawler config registered for {crawler_id!r}")

    payload = request.model_dump(
        exclude=_CRAWL_REQUEST_EXCLUDED_FIELDS,
        mode="python",
    )
    return parse_crawler_config(payload)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Crawler request merge helpers (no HTTP)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel
|
|
8
|
+
|
|
9
|
+
from unique_search_proxy_core.projection import build_crawl_request_model
|
|
10
|
+
|
|
11
|
+
# Field names shared by crawler configs and flat crawl requests.
CRAWLER_TYPE_FIELD = "crawler_type"
URLS_FIELD = "urls"
TIMEOUT_FIELD = "timeout"


def crawler_config_defaults(config: BaseModel) -> dict[str, Any]:
    """Collect a config's field values to use as per-request defaults.

    Every field declared on the config's class is included except the
    ``crawler_type`` discriminator.
    """
    return {
        name: getattr(config, name)
        for name in type(config).model_fields
        if name != CRAWLER_TYPE_FIELD
    }
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def merge_crawler_config_and_invocation(
    config: BaseModel,
    invocation: dict[str, Any],
) -> BaseModel:
    """Combine deployment defaults with caller/LLM arguments into a flat crawl request.

    Invocation values override config defaults on key collisions. The
    ``crawler_type`` discriminator is always taken from ``config`` when the
    request model declares that field. The merged mapping is validated against
    the request model derived from the config's class.
    """
    config_cls = type(config)
    request_model = build_crawl_request_model(config_cls)

    merged: dict[str, Any] = dict(crawler_config_defaults(config))
    merged.update(invocation)

    has_discriminator = CRAWLER_TYPE_FIELD in request_model.model_fields
    if has_discriminator:
        merged[CRAWLER_TYPE_FIELD] = getattr(config, CRAWLER_TYPE_FIELD)

    return request_model.model_validate(merged)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"CRAWLER_TYPE_FIELD",
|
|
41
|
+
"TIMEOUT_FIELD",
|
|
42
|
+
"URLS_FIELD",
|
|
43
|
+
"crawler_config_defaults",
|
|
44
|
+
"merge_crawler_config_and_invocation",
|
|
45
|
+
]
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from unique_search_proxy_core.schema import (
|
|
6
|
+
ErrorDetail,
|
|
7
|
+
ProxyErrorCode,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ProxyError(Exception):
    """Root of the proxy exception hierarchy; each error has a stable code.

    Subclasses override the class-level ``code``, ``status_code`` and
    ``retryable`` defaults.
    """

    code: ProxyErrorCode = ProxyErrorCode.BAD_REQUEST
    status_code: int = 400
    retryable: bool = False

    def __init__(
        self,
        message: str,
        *,
        engine: str | None = None,
        crawler: str | None = None,
        retryable: bool | None = None,
        details: list[dict[str, Any]] | None = None,
    ) -> None:
        """Record the message and optional context.

        ``retryable`` overrides the class-level default only when it is not
        ``None``.
        """
        super().__init__(message)
        self.message = message
        self.engine = engine
        self.crawler = crawler
        self.details = details
        if retryable is not None:
            self.retryable = retryable

    def to_detail(self) -> ErrorDetail:
        """Build the ``ErrorDetail`` payload describing this error."""
        detail_fields: dict[str, Any] = {
            "code": self.code.value,
            "message": self.message,
            "engine": self.engine,
            "crawler": self.crawler,
            "retryable": self.retryable,
            "details": self.details,
        }
        return ErrorDetail(**detail_fields)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class BadRequestProxyError(ProxyError):
    """Proxy error with code ``BAD_REQUEST`` and status code 400."""

    code = ProxyErrorCode.BAD_REQUEST
    status_code = 400
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ValidationProxyError(ProxyError):
    """Proxy error with code ``VALIDATION_ERROR`` and status code 422."""

    code = ProxyErrorCode.VALIDATION_ERROR
    status_code = 422
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class ForbiddenTargetError(ProxyError):
    """Proxy error with code ``FORBIDDEN_TARGET`` and status code 403."""

    code = ProxyErrorCode.FORBIDDEN_TARGET
    status_code = 403
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class RateLimitedError(ProxyError):
    """Proxy error with code ``RATE_LIMITED`` and status code 429.

    Instances are retryable unless the caller explicitly passes a non-``None``
    ``retryable`` value.
    """

    code = ProxyErrorCode.RATE_LIMITED
    status_code = 429

    def __init__(
        self,
        message: str,
        *,
        retry_after_seconds: int | None = None,
        **kwargs: Any,
    ) -> None:
        """Create the error.

        Args:
            message: Human-readable error message.
            retry_after_seconds: Optional delay to wait before retrying;
                stored on the instance as-is.
            **kwargs: Forwarded to ``ProxyError.__init__`` (``engine``,
                ``crawler``, ``retryable``, ``details``).
        """
        # Hard-coding ``retryable=True`` next to ``**kwargs`` raised
        # "TypeError: got multiple values for keyword argument 'retryable'"
        # when a caller supplied it, hiding the real error. Default it instead.
        if kwargs.get("retryable") is None:
            kwargs["retryable"] = True
        super().__init__(message, **kwargs)
        self.retry_after_seconds = retry_after_seconds
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class UpstreamError(ProxyError):
    """Proxy error with code ``UPSTREAM_ERROR`` and status code 502; retryable by default."""

    code = ProxyErrorCode.UPSTREAM_ERROR
    status_code = 502
    retryable = True
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class EngineNotConfiguredError(ProxyError):
    """Proxy error with code ``ENGINE_NOT_CONFIGURED`` and status code 503.

    Used for a provider that is not registered or not configured; ``kind``
    decides whether the provider is reported as the engine or the crawler.
    """

    code = ProxyErrorCode.ENGINE_NOT_CONFIGURED
    status_code = 503

    def __init__(self, provider: str, *, kind: str = "engine") -> None:
        """Build the message from ``kind`` and ``provider`` and keep both on the instance."""
        message = f"{kind.capitalize()} '{provider}' is not registered or not configured"
        # Only the slot matching ``kind`` is filled; any other kind leaves both ``None``.
        engine_name = provider if kind == "engine" else None
        crawler_name = provider if kind == "crawler" else None
        super().__init__(message, engine=engine_name, crawler=crawler_name)
        self.provider = provider
        self.kind = kind
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class UpstreamTimeoutError(ProxyError):
    """Proxy error with code ``UPSTREAM_TIMEOUT`` and status code 504; retryable by default."""

    code = ProxyErrorCode.UPSTREAM_TIMEOUT
    status_code = 504
    retryable = True
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class EmptySearchResultsError(ProxyError):
    """Proxy error with code ``EMPTY_SEARCH_RESULTS`` and status code 404; not retryable by default."""

    code = ProxyErrorCode.EMPTY_SEARCH_RESULTS
    status_code = 404
    retryable = False
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Deployment field policies: ``ExposableParam`` for optional provider knobs."""
|
|
2
|
+
|
|
3
|
+
from unique_search_proxy_core.param_policy.exposable_param import (
|
|
4
|
+
ExposableParam,
|
|
5
|
+
exposable_param_inner_type,
|
|
6
|
+
flatten_union_args,
|
|
7
|
+
is_exposable_param_field,
|
|
8
|
+
is_exposable_param_type,
|
|
9
|
+
unwrap_exposable_param_value,
|
|
10
|
+
)
|
|
11
|
+
from unique_search_proxy_core.param_policy.policy import QUERY_FIELD
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"QUERY_FIELD",
|
|
15
|
+
"ExposableParam",
|
|
16
|
+
"exposable_param_inner_type",
|
|
17
|
+
"flatten_union_args",
|
|
18
|
+
"is_exposable_param_field",
|
|
19
|
+
"is_exposable_param_type",
|
|
20
|
+
"unwrap_exposable_param_value",
|
|
21
|
+
]
|