parsimony-shared 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- parsimony_shared-0.7.0/.gitignore +40 -0
- parsimony_shared-0.7.0/PKG-INFO +6 -0
- parsimony_shared-0.7.0/parsimony_shared/__init__.py +23 -0
- parsimony_shared-0.7.0/parsimony_shared/cb_enumerate.py +184 -0
- parsimony_shared-0.7.0/pyproject.toml +21 -0
- parsimony_shared-0.7.0/tests/__init__.py +0 -0
- parsimony_shared-0.7.0/tests/test_smoke.py +55 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
__pycache__/
|
|
2
|
+
*.py[cod]
|
|
3
|
+
*$py.class
|
|
4
|
+
*.so
|
|
5
|
+
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
*.egg
|
|
11
|
+
|
|
12
|
+
.venv/
|
|
13
|
+
.env
|
|
14
|
+
.env.*
|
|
15
|
+
!.env.example
|
|
16
|
+
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
.ruff_cache/
|
|
20
|
+
.coverage
|
|
21
|
+
htmlcov/
|
|
22
|
+
coverage.xml
|
|
23
|
+
|
|
24
|
+
uv.lock
|
|
25
|
+
|
|
26
|
+
.vscode/
|
|
27
|
+
.council/
|
|
28
|
+
PLAN-*.md
|
|
29
|
+
.idea/
|
|
30
|
+
*.swp
|
|
31
|
+
.DS_Store
|
|
32
|
+
|
|
33
|
+
outputs/
|
|
34
|
+
logs/
|
|
35
|
+
# Recorded HTTP cassettes must never be committed — respx mocks are hand-authored
|
|
36
|
+
# from upstream API documentation. A pre-commit / CI regex scan is the belt; this
|
|
37
|
+
# ignore is the braces. Override per-file via `!` if you need a hand-authored
|
|
38
|
+
# fixture checked in.
|
|
39
|
+
packages/*/tests/fixtures/**
|
|
40
|
+
!packages/*/tests/fixtures/README.md
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""Internal shared helpers for official connector packages."""
|
|
2
|
+
|
|
3
|
+
from parsimony_shared.cb_enumerate import (
|
|
4
|
+
DEFAULT_RETRY_BACKOFFS_S,
|
|
5
|
+
DEFAULT_RETRY_STATUSES,
|
|
6
|
+
DESCRIPTION_CHAR_CAP,
|
|
7
|
+
MetadataCrawlConfig,
|
|
8
|
+
ThrottledJsonFetcher,
|
|
9
|
+
enumerate_descriptions,
|
|
10
|
+
parse_retry_after,
|
|
11
|
+
truncate_description,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"DESCRIPTION_CHAR_CAP",
|
|
16
|
+
"DEFAULT_RETRY_BACKOFFS_S",
|
|
17
|
+
"DEFAULT_RETRY_STATUSES",
|
|
18
|
+
"MetadataCrawlConfig",
|
|
19
|
+
"ThrottledJsonFetcher",
|
|
20
|
+
"enumerate_descriptions",
|
|
21
|
+
"parse_retry_after",
|
|
22
|
+
"truncate_description",
|
|
23
|
+
]
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Shared helpers for central-bank catalog enumerators."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
import httpx
|
|
12
|
+
|
|
13
|
+
# Default upper bound, in characters, used by the description helpers in this module.
DESCRIPTION_CHAR_CAP = 1500

# HTTP status codes retried by default.
DEFAULT_RETRY_STATUSES = frozenset((429, 500, 502, 503, 504))
# Default sleep, in seconds, before each successive retry (one entry per retry).
DEFAULT_RETRY_BACKOFFS_S: tuple[float, ...] = (1.0, 2.0, 4.0)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def truncate_description(text: str, *, cap: int = DESCRIPTION_CHAR_CAP) -> str:
    """Return ``text`` limited to at most ``cap`` characters.

    Args:
        text: String to cap. Any falsy value (``""``, ``None``) yields ``""``.
        cap: Maximum number of characters to keep. Values below zero are
            treated as zero.

    Returns:
        ``text`` unchanged when it already fits; otherwise its first ``cap``
        characters with trailing whitespace removed, so the result may be
        shorter than ``cap``.
    """
    if not text:
        return ""
    # A negative cap would make ``text[:cap]`` count from the end and only
    # drop the tail, returning far more than the caller asked for.
    limit = max(cap, 0)
    if len(text) <= limit:
        return text
    # The cut can land right after whitespace; strip it so the result is tidy.
    return text[:limit].rstrip()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def enumerate_descriptions(*parts: str, cap: int = DESCRIPTION_CHAR_CAP, sep: str = " ") -> str:
    """Combine description fragments into one capped string.

    Falsy and whitespace-only fragments are skipped; the remaining ones are
    stripped, joined with ``sep``, and the result is limited to ``cap``
    characters via :func:`truncate_description`.
    """
    fragments = []
    for part in parts:
        if not part:
            continue
        cleaned = part.strip()
        if cleaned:
            fragments.append(cleaned)
    combined = sep.join(fragments)
    return truncate_description(combined, cap=cap)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def parse_retry_after(response: httpx.Response) -> float | None:
    """Parse the ``Retry-After`` header into a delay in seconds.

    Both forms of the header are understood: a number of seconds, and an
    HTTP-date, which is converted to the time remaining until that date.

    Args:
        response: Response whose ``Retry-After`` header is inspected.

    Returns:
        A finite, non-negative number of seconds, or ``None`` when the header
        is absent, empty, or malformed. Negative and non-finite numbers
        (``nan``, ``inf``) count as malformed, because passing them on to
        ``asyncio.sleep`` would skip the wait or never return. A date in the
        past yields ``0.0``.
    """
    import math
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    raw = (response.headers.get("Retry-After") or "").strip()
    if not raw:
        return None

    try:
        seconds = float(raw)
    except ValueError:
        pass  # not numeric; fall through to the HTTP-date form
    else:
        if math.isfinite(seconds) and seconds >= 0:
            return seconds
        return None

    try:
        when = parsedate_to_datetime(raw)
    except (TypeError, ValueError, IndexError):
        return None
    if when.tzinfo is None:
        # HTTP-dates are expressed in GMT; treat a zone-less parse result as UTC.
        when = when.replace(tzinfo=timezone.utc)
    remaining = (when - datetime.now(timezone.utc)).total_seconds()
    return max(remaining, 0.0)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(frozen=True)
class MetadataCrawlConfig:
    """Throttling and retry policy for metadata enumeration crawls.

    Instances are immutable (``frozen=True``).

    Raises:
        ValueError: If ``concurrency`` is less than 1.
    """

    # Maximum number of fetches allowed in flight at once (sizes the fetcher's semaphore).
    concurrency: int = 4
    # Pause, in seconds, taken before each fetch is issued.
    inter_request_delay_s: float = 0.25
    # HTTP status codes that trigger a retry instead of an immediate failure.
    retry_statuses: frozenset[int] = field(default_factory=lambda: DEFAULT_RETRY_STATUSES)
    # Sleep, in seconds, before each successive retry; its length is the number of retries.
    retry_backoffs_s: tuple[float, ...] = DEFAULT_RETRY_BACKOFFS_S

    def __post_init__(self) -> None:
        """Reject a concurrency limit that would make every fetch block forever."""
        # A semaphore created with value 0 can never be acquired, and a negative
        # value is refused by asyncio.Semaphore only later, far from the config.
        if self.concurrency < 1:
            raise ValueError(f"concurrency must be at least 1, got {self.concurrency!r}")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class ThrottledJsonFetcher:
    """Async GET helper with semaphore throttling, a pre-request delay, and retries.

    Despite the name, results can be returned as parsed JSON (:meth:`get_json`),
    decoded text (:meth:`get_text`), or raw bytes (:meth:`get_content`). All
    three share :meth:`_get_with_retries` and signal a failed fetch by
    returning ``None`` rather than raising.
    """

    def __init__(
        self,
        client: httpx.AsyncClient,
        *,
        provider: str,
        config: MetadataCrawlConfig | None = None,
        logger: logging.Logger | None = None,
        accept_non_json: Callable[[httpx.Response], bool] | None = None,
    ) -> None:
        """Bind the fetcher to a client and a crawl policy.

        Args:
            client: Async HTTP client used for every request. This class never
                closes it.
            provider: Name prefixed to every log message.
            config: Throttling and retry policy; a default
                ``MetadataCrawlConfig()`` is used when omitted.
            logger: Logger for retry and failure messages; defaults to this
                module's logger.
            accept_non_json: Optional predicate called with every 200
                response. When it returns ``False`` the response is discarded
                and the fetch yields ``None`` without logging.
        """
        self._client = client
        self._provider = provider
        self._config = config or MetadataCrawlConfig()
        self._logger = logger or logging.getLogger(__name__)
        self._semaphore = asyncio.Semaphore(self._config.concurrency)
        self._accept_non_json = accept_non_json

    @property
    def config(self) -> MetadataCrawlConfig:
        """The crawl policy this fetcher was built with."""
        return self._config

    async def _get_with_retries(
        self,
        url: str,
        *,
        params: dict[str, str] | None = None,
        label: str | None = None,
    ) -> httpx.Response | None:
        """GET *url* with throttling and retries; ``None`` after exhausted attempts.

        Args:
            url: Address to request.
            params: Optional query parameters forwarded to the client.
            label: Name used for this request in log messages; defaults to *url*.

        Returns:
            The first response with status 200 that the ``accept_non_json``
            predicate (if any) accepts. ``None`` when the predicate rejects a
            200 response, when a non-retryable status is received, or when
            every attempt fails.
        """
        log_target = label or url
        # The semaphore slot is held for the whole retry sequence, backoff sleeps included.
        async with self._semaphore:
            await asyncio.sleep(self._config.inter_request_delay_s)
            last_status: int | None = None
            last_error: str | None = None
            # One attempt per configured backoff plus a final attempt; the
            # trailing None marks the attempt after which no retry follows.
            for attempt, backoff in enumerate((*self._config.retry_backoffs_s, None)):
                try:
                    response = await self._client.get(url, params=params)
                except httpx.HTTPError as exc:
                    last_error = f"{type(exc).__name__}: {exc}"
                    if backoff is None:
                        break
                    # Mirror the status-code retry path so transport retries are visible too.
                    self._logger.info(
                        "%s %s raised %s (attempt %d); retrying in %.1fs",
                        self._provider,
                        log_target,
                        type(exc).__name__,
                        attempt + 1,
                        backoff,
                    )
                    await asyncio.sleep(backoff)
                    continue

                if response.status_code == 200:
                    if self._accept_non_json is not None and not self._accept_non_json(response):
                        return None
                    return response

                last_status = response.status_code
                if response.status_code in self._config.retry_statuses and backoff is not None:
                    hinted = parse_retry_after(response)
                    # Trust the server's hint only when it is a usable delay: a NaN,
                    # infinite, zero or negative value falls back to the configured
                    # backoff (every comparison with NaN is False).
                    wait = hinted if hinted is not None and 0 < hinted < float("inf") else backoff
                    self._logger.info(
                        "%s %s returned %s (attempt %d); retrying in %.1fs",
                        self._provider,
                        log_target,
                        response.status_code,
                        attempt + 1,
                        wait,
                    )
                    await asyncio.sleep(wait)
                    continue
                # Non-retryable status, or a retryable one on the final attempt.
                break

            self._logger.warning(
                "%s fetch failed for %s after retries (last_status=%s, last_error=%s)",
                self._provider,
                log_target,
                last_status,
                last_error,
            )
            return None

    async def get_json(
        self,
        url: str,
        *,
        params: dict[str, str] | None = None,
        label: str | None = None,
    ) -> Any | None:
        """GET *url* and return parsed JSON, or ``None`` after exhausted retries.

        ``None`` is also returned, with a warning logged, when the body of a
        successful response cannot be decoded as JSON.
        """
        log_target = label or url
        response = await self._get_with_retries(url, params=params, label=label)
        if response is None:
            return None
        try:
            return response.json()
        except ValueError as exc:
            self._logger.warning("%s %s returned non-JSON body: %s", self._provider, log_target, exc)
            return None

    async def get_text(
        self,
        url: str,
        *,
        params: dict[str, str] | None = None,
        label: str | None = None,
    ) -> str | None:
        """GET *url* and return response text, or ``None`` after exhausted retries."""
        response = await self._get_with_retries(url, params=params, label=label)
        if response is None:
            return None
        return response.text

    async def get_content(
        self,
        url: str,
        *,
        params: dict[str, str] | None = None,
        label: str | None = None,
    ) -> bytes | None:
        """GET *url* and return raw response bytes, or ``None`` after exhausted retries."""
        response = await self._get_with_retries(url, params=params, label=label)
        if response is None:
            return None
        return response.content
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
__all__ = [
|
|
176
|
+
"DESCRIPTION_CHAR_CAP",
|
|
177
|
+
"DEFAULT_RETRY_BACKOFFS_S",
|
|
178
|
+
"DEFAULT_RETRY_STATUSES",
|
|
179
|
+
"MetadataCrawlConfig",
|
|
180
|
+
"ThrottledJsonFetcher",
|
|
181
|
+
"enumerate_descriptions",
|
|
182
|
+
"parse_retry_after",
|
|
183
|
+
"truncate_description",
|
|
184
|
+
]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "parsimony-shared"
|
|
3
|
+
version = "0.7.0"
|
|
4
|
+
description = "Shared HTTP and metadata-enumeration helpers for the parsimony central-bank connector packages."
|
|
5
|
+
requires-python = ">=3.11"
|
|
6
|
+
dependencies = [
|
|
7
|
+
"httpx>=0.27",
|
|
8
|
+
]
|
|
9
|
+
|
|
10
|
+
[build-system]
|
|
11
|
+
requires = ["hatchling"]
|
|
12
|
+
build-backend = "hatchling.build"
|
|
13
|
+
|
|
14
|
+
[tool.hatch.build.targets.wheel]
|
|
15
|
+
packages = ["parsimony_shared"]
|
|
16
|
+
|
|
17
|
+
[tool.parsimony.conformance]
|
|
18
|
+
# Not a `parsimony.providers` plugin — a shared helper library consumed by the
|
|
19
|
+
# central-bank connectors (bde/bdf/bdp/boj/destatis). Opt out of the release
|
|
20
|
+
# workflow's plugin-conformance gate (it has no provider entry point).
|
|
21
|
+
skip = true
|
|
File without changes
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Offline smoke tests for parsimony-shared.
|
|
2
|
+
|
|
3
|
+
parsimony-shared is a shared helper library, not a ``parsimony.providers``
|
|
4
|
+
plugin, so it carries no conformance test. These checks pin the public helper
|
|
5
|
+
surface so the release build has real (offline) coverage instead of failing
|
|
6
|
+
pytest's "no tests collected" exit code.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import dataclasses
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
import parsimony_shared as ps
|
|
17
|
+
from parsimony_shared import (
|
|
18
|
+
DESCRIPTION_CHAR_CAP,
|
|
19
|
+
MetadataCrawlConfig,
|
|
20
|
+
enumerate_descriptions,
|
|
21
|
+
parse_retry_after,
|
|
22
|
+
truncate_description,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_public_api_exports() -> None:
    """Every name advertised in the package's ``__all__`` must resolve on the package."""
    advertised = ps.__all__
    for name in advertised:
        is_present = hasattr(ps, name)
        assert is_present, f"missing export: {name}"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def test_truncate_description() -> None:
    """Pin exact results: a length-only check would also pass for an empty string."""
    assert truncate_description("") == ""
    assert truncate_description("short") == "short"

    # Over-long input is cut to exactly the cap.
    out = truncate_description("x" * (DESCRIPTION_CHAR_CAP + 50))
    assert out == "x" * DESCRIPTION_CHAR_CAP

    # Input that is exactly at the cap is returned untouched.
    exact = "x" * DESCRIPTION_CHAR_CAP
    assert truncate_description(exact) == exact

    # Whitespace left dangling by the cut is stripped.
    assert truncate_description("ab   cd", cap=4) == "ab"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def test_enumerate_descriptions_joins_and_caps() -> None:
    """Blank fragments are dropped, the rest joined, and the total capped exactly."""
    assert enumerate_descriptions("a", "", " ", "b") == "a b"
    # No fragments at all yields an empty string.
    assert enumerate_descriptions() == ""
    # Fragments are stripped before being joined with a custom separator.
    assert enumerate_descriptions(" a ", "b", sep=" | ") == "a | b"

    # Assert the exact capped value; a length-only check would accept "".
    capped = enumerate_descriptions("y" * 1000, "z" * 1000, cap=100)
    assert capped == "y" * 100
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def test_metadata_crawl_config_defaults_are_frozen() -> None:
    """Defaults are usable and the config refuses attribute assignment."""
    config = MetadataCrawlConfig()

    assert config.concurrency >= 1
    assert 429 in config.retry_statuses

    with pytest.raises(dataclasses.FrozenInstanceError):
        setattr(config, "concurrency", 99)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def test_parse_retry_after() -> None:
    """Numeric header parses to a float; a missing or unparseable header gives ``None``."""
    numeric = httpx.Response(429, headers={"Retry-After": "5"})
    absent = httpx.Response(429)
    garbage = httpx.Response(429, headers={"Retry-After": "soon"})

    assert parse_retry_after(numeric) == 5.0
    assert parse_retry_after(absent) is None
    assert parse_retry_after(garbage) is None
|