parsimony-shared 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,40 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ *$py.class
4
+ *.so
5
+
6
+ .Python
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ *.egg
11
+
12
+ .venv/
13
+ .env
14
+ .env.*
15
+ !.env.example
16
+
17
+ .pytest_cache/
18
+ .mypy_cache/
19
+ .ruff_cache/
20
+ .coverage
21
+ htmlcov/
22
+ coverage.xml
23
+
24
+ uv.lock
25
+
26
+ .vscode/
27
+ .council/
28
+ PLAN-*.md
29
+ .idea/
30
+ *.swp
31
+ .DS_Store
32
+
33
+ outputs/
34
+ logs/
35
+ # Recorded HTTP cassettes must never be committed — respx mocks are hand-authored
36
+ # from upstream API documentation. A pre-commit / CI regex scan is the belt; this
37
+ # ignore is the braces. Override per-file via `!` if you need a hand-authored
38
+ # fixture checked in.
39
+ packages/*/tests/fixtures/**
40
+ !packages/*/tests/fixtures/README.md
@@ -0,0 +1,6 @@
1
+ Metadata-Version: 2.4
2
+ Name: parsimony-shared
3
+ Version: 0.0.1
4
+ Summary: Shared HTTP and metadata-enumeration helpers for the parsimony central-bank connector packages.
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: httpx>=0.27
@@ -0,0 +1,23 @@
1
+ """Internal shared helpers for official connector packages."""
2
+
3
+ from parsimony_shared.cb_enumerate import (
4
+ DEFAULT_RETRY_BACKOFFS_S,
5
+ DEFAULT_RETRY_STATUSES,
6
+ DESCRIPTION_CHAR_CAP,
7
+ MetadataCrawlConfig,
8
+ ThrottledJsonFetcher,
9
+ enumerate_descriptions,
10
+ parse_retry_after,
11
+ truncate_description,
12
+ )
13
+
14
# Public surface re-exported from ``parsimony_shared.cb_enumerate``
# (constants first, then classes, then functions; each group alphabetical).
__all__ = [
    "DEFAULT_RETRY_BACKOFFS_S",
    "DEFAULT_RETRY_STATUSES",
    "DESCRIPTION_CHAR_CAP",
    "MetadataCrawlConfig",
    "ThrottledJsonFetcher",
    "enumerate_descriptions",
    "parse_retry_after",
    "truncate_description",
]
@@ -0,0 +1,181 @@
1
+ """Shared helpers for central-bank catalog enumerators."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import time
7
+ from collections.abc import Callable
8
+ from dataclasses import dataclass, field
9
+ from typing import Any
10
+
11
+ import httpx
12
+
13
# Default maximum number of characters kept by the description helpers.
DESCRIPTION_CHAR_CAP = 1500

# HTTP status codes that are retried by default.
DEFAULT_RETRY_STATUSES = frozenset({429, 500, 502, 503, 504})
# Default pause, in seconds, before each successive retry attempt.
DEFAULT_RETRY_BACKOFFS_S: tuple[float, ...] = (1.0, 2.0, 4.0)


def truncate_description(text: str, *, cap: int = DESCRIPTION_CHAR_CAP) -> str:
    """Cap *text* at ``cap`` characters.

    Args:
        text: The string to shorten. A falsy value yields ``""``.
        cap: Maximum number of characters to keep. A value of zero or less
            yields ``""``.

    Returns:
        *text* unchanged when it already fits within ``cap``; otherwise the
        first ``cap`` characters with any trailing whitespace exposed by the
        cut removed (so the result may be shorter than ``cap``).
    """
    # A negative cap would make ``text[:cap]`` drop only the tail of the string
    # instead of capping it, so treat every non-positive cap like ``cap=0``.
    if not text or cap <= 0:
        return ""
    if len(text) <= cap:
        return text
    return text[:cap].rstrip()
26
+
27
+
28
def enumerate_descriptions(*parts: str, cap: int = DESCRIPTION_CHAR_CAP, sep: str = " ") -> str:
    """Combine description fragments into one capped string.

    Fragments that are empty or whitespace-only are skipped; the rest are
    stripped, joined with ``sep``, and the result is passed through
    ``truncate_description`` with the given ``cap``.
    """
    fragments: list[str] = []
    for part in parts:
        if not part:
            continue
        cleaned = part.strip()
        if cleaned:
            fragments.append(cleaned)
    combined = sep.join(fragments)
    return truncate_description(combined, cap=cap)
32
+
33
+
34
def parse_retry_after(response: httpx.Response) -> float | None:
    """Parse the ``Retry-After`` header into a delay in seconds.

    Both forms of the header are understood: a number of seconds, and an
    HTTP-date (converted to the time remaining from now, never below zero).

    Args:
        response: The response whose headers are inspected.

    Returns:
        A finite, non-negative number of seconds, or ``None`` when the header
        is absent, empty, unparseable, negative, or not finite.
    """
    import math
    from datetime import datetime, timezone
    from email.utils import parsedate_to_datetime

    raw = response.headers.get("Retry-After")
    if not raw:
        return None
    raw = raw.strip()

    try:
        seconds = float(raw)
    except ValueError:
        seconds = None
    if seconds is not None:
        # ``float`` happily parses "-1", "nan" and "inf"; none of those is a
        # usable delay and each would make ``time.sleep`` raise.
        if math.isfinite(seconds) and seconds >= 0:
            return seconds
        return None

    # Not numeric: try the HTTP-date form.
    try:
        retry_at = parsedate_to_datetime(raw)
    except (TypeError, ValueError):
        return None
    if retry_at.tzinfo is None:
        # A date without usable zone information is interpreted as UTC.
        retry_at = retry_at.replace(tzinfo=timezone.utc)
    remaining = (retry_at - datetime.now(timezone.utc)).total_seconds()
    return max(0.0, remaining)
43
+
44
+
45
def _default_retry_statuses() -> frozenset[int]:
    """Return the module-level default set of retryable HTTP status codes."""
    return DEFAULT_RETRY_STATUSES


@dataclass(frozen=True)
class MetadataCrawlConfig:
    """Immutable throttling and retry settings for a metadata crawl."""

    # Seconds to pause before a fetch is started.
    inter_request_delay_s: float = 0.25
    # HTTP status codes that qualify a response for another attempt.
    retry_statuses: frozenset[int] = field(default_factory=_default_retry_statuses)
    # Seconds to wait before each retry; its length bounds the number of retries.
    retry_backoffs_s: tuple[float, ...] = DEFAULT_RETRY_BACKOFFS_S
52
+
53
+
54
class ThrottledJsonFetcher:
    """Synchronous GET helper with an inter-request delay and retries.

    Every fetch sleeps for ``config.inter_request_delay_s`` first, then issues
    the GET. Transport errors (``httpx.HTTPError``) and responses whose status
    is in ``config.retry_statuses`` are retried, once per entry in
    ``config.retry_backoffs_s``. All public ``get_*`` methods return ``None``
    instead of raising when the fetch ultimately fails.
    """

    def __init__(
        self,
        client: httpx.Client,
        *,
        provider: str,
        config: MetadataCrawlConfig | None = None,
        logger: logging.Logger | None = None,
        accept_non_json: Callable[[httpx.Response], bool] | None = None,
    ) -> None:
        """Store the collaborators used by the fetch methods.

        Args:
            client: The ``httpx`` client used to issue requests.
            provider: Name used as a prefix in log messages.
            config: Throttle/retry policy; a default ``MetadataCrawlConfig``
                is created when omitted.
            logger: Logger for retry and failure messages; defaults to this
                module's logger.
            accept_non_json: Optional predicate called with every 200
                response. When it returns a falsy value the fetch yields
                ``None`` without logging.
        """
        self._client = client
        self._provider = provider
        self._config = config or MetadataCrawlConfig()
        self._logger = logger or logging.getLogger(__name__)
        self._accept_non_json = accept_non_json

    @property
    def config(self) -> MetadataCrawlConfig:
        """The throttle/retry policy in effect for this fetcher."""
        return self._config

    def _get_with_retries(
        self,
        url: str,
        *,
        params: dict[str, str] | None = None,
        label: str | None = None,
    ) -> httpx.Response | None:
        """GET *url* with throttling and retries; ``None`` after exhausted attempts.

        Args:
            url: Target URL.
            params: Optional query parameters forwarded to the client.
            label: Optional name used instead of *url* in log messages.

        Returns:
            The 200 response, or ``None`` when the response was rejected by
            ``accept_non_json``, a non-retryable status was received, or all
            attempts were used up.
        """
        log_target = label or url
        time.sleep(self._config.inter_request_delay_s)
        last_status: int | None = None
        last_error: str | None = None
        # The trailing ``None`` marks the final attempt: no backoff follows it.
        for attempt, backoff in enumerate((*self._config.retry_backoffs_s, None)):
            try:
                response = self._client.get(url, params=params)
            except httpx.HTTPError as exc:
                last_error = f"{type(exc).__name__}: {exc}"
                if backoff is None:
                    break
                time.sleep(backoff)
                continue

            if response.status_code == 200:
                if self._accept_non_json is not None and not self._accept_non_json(response):
                    return None
                return response

            last_status = response.status_code
            if response.status_code in self._config.retry_statuses and backoff is not None:
                retry_after = parse_retry_after(response)
                # Honour the server's hint only when it is a positive, finite
                # number. Absent, zero, negative, NaN or infinite values fall
                # back to the configured backoff so ``time.sleep`` cannot raise
                # on a hostile or buggy ``Retry-After`` header.
                if retry_after is not None and 0 < retry_after < float("inf"):
                    wait = retry_after
                else:
                    wait = backoff
                self._logger.info(
                    "%s %s returned %s (attempt %d); retrying in %.1fs",
                    self._provider,
                    log_target,
                    response.status_code,
                    attempt + 1,
                    wait,
                )
                time.sleep(wait)
                continue
            # Non-retryable status, or retryable status on the final attempt.
            break

        self._logger.warning(
            "%s fetch failed for %s after retries (last_status=%s, last_error=%s)",
            self._provider,
            log_target,
            last_status,
            last_error,
        )
        return None

    def get_json(
        self,
        url: str,
        *,
        params: dict[str, str] | None = None,
        label: str | None = None,
    ) -> Any | None:
        """GET *url* and return parsed JSON.

        Returns ``None`` when the fetch fails or the body cannot be decoded as
        JSON (the latter is logged as a warning).
        """
        log_target = label or url
        response = self._get_with_retries(url, params=params, label=label)
        if response is None:
            return None
        try:
            return response.json()
        except ValueError as exc:
            self._logger.warning("%s %s returned non-JSON body: %s", self._provider, log_target, exc)
            return None

    def get_text(
        self,
        url: str,
        *,
        params: dict[str, str] | None = None,
        label: str | None = None,
    ) -> str | None:
        """GET *url* and return the response text, or ``None`` when the fetch fails."""
        response = self._get_with_retries(url, params=params, label=label)
        if response is None:
            return None
        return response.text

    def get_content(
        self,
        url: str,
        *,
        params: dict[str, str] | None = None,
        label: str | None = None,
    ) -> bytes | None:
        """GET *url* and return the raw response bytes, or ``None`` when the fetch fails."""
        response = self._get_with_retries(url, params=params, label=label)
        if response is None:
            return None
        return response.content
170
+
171
+
172
# Names exported by ``from parsimony_shared.cb_enumerate import *``
# (constants first, then classes, then functions; each group alphabetical).
__all__ = [
    "DEFAULT_RETRY_BACKOFFS_S",
    "DEFAULT_RETRY_STATUSES",
    "DESCRIPTION_CHAR_CAP",
    "MetadataCrawlConfig",
    "ThrottledJsonFetcher",
    "enumerate_descriptions",
    "parse_retry_after",
    "truncate_description",
]
@@ -0,0 +1,21 @@
1
+ [project]
2
+ name = "parsimony-shared"
3
+ version = "0.0.1"
4
+ description = "Shared HTTP and metadata-enumeration helpers for the parsimony central-bank connector packages."
5
+ requires-python = ">=3.11"
6
+ dependencies = [
7
+ "httpx>=0.27",
8
+ ]
9
+
10
+ [build-system]
11
+ requires = ["hatchling"]
12
+ build-backend = "hatchling.build"
13
+
14
+ [tool.hatch.build.targets.wheel]
15
+ packages = ["parsimony_shared"]
16
+
17
+ [tool.parsimony.conformance]
18
+ # Not a `parsimony.providers` plugin — a shared helper library consumed by the
19
+ # central-bank connectors (bde/bdf/bdp/boj/destatis). Opt out of the release
20
+ # workflow's plugin-conformance gate (it has no provider entry point).
21
+ skip = true
File without changes
@@ -0,0 +1,54 @@
1
+ """Offline smoke tests for parsimony-shared.
2
+
3
+ parsimony-shared is a shared helper library, not a ``parsimony.providers``
4
+ plugin, so it carries no conformance test. These checks pin the public helper
5
+ surface so the release build has real (offline) coverage instead of failing
6
+ pytest's "no tests collected" exit code.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import dataclasses
12
+
13
+ import httpx
14
+ import parsimony_shared as ps
15
+ import pytest
16
+ from parsimony_shared import (
17
+ DESCRIPTION_CHAR_CAP,
18
+ MetadataCrawlConfig,
19
+ enumerate_descriptions,
20
+ parse_retry_after,
21
+ truncate_description,
22
+ )
23
+
24
+
25
def test_public_api_exports() -> None:
    """Every name listed in the package's ``__all__`` must resolve on the package."""
    for exported in ps.__all__:
        is_present = hasattr(ps, exported)
        assert is_present, f"missing export: {exported}"
28
+
29
+
30
def test_truncate_description() -> None:
    """Pin empty-input, pass-through, and exact truncation behaviour."""
    assert truncate_description("") == ""
    assert truncate_description("short") == "short"
    # Over-long input with nothing to strip is cut to exactly the cap; a bare
    # length bound would also accept an implementation that returns "".
    out = truncate_description("x" * (DESCRIPTION_CHAR_CAP + 50))
    assert out == "x" * DESCRIPTION_CHAR_CAP
    # Whitespace left dangling at the cut point is removed.
    assert truncate_description("abc   def", cap=5) == "abc"
35
+
36
+
37
def test_enumerate_descriptions_joins_and_caps() -> None:
    """Pin fragment filtering, the separator, and the exact capped output."""
    # Empty and whitespace-only fragments are dropped before joining.
    assert enumerate_descriptions("a", "", " ", "b") == "a b"
    assert enumerate_descriptions() == ""
    # Fragments are stripped and joined with the caller's separator.
    assert enumerate_descriptions(" a ", "b", sep="; ") == "a; b"
    # The cap applies to the joined string, so only the first fragment survives.
    capped = enumerate_descriptions("y" * 1000, "z" * 1000, cap=100)
    assert capped == "y" * 100
41
+
42
+
43
def test_metadata_crawl_config_defaults_are_frozen() -> None:
    """Defaults are sane and the config rejects attribute assignment."""
    defaults = MetadataCrawlConfig()
    assert defaults.inter_request_delay_s >= 0
    assert 429 in defaults.retry_statuses
    # Assigning to any field of a frozen dataclass must raise.
    with pytest.raises(dataclasses.FrozenInstanceError):
        setattr(defaults, "inter_request_delay_s", 99.0)
49
+
50
+
51
def test_parse_retry_after() -> None:
    """Numeric headers parse to floats; absent or unparseable ones give ``None``."""
    numeric = httpx.Response(429, headers={"Retry-After": "5"})
    absent = httpx.Response(429)
    malformed = httpx.Response(429, headers={"Retry-After": "soon"})

    assert parse_retry_after(numeric) == 5.0
    assert parse_retry_after(absent) is None
    assert parse_retry_after(malformed) is None