corpus-forge 0.1.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- corpus_forge/__init__.py +3 -0
- corpus_forge/__main__.py +6 -0
- corpus_forge/_http.py +193 -0
- corpus_forge/_ml_device.py +57 -0
- corpus_forge/alembic/__init__.py +0 -0
- corpus_forge/alembic/env.py +120 -0
- corpus_forge/alembic/script.py.mako +30 -0
- corpus_forge/alembic/versions/.gitkeep +0 -0
- corpus_forge/alembic/versions/0001_core.py +353 -0
- corpus_forge/alembic/versions/0002_chunk_content_hash.py +53 -0
- corpus_forge/alembic/versions/0003_views.py +74 -0
- corpus_forge/alembic/versions/0004_sync.py +105 -0
- corpus_forge/alembic/versions/0005_fts.py +75 -0
- corpus_forge/alembic/versions/0006_writes_and_feedback.py +135 -0
- corpus_forge/alembic/versions/0007_chat_templates.py +60 -0
- corpus_forge/alembic/versions/0008_feedback_sessions.py +86 -0
- corpus_forge/alembic/versions/0009_feedback_host_default.py +70 -0
- corpus_forge/alembic/versions/0010_document_label_confidence.py +57 -0
- corpus_forge/alembic/versions/0011_image_embeddings.py +69 -0
- corpus_forge/backends/base.py +285 -0
- corpus_forge/backends/postgres.py +2655 -0
- corpus_forge/backends/sqlite.py +3225 -0
- corpus_forge/backends/sqlite_vec_loader.py +47 -0
- corpus_forge/chunkers/__init__.py +39 -0
- corpus_forge/chunkers/base.py +216 -0
- corpus_forge/chunkers/cdc.py +241 -0
- corpus_forge/chunkers/code.py +465 -0
- corpus_forge/chunkers/conversation.py +9 -0
- corpus_forge/chunkers/markdown.py +9 -0
- corpus_forge/classifiers/__init__.py +137 -0
- corpus_forge/classifiers/base.py +202 -0
- corpus_forge/classifiers/llm.py +254 -0
- corpus_forge/classifiers/registry.py +113 -0
- corpus_forge/classifiers/rule_based.py +342 -0
- corpus_forge/cli.py +1914 -0
- corpus_forge/config.py +624 -0
- corpus_forge/curation/__init__.py +36 -0
- corpus_forge/curation/prompts.py +54 -0
- corpus_forge/curation/selector.py +774 -0
- corpus_forge/daemon.py +77 -0
- corpus_forge/doctor/__init__.py +34 -0
- corpus_forge/doctor/checks.py +146 -0
- corpus_forge/embed.py +320 -0
- corpus_forge/embedders/base.py +62 -0
- corpus_forge/embedders/clip_local.py +123 -0
- corpus_forge/embedders/clip_remote.py +133 -0
- corpus_forge/embedders/multimodal.py +83 -0
- corpus_forge/embedders/openai.py +133 -0
- corpus_forge/embedders/registry.py +58 -0
- corpus_forge/embedders/sentence_transformers.py +124 -0
- corpus_forge/enrichers/__init__.py +157 -0
- corpus_forge/enrichers/base.py +319 -0
- corpus_forge/enrichers/qwen_local.py +180 -0
- corpus_forge/enrichers/qwen_remote.py +195 -0
- corpus_forge/enrichers/registry.py +61 -0
- corpus_forge/estimate.py +601 -0
- corpus_forge/eval/__init__.py +39 -0
- corpus_forge/eval/dataset.py +167 -0
- corpus_forge/eval/datasets/forge_self.corpus.md +84 -0
- corpus_forge/eval/datasets/forge_self.jsonl +28 -0
- corpus_forge/eval/metrics.py +171 -0
- corpus_forge/eval/runner.py +309 -0
- corpus_forge/export.py +251 -0
- corpus_forge/exports/huggingface.py +66 -0
- corpus_forge/extractors/__init__.py +16 -0
- corpus_forge/extractors/audio.py +86 -0
- corpus_forge/extractors/base.py +89 -0
- corpus_forge/extractors/code.py +243 -0
- corpus_forge/extractors/csv.py +74 -0
- corpus_forge/extractors/epub.py +73 -0
- corpus_forge/extractors/html.py +47 -0
- corpus_forge/extractors/image.py +98 -0
- corpus_forge/extractors/notebook.py +82 -0
- corpus_forge/extractors/office.py +82 -0
- corpus_forge/extractors/passthrough.py +43 -0
- corpus_forge/extractors/pdf.py +427 -0
- corpus_forge/extractors/plaintext.py +54 -0
- corpus_forge/extractors/registry.py +307 -0
- corpus_forge/extractors/structured.py +122 -0
- corpus_forge/extractors/subtitle.py +76 -0
- corpus_forge/extractors/video.py +157 -0
- corpus_forge/identity.py +30 -0
- corpus_forge/ingest.py +634 -0
- corpus_forge/mcp/__init__.py +25 -0
- corpus_forge/mcp/server.py +1557 -0
- corpus_forge/mcp/templates.py +270 -0
- corpus_forge/mcp/transport.py +33 -0
- corpus_forge/mcp/writes.py +706 -0
- corpus_forge/py.typed +0 -0
- corpus_forge/retrieval/__init__.py +29 -0
- corpus_forge/retrieval/fusion.py +89 -0
- corpus_forge/retrieval/normalize.py +72 -0
- corpus_forge/retrieval/rerank/__init__.py +35 -0
- corpus_forge/retrieval/rerank/base.py +64 -0
- corpus_forge/retrieval/rerank/cross_encoder.py +201 -0
- corpus_forge/retrieval/rerank/ollama.py +192 -0
- corpus_forge/retrieval/retriever.py +231 -0
- corpus_forge/retrieval/types.py +85 -0
- corpus_forge/schema/migrate.py +134 -0
- corpus_forge/schema/per_embedder.sql.tmpl +9 -0
- corpus_forge/setup/__init__.py +34 -0
- corpus_forge/setup/questions.toml +331 -0
- corpus_forge/setup/wizard.py +438 -0
- corpus_forge/sources/_flatten.py +66 -0
- corpus_forge/sources/_session_link.py +26 -0
- corpus_forge/sources/base.py +106 -0
- corpus_forge/sources/chatgpt_export.py +184 -0
- corpus_forge/sources/claude_code.py +119 -0
- corpus_forge/sources/codex_cli.py +90 -0
- corpus_forge/sources/filesystem.py +243 -0
- corpus_forge/sources/gemini_cli.py +98 -0
- corpus_forge/sources/jsonl_chat.py +94 -0
- corpus_forge/sources/markdown_vault.py +74 -0
- corpus_forge/sources/opencode.py +92 -0
- corpus_forge/sync/__init__.py +5 -0
- corpus_forge/sync/cloud.py +67 -0
- corpus_forge/sync/conflicts.py +75 -0
- corpus_forge/sync/echo.py +78 -0
- corpus_forge/sync/engine.py +59 -0
- corpus_forge/sync/fs.py +99 -0
- corpus_forge/sync/pull.py +155 -0
- corpus_forge/sync/push.py +248 -0
- corpus_forge/templates/__init__.py +86 -0
- corpus_forge/templates/builtins/__init__.py +1 -0
- corpus_forge/templates/builtins/alpaca.py +23 -0
- corpus_forge/templates/builtins/chatml.py +19 -0
- corpus_forge/templates/builtins/gemma.py +21 -0
- corpus_forge/templates/builtins/llama3.py +22 -0
- corpus_forge/templates/builtins/qwen.py +19 -0
- corpus_forge/templates/builtins/vicuna.py +23 -0
- corpus_forge/templates/hf.py +39 -0
- corpus_forge/templates/tools.py +25 -0
- corpus_forge/update/__init__.py +30 -0
- corpus_forge/update/channels.py +204 -0
- corpus_forge/update/version_check.py +202 -0
- corpus_forge/vlm/__init__.py +45 -0
- corpus_forge/vlm/base.py +146 -0
- corpus_forge/vlm/mistral.py +127 -0
- corpus_forge/vlm/ollama.py +138 -0
- corpus_forge/vlm/registry.py +143 -0
- corpus_forge/whisper/__init__.py +45 -0
- corpus_forge/whisper/base.py +138 -0
- corpus_forge/whisper/local.py +186 -0
- corpus_forge/whisper/registry.py +145 -0
- corpus_forge/whisper/remote.py +109 -0
- corpus_forge-0.1.0b2.dist-info/METADATA +758 -0
- corpus_forge-0.1.0b2.dist-info/RECORD +150 -0
- corpus_forge-0.1.0b2.dist-info/WHEEL +4 -0
- corpus_forge-0.1.0b2.dist-info/entry_points.txt +2 -0
- corpus_forge-0.1.0b2.dist-info/licenses/LICENSE +202 -0
corpus_forge/__init__.py
ADDED
corpus_forge/__main__.py
ADDED
corpus_forge/_http.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""Shared HTTP transport for remote model-backend clients.
|
|
2
|
+
|
|
3
|
+
Every remote model integration in corpus-forge (VLM, Whisper, code
|
|
4
|
+
enricher, LLM classifier, multi-modal embedder) speaks to a JSON HTTP
|
|
5
|
+
endpoint and maps the same set of failure modes onto a family-specific
|
|
6
|
+
error triad — ``<Family>UnavailableError`` / ``TimeoutError`` /
|
|
7
|
+
``ResponseError``.
|
|
8
|
+
|
|
9
|
+
This module owns the mapping in one place. Each family declares the
|
|
10
|
+
triad with an :class:`HttpErrors` bundle and calls :func:`request_json`,
|
|
11
|
+
which:
|
|
12
|
+
|
|
13
|
+
- catches the standard ``requests`` exception ladder (``Timeout`` /
|
|
14
|
+
``ConnectionError`` / ``RequestException``) and raises the matching
|
|
15
|
+
family-typed error;
|
|
16
|
+
- optionally promotes ``401``/``403`` to the family's "unavailable"
|
|
17
|
+
bucket (API-key rejection is a configuration failure, not a flake);
|
|
18
|
+
- treats non-2xx HTTP, malformed JSON, non-object JSON, and missing
|
|
19
|
+
required keys as response errors with a truncated body snippet in the
|
|
20
|
+
message.
|
|
21
|
+
|
|
22
|
+
Tests mock ``requests.post`` / ``requests.get`` directly. This module
|
|
23
|
+
calls them by name (not via ``requests.request``) so existing
|
|
24
|
+
``patch("requests.post", ...)`` contracts continue to work.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
from collections.abc import Mapping, Sequence
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from typing import Any, Literal
|
|
32
|
+
|
|
33
|
+
__all__ = ["HttpErrors", "bearer_headers", "request_json"]
|
|
34
|
+
|
|
35
|
+
# Snippet length for response bodies inside error messages. Long enough
|
|
36
|
+
# to be informative, short enough to keep audit logs scannable.
|
|
37
|
+
_BODY_SNIPPET = 200
|
|
38
|
+
|
|
39
|
+
Method = Literal["GET", "POST"]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class HttpErrors:
    """Bundle of the exception types one model family uses for HTTP failures.

    A family builds one instance of this bundle and passes it to
    :func:`request_json`, which selects the field matching the failure it
    observed and raises that class with a descriptive message. Instances
    are immutable.
    """

    # Raised when the endpoint cannot be used at all: connection failure,
    # rejected credentials, or any failure during a health-check probe.
    unavailable: type[BaseException]
    # Raised when a regular (non-probe) call exceeds its time budget.
    timeout: type[BaseException]
    # Raised when a reply arrives but is not an acceptable JSON object.
    response: type[BaseException]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def bearer_headers(
|
|
58
|
+
api_key: str | None, *, extra: Mapping[str, str] | None = None
|
|
59
|
+
) -> dict[str, str]:
|
|
60
|
+
"""Build a header dict with optional ``Authorization: Bearer <key>``.
|
|
61
|
+
|
|
62
|
+
Returns an empty dict (plus any ``extra`` overrides) when ``api_key``
|
|
63
|
+
is falsy — matches the "open hosted Ollama" case where the header is
|
|
64
|
+
omitted entirely.
|
|
65
|
+
"""
|
|
66
|
+
headers: dict[str, str] = {}
|
|
67
|
+
if api_key:
|
|
68
|
+
headers["Authorization"] = f"Bearer {api_key}"
|
|
69
|
+
if extra:
|
|
70
|
+
headers.update(extra)
|
|
71
|
+
return headers
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _snippet(text: str | None) -> str:
    """Return at most ``_BODY_SNIPPET`` leading characters of ``text``.

    ``None`` and the empty string both yield ``""``.
    """
    if not text:
        return ""
    return text[:_BODY_SNIPPET]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def request_json(
    method: Method,
    url: str,
    *,
    timeout_s: float,
    errors: HttpErrors,
    label: str,
    base_url: str | None = None,
    json_body: Mapping[str, Any] | None = None,
    files: Mapping[str, Any] | None = None,
    data: Mapping[str, Any] | None = None,
    headers: Mapping[str, str] | None = None,
    api_key: str | None = None,
    required_keys: Sequence[str] = (),
    auth_to_unavailable: bool = True,
    health_check: bool = False,
) -> dict[str, Any]:
    """Send one HTTP request and hand back the decoded JSON object.

    Transport and payload failures are translated into the caller's
    family-specific exception classes taken from ``errors``.

    Args:
        method: ``"POST"`` goes through ``requests.post``; anything else
            goes through ``requests.get``. The functions are looked up by
            name so ``patch("requests.post", ...)`` in tests keeps working.
        url: The complete URL to call.
        timeout_s: Timeout passed to ``requests`` for this call.
        errors: The family's :class:`HttpErrors` bundle.
        label: Service name interpolated into error messages
            (e.g. ``"Ollama generate"``, ``"Mistral OCR"``).
        base_url: Address shown in connectivity messages; ``url`` is used
            when this is ``None``.
        json_body: Mapping sent as the JSON body (copied into a ``dict``).
        files: Multipart file parts (copied into a ``dict``).
        data: Form fields (copied into a ``dict``).
        headers: Extra headers, merged over the automatic
            ``Authorization`` header.
        api_key: Bearer token; falsy values omit the header.
        required_keys: Keys that must be present at the top level of the
            decoded object. The first absent one triggers an error.
        auth_to_unavailable: When True, a 401 or 403 status raises
            ``errors.unavailable`` with an "API key rejected" message.
        health_check: When True, timeouts, non-OK statuses and payload
            problems are all raised as ``errors.unavailable`` instead of
            ``errors.timeout`` / ``errors.response``.

    Returns:
        The decoded top-level JSON value, which is always a ``dict``.

    Raises:
        errors.unavailable: Connection errors, other
            ``requests.RequestException`` failures, 401/403 (when
            ``auth_to_unavailable``), and every failure in health-check
            mode.
        errors.timeout: ``requests.Timeout`` outside health-check mode.
        errors.response: Non-OK status, undecodable JSON, a non-object
            JSON value, or a missing required key outside health-check
            mode.
    """
    import requests  # noqa: PLC0415 — lazy: every model backend keeps `requests` optional

    call_kwargs: dict[str, Any] = {
        "headers": bearer_headers(api_key, extra=headers),
        "timeout": timeout_s,
    }
    # Optional payload parts are copied so the caller's mappings are not
    # handed to ``requests`` directly.
    for option, value in (("json", json_body), ("files", files), ("data", data)):
        if value is not None:
            call_kwargs[option] = dict(value)

    shown_base = url if base_url is None else base_url
    if method == "POST":
        send = requests.post
    else:
        send = requests.get

    # Probe mode reports every payload-level failure as "unavailable";
    # regular calls report them as response errors.
    failure = errors.unavailable if health_check else errors.response

    try:
        resp = send(url, **call_kwargs)
    except requests.Timeout as exc:
        if not health_check:
            raise errors.timeout(f"{label} exceeded {timeout_s}s budget at {url}") from exc
        raise errors.unavailable(
            f"{label} at {shown_base} did not respond within {timeout_s}s — is it reachable?"
        ) from exc
    except requests.ConnectionError as exc:
        raise errors.unavailable(f"Cannot connect to {label} at {shown_base}: {exc}") from exc
    except requests.RequestException as exc:
        raise errors.unavailable(f"{label} request failed: {exc}") from exc

    if auth_to_unavailable and resp.status_code in (401, 403):
        raise errors.unavailable(
            f"{label} API key rejected (HTTP {resp.status_code}): {_snippet(resp.text)}"
        )
    if not resp.ok:
        raise failure(f"HTTP {resp.status_code}: {_snippet(resp.text)}")

    try:
        payload = resp.json()
    except ValueError as exc:
        raise failure(f"Malformed JSON from {label}: {_snippet(resp.text)}") from exc

    if not isinstance(payload, dict):
        raise failure(f"{label} returned non-object JSON: {str(payload)[:_BODY_SNIPPET]}")

    for key in required_keys:
        if key in payload:
            continue
        raise failure(f"{label} response missing {key!r} key: {str(payload)[:_BODY_SNIPPET]}")

    return payload
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Shared device-detection helper for sentence-transformers-style backends.
|
|
2
|
+
|
|
3
|
+
Four call sites used to roll the same MPS → CUDA → CPU heuristic by
|
|
4
|
+
hand: :class:`SentenceTransformersEmbedder`, :class:`ClipLocalEmbedder`,
|
|
5
|
+
:class:`LocalWhisper`, and :class:`CrossEncoderReranker`. They now all
|
|
6
|
+
call :func:`detect_device`.
|
|
7
|
+
|
|
8
|
+
The single subtlety is :class:`LocalWhisper`: ``faster-whisper`` does
|
|
9
|
+
not yet support the MPS backend, so it disables the MPS branch via
|
|
10
|
+
``prefer_mps=False``.
|
|
11
|
+
|
|
12
|
+
``torch`` is imported lazily so this module is safe to import in
|
|
13
|
+
environments where the ML stack isn't installed (it returns ``"cpu"``
|
|
14
|
+
unconditionally in that case).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
__all__ = ["detect_device", "resolve_device"]
|
|
20
|
+
|
|
21
|
+
_AUTO = "auto"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def detect_device(*, prefer_mps: bool = True) -> str:
    """Pick the best available concrete device.

    Args:
        prefer_mps: When True (default), Apple Silicon's Metal backend is
            chosen if it is available. Pass False for libraries that do
            not support MPS.

    Returns:
        ``"mps"`` (only when ``prefer_mps`` is set and MPS is available),
        otherwise ``"cuda"`` when CUDA is available, otherwise ``"cpu"``.
        Also ``"cpu"`` when ``torch`` cannot be imported, so callers still
        work on hosts without the ML stack installed.
    """
    try:
        import torch  # noqa: PLC0415
    except ImportError:
        return "cpu"

    # ``torch.backends.mps`` is absent on older PyTorch builds; treat a
    # missing attribute as "MPS not available" instead of letting the
    # lookup raise AttributeError.
    mps_backend = getattr(torch.backends, "mps", None) if prefer_mps else None
    if mps_backend is not None and mps_backend.is_available():
        return "mps"
    if torch.cuda.is_available():
        return "cuda"
    return "cpu"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def resolve_device(device: str, *, prefer_mps: bool = True) -> str:
    """Turn a device setting into a concrete device name.

    The ``"auto"`` sentinel is replaced by the result of
    :func:`detect_device` (forwarding ``prefer_mps``); every other value
    is handed back untouched, so explicit choices such as ``"cpu"``,
    ``"cuda"`` or ``"mps"`` pass straight through.
    """
    if device != _AUTO:
        return device
    return detect_device(prefer_mps=prefer_mps)
|
|
File without changes
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Alembic migration environment.
|
|
2
|
+
|
|
3
|
+
Dialect-aware configuration:
|
|
4
|
+
- SQLite → render_as_batch=True (SQLite's ALTER TABLE support is limited)
|
|
5
|
+
- Postgres → version_table_schema="corpus"
|
|
6
|
+
|
|
7
|
+
All operator-facing messages go through the ``alembic.runtime.migration``
|
|
8
|
+
logger (stderr). No print() calls.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import os
|
|
15
|
+
|
|
16
|
+
from alembic import context
|
|
17
|
+
from sqlalchemy import engine_from_config, pool
|
|
18
|
+
|
|
19
|
+
log = logging.getLogger("alembic.runtime.migration")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _get_url() -> str:
|
|
23
|
+
"""Resolve the database URL from env var or alembic.ini config.
|
|
24
|
+
|
|
25
|
+
Priority:
|
|
26
|
+
1. ``DATABASE_URL`` environment variable (honoured by the legacy migrator)
|
|
27
|
+
2. ``CORPUS_FORGE_DATABASE_URL`` environment variable
|
|
28
|
+
3. ``sqlalchemy.url`` from the Alembic Config object
|
|
29
|
+
"""
|
|
30
|
+
url = os.environ.get("DATABASE_URL") or os.environ.get("CORPUS_FORGE_DATABASE_URL")
|
|
31
|
+
if url:
|
|
32
|
+
return url
|
|
33
|
+
cfg_url: str = context.config.get_main_option("sqlalchemy.url", default="")
|
|
34
|
+
return cfg_url
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def run_migrations_offline() -> None:
    """Run migrations in 'offline' mode.

    The context is configured with just a URL and no Engine, so calls to
    ``context.execute()`` emit the given SQL to the script output instead
    of running it against a database.

    For URLs that look like Postgres the version table is placed in the
    ``corpus`` schema; otherwise no schema is set.
    """
    url = _get_url()

    configure_kwargs: dict = {
        "url": url,
        "target_metadata": None,
        "literal_binds": True,
        "dialect_opts": {"paramstyle": "named"},
        "version_table": "alembic_version",
    }
    # Decide from the URL prefix whether this is Postgres; no connection
    # is available in offline mode to ask for the dialect.
    if url.startswith(("postgresql", "postgres")):
        configure_kwargs["version_table_schema"] = "corpus"

    # configure() has to run before begin_transaction(): the transaction
    # helper operates on the migration context that configure() creates,
    # and raises if none has been configured yet.
    context.configure(**configure_kwargs)

    with context.begin_transaction():
        context.run_migrations()
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def run_migrations_online() -> None:
    """Run migrations in 'online' mode, i.e. over a live connection.

    If the Alembic config carries a ``creator`` attribute, the engine is
    built around that connection factory with a SQLite URL; otherwise the
    engine comes from the ini section with the URL from :func:`_get_url`.
    SQLite connections get batch rendering; every other dialect keeps its
    version table in the ``corpus`` schema.
    """
    creator = context.config.attributes.get("creator")
    if creator is None:
        ini_section = context.config.get_section(context.config.config_ini_section, {})
        connectable = engine_from_config(
            ini_section,
            prefix="sqlalchemy.",
            poolclass=pool.NullPool,
            url=_get_url() or None,
        )
    else:
        # In-memory SQLite: connections come from the backend's own
        # factory so Alembic works on the same in-memory database as the
        # SQLiteBackend instance. The URL is passed alongside the factory
        # and names the SQLite driver.
        connectable = engine_from_config(
            {},
            prefix="sqlalchemy.",
            poolclass=pool.NullPool,
            creator=creator,
            url="sqlite+pysqlite://",
        )

    with connectable.connect() as connection:
        settings: dict = {
            "connection": connection,
            "version_table": "alembic_version",
            "target_metadata": None,
        }
        if connection.dialect.name == "sqlite":
            settings["render_as_batch"] = True
        else:
            # Postgres (and any other dialect): version table lives in
            # the corpus schema.
            settings["version_table_schema"] = "corpus"
        context.configure(**settings)

        with context.begin_transaction():
            context.run_migrations()
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Entry point: dispatch to the offline or online runner, but only when this
# module is being executed by Alembic. Asking the context for its mode fails
# when no Alembic environment is active (e.g. a plain
# ``import corpus_forge.alembic.env`` in tests); that failure is treated as
# "not under Alembic" and nothing is run.
try:
    _offline = context.is_offline_mode()
except Exception:
    # Plain import outside Alembic's migration framework: do nothing.
    pass
else:
    (run_migrations_offline if _offline else run_migrations_online)()
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""${message} # noqa: D
|
|
2
|
+
|
|
3
|
+
Revision ID: ${up_revision}
|
|
4
|
+
Revises: ${down_revision | comma,n}
|
|
5
|
+
Create Date: ${create_date}
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Union
|
|
11
|
+
|
|
12
|
+
from alembic import op
|
|
13
|
+
import sqlalchemy as sa
|
|
14
|
+
${imports if imports else ""}
|
|
15
|
+
|
|
16
|
+
# revision identifiers, used by Alembic.
|
|
17
|
+
revision: str = ${repr(up_revision)}
|
|
18
|
+
down_revision: Union[str, None] = ${repr(down_revision)}
|
|
19
|
+
branch_labels: Union[str, tuple[str, ...], None] = ${repr(branch_labels)}
|
|
20
|
+
depends_on: Union[str, tuple[str, ...], None] = ${repr(depends_on)}
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def upgrade() -> None:
|
|
24
|
+
"""Apply forward migrations."""
|
|
25
|
+
${upgrades if upgrades else "pass"}
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def downgrade() -> None:
|
|
29
|
+
"""Apply reverse migrations."""
|
|
30
|
+
${downgrades if downgrades else "pass"}
|
|
File without changes
|