xrtm-data 0.2.4__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/PKG-INFO +1 -1
  2. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/pyproject.toml +1 -1
  3. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/__init__.py +4 -1
  4. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/__init__.py +2 -2
  5. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/interfaces.py +26 -5
  6. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/forecast.py +9 -1
  7. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/trade.py +36 -2
  8. xrtm_data-0.2.6/src/xrtm/data/corpora/__init__.py +99 -0
  9. xrtm_data-0.2.6/src/xrtm/data/corpora/_builtin_corpora.py +185 -0
  10. xrtm_data-0.2.6/src/xrtm/data/corpora/forecast_importer.py +517 -0
  11. xrtm_data-0.2.6/src/xrtm/data/corpora/importers.py +296 -0
  12. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/corpora/real_binary.py +14 -2
  13. xrtm_data-0.2.6/src/xrtm/data/corpora/registry.py +359 -0
  14. xrtm_data-0.2.6/src/xrtm/data/corpora/splits.py +261 -0
  15. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/local/csv.py +15 -4
  16. xrtm_data-0.2.6/src/xrtm/data/providers/online/polymarket.py +256 -0
  17. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/subgraph/polymarket.py +61 -7
  18. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/version.py +1 -1
  19. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/PKG-INFO +1 -1
  20. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/SOURCES.txt +9 -0
  21. xrtm_data-0.2.6/tests/test_corpus_importers.py +270 -0
  22. xrtm_data-0.2.6/tests/test_corpus_registry.py +281 -0
  23. xrtm_data-0.2.6/tests/test_corpus_splits.py +278 -0
  24. xrtm_data-0.2.6/tests/test_forecast_importer.py +302 -0
  25. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_local_datasource.py +14 -0
  26. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_polymarket_source.py +52 -3
  27. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_polymarket_subgraph.py +78 -0
  28. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_schemas.py +50 -2
  29. xrtm_data-0.2.4/src/xrtm/data/corpora/__init__.py +0 -36
  30. xrtm_data-0.2.4/src/xrtm/data/providers/online/polymarket.py +0 -167
  31. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/LICENSE +0 -0
  32. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/README.md +0 -0
  33. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/setup.cfg +0 -0
  34. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/cli/__init__.py +0 -0
  35. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/__init__.py +0 -0
  36. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/prior.py +0 -0
  37. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/kit/__init__.py +0 -0
  38. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/kit/processors/__init__.py +0 -0
  39. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/__init__.py +0 -0
  40. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/local/__init__.py +0 -0
  41. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/online/__init__.py +0 -0
  42. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/subgraph/__init__.py +0 -0
  43. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/dependency_links.txt +0 -0
  44. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/entry_points.txt +0 -0
  45. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/requires.txt +0 -0
  46. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/top_level.txt +0 -0
  47. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_beta_fitter.py +0 -0
  48. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_cli_loading.py +0 -0
  49. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_cli_ux.py +0 -0
  50. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_prior_schemas.py +0 -0
  51. {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_real_binary_corpus.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xrtm-data
3
- Version: 0.2.4
3
+ Version: 0.2.6
4
4
  Summary: The Snapshot Vault for XRTM.
5
5
  Author-email: XRTM Team <moy@xrtm.org>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "xrtm-data"
7
- version = "0.2.4"
7
+ version = "0.2.6"
8
8
  description = "The Snapshot Vault for XRTM."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11,<3.13"
@@ -31,7 +31,7 @@ Example:
31
31
  """
32
32
 
33
33
  # Core interfaces
34
- from xrtm.data.core import DataSource
34
+ from xrtm.data.core import DataSource, DataSourceError, SourceFetchError, SourceTemporalIntegrityError
35
35
 
36
36
  # Core schemas (public API)
37
37
  from xrtm.data.core.schemas import (
@@ -46,6 +46,9 @@ from xrtm.data.core.schemas import (
46
46
  __all__ = [
47
47
  # Interfaces
48
48
  "DataSource",
49
+ "DataSourceError",
50
+ "SourceFetchError",
51
+ "SourceTemporalIntegrityError",
49
52
  # Schemas
50
53
  "MetadataBase",
51
54
  "ForecastQuestion",
@@ -21,6 +21,6 @@ providers must implement. The core module is domain-agnostic and MUST NOT
21
21
  import from kit/ or providers/.
22
22
  """
23
23
 
24
- from xrtm.data.core.interfaces import DataSource
24
+ from xrtm.data.core.interfaces import DataSource, DataSourceError, SourceFetchError, SourceTemporalIntegrityError
25
25
 
26
- __all__ = ["DataSource"]
26
+ __all__ = ["DataSource", "DataSourceError", "SourceFetchError", "SourceTemporalIntegrityError"]
@@ -23,18 +23,31 @@ data provider, regardless of the source.
23
23
  Example:
24
24
  >>> from xrtm.data.core import DataSource
25
25
  >>> class MySource(DataSource):
26
- ... async def fetch_questions(self, query=None, limit=5):
26
+ ... async def fetch_questions(self, query=None, limit=5, *, snapshot_time=None):
27
27
  ... return []
28
28
  ... async def get_question_by_id(self, question_id):
29
29
  ... return None
30
30
  """
31
31
 
32
32
  import abc
33
+ from datetime import datetime
33
34
  from typing import List, Optional
34
35
 
35
36
  from xrtm.data.core.schemas.forecast import ForecastQuestion
36
37
 
37
38
 
39
class DataSourceError(RuntimeError):
    r"""
    Root of the data source exception hierarchy.

    Derives from ``RuntimeError`` and adds no behavior of its own; it exists so
    callers can catch one type for any failure reported by a data source.
    """
41
+
42
+
43
class SourceFetchError(DataSourceError):
    r"""
    Signals that a provider could not fetch or decode its source data.

    Subclass of ``DataSourceError``, so handlers for the base type also catch it.
    """
45
+
46
+
47
class SourceTemporalIntegrityError(DataSourceError):
    r"""
    Signals that a provider cannot safely satisfy a requested snapshot.

    Subclass of ``DataSourceError``, so handlers for the base type also catch it.
    """
49
+
50
+
38
51
  class DataSource(abc.ABC):
39
52
  r"""
40
53
  Abstract interface for gathering or streaming forecasting workloads.
@@ -47,18 +60,22 @@ class DataSource(abc.ABC):
47
60
 
48
61
  Example:
49
62
  >>> class LocalSource(DataSource):
50
- ... async def fetch_questions(self, query=None, limit=5):
63
+ ... async def fetch_questions(self, query=None, limit=5, *, snapshot_time=None):
51
64
  ... return [ForecastQuestion(id="1", title="Test")]
52
65
  """
53
66
 
54
67
  @abc.abstractmethod
55
- async def fetch_questions(self, query: Optional[str] = None, limit: int = 5) -> List[ForecastQuestion]:
68
+ async def fetch_questions(
69
+ self, query: Optional[str] = None, limit: int = 5, *, snapshot_time: Optional[datetime] = None
70
+ ) -> List[ForecastQuestion]:
56
71
  r"""
57
72
  Fetch a list of forecast questions from the data source.
58
73
 
59
74
  Args:
60
75
  query: Optional search/filter string.
61
76
  limit: Maximum number of questions to return.
77
+ snapshot_time: Optional end-of-history timestamp. Providers that cannot
78
+ satisfy historical snapshots must surface a temporal integrity error.
62
79
 
63
80
  Returns:
64
81
  List of ForecastQuestion objects matching the criteria.
@@ -66,12 +83,16 @@ class DataSource(abc.ABC):
66
83
  pass
67
84
 
68
85
  @abc.abstractmethod
69
- async def get_question_by_id(self, question_id: str) -> Optional[ForecastQuestion]:
86
+ async def get_question_by_id(
87
+ self, question_id: str, *, snapshot_time: Optional[datetime] = None
88
+ ) -> Optional[ForecastQuestion]:
70
89
  r"""
71
90
  Retrieve a single question by its unique identifier.
72
91
 
73
92
  Args:
74
93
  question_id: The unique identifier of the question.
94
+ snapshot_time: Optional end-of-history timestamp. Providers that cannot
95
+ satisfy historical snapshots must surface a temporal integrity error.
75
96
 
76
97
  Returns:
77
98
  The ForecastQuestion if found, None otherwise.
@@ -79,4 +100,4 @@ class DataSource(abc.ABC):
79
100
  pass
80
101
 
81
102
 
82
- __all__ = ["DataSource"]
103
+ __all__ = ["DataSource", "DataSourceError", "SourceFetchError", "SourceTemporalIntegrityError"]
@@ -28,7 +28,7 @@ Example:
28
28
  from datetime import datetime, timezone
29
29
  from typing import Any, Dict, List, Optional
30
30
 
31
- from pydantic import AliasChoices, BaseModel, ConfigDict, Field, model_validator
31
+ from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator, model_validator
32
32
 
33
33
 
34
34
  class MetadataBase(BaseModel):
@@ -66,6 +66,14 @@ class MetadataBase(BaseModel):
66
66
  source_version: Optional[str] = Field(None, description="Version of the data source")
67
67
  raw_data: Optional[Dict[str, Any]] = Field(None, description="Original unprocessed data")
68
68
 
69
@field_validator("created_at", "snapshot_time", mode="after")
@classmethod
def _normalize_temporal_fields(cls, value: Optional[datetime]) -> Optional[datetime]:
    r"""
    Store temporal boundary fields as timezone-aware UTC datetimes.

    Args:
        value: The field value after pydantic's own validation (``mode="after"``).

    Returns:
        ``None`` unchanged; a naive datetime tagged as UTC without shifting its
        clock reading; an aware datetime converted to UTC.
    """
    # NOTE(review): the field declarations are outside this view. If either field
    # accepts an explicit None, dereferencing ``value.tzinfo`` would raise
    # AttributeError, so pass None through untouched.
    if value is None:
        return None
    if value.tzinfo is None:
        # Legacy naive inputs are interpreted as already being in UTC.
        return value.replace(tzinfo=timezone.utc)
    return value.astimezone(timezone.utc)
76
+
69
77
  def get(self, key: str, default: Any = None) -> Any:
70
78
  r"""Backward compatibility for dict-like access."""
71
79
  return getattr(self, key, default)
@@ -31,10 +31,17 @@ Example:
31
31
  ... )
32
32
  """
33
33
 
34
- from datetime import datetime
34
+ from datetime import datetime, timezone
35
35
  from typing import Optional
36
36
 
37
- from pydantic import BaseModel, Field
37
+ from pydantic import BaseModel, Field, field_validator, model_validator
38
+
39
+
40
def _as_utc(value: datetime) -> datetime:
    r"""
    Return ``value`` as a timezone-aware UTC datetime.

    Naive datetimes are accepted for legacy callers and are labelled as UTC
    without changing their clock reading; aware datetimes are converted to UTC.
    """
    is_naive = value.tzinfo is None
    return value.replace(tzinfo=timezone.utc) if is_naive else value.astimezone(timezone.utc)
38
45
 
39
46
 
40
47
  class TradeEvent(BaseModel):
@@ -98,6 +105,12 @@ class TradeEvent(BaseModel):
98
105
  description="Transaction hash for verification",
99
106
  )
100
107
 
108
@field_validator("timestamp", mode="after")
@classmethod
def _normalize_timestamp(cls, value: datetime) -> datetime:
    r"""
    Coerce the trade timestamp to UTC so window comparisons stay stable.

    Delegates to ``_as_utc``.
    """
    utc_timestamp = _as_utc(value)
    return utc_timestamp
113
+
101
114
  @property
102
115
  def yes_weight(self) -> float:
103
116
  r"""Volume-weighted contribution to Yes outcome: price × amount."""
@@ -149,6 +162,27 @@ class TradeWindow(BaseModel):
149
162
  description="Identifier for the market these trades belong to",
150
163
  )
151
164
 
165
@field_validator("start_time", "end_time", mode="after")
@classmethod
def _normalize_window_boundary(cls, value: datetime) -> datetime:
    r"""
    Coerce a window boundary to UTC before the leakage invariants are enforced.

    Applied to both ``start_time`` and ``end_time``; delegates to ``_as_utc``.
    """
    utc_boundary = _as_utc(value)
    return utc_boundary
170
+
171
@model_validator(mode="after")
def _validate_temporal_bounds(self) -> "TradeWindow":
    r"""
    Reject windows whose bounds are inverted or whose trades fall outside them.

    Returns:
        The validated window itself.

    Raises:
        ValueError: If ``end_time`` precedes ``start_time``, or if any trade
            timestamp lies before ``start_time`` or after ``end_time``.
    """
    window_start, window_end = self.start_time, self.end_time
    if window_start > window_end:
        raise ValueError("end_time must not precede start_time")

    # Both bounds are inclusive; a single out-of-range trade rejects the window.
    if any(not (window_start <= trade.timestamp <= window_end) for trade in self.trades):
        raise ValueError("trades must fall within [start_time, end_time]")
    return self
185
+
152
186
  @property
153
187
  def total_volume(self) -> float:
154
188
  r"""Total trading volume in the window."""
@@ -0,0 +1,99 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""Public entry points for XRTM corpora.
17
+
18
+ The package-root API intentionally stays focused on the stable registry and the
19
+ embedded real-binary corpus. Less-stable helpers remain available from their
20
+ submodules and are kept here as lazy compatibility exports.
21
+
22
+ Example:
23
+ >>> from xrtm.data.corpora import get_corpus, list_available_corpora
24
+ >>> corpus = get_corpus("xrtm-real-binary-v1")
25
+ >>> metadata_list = list_available_corpora(release_gate_only=True)
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ from importlib import import_module
31
+
32
+ from xrtm.data.corpora.forecast_importer import FORECAST_CORPUS_ID
33
+ from xrtm.data.corpora.real_binary import (
34
+ REAL_BINARY_CORPUS_ID,
35
+ RealBinaryCorpusSource,
36
+ RealBinaryQuestionRecord,
37
+ load_real_binary_corpus,
38
+ load_real_binary_questions,
39
+ load_real_binary_resolved_outcomes,
40
+ validate_real_binary_corpus,
41
+ )
42
+ from xrtm.data.corpora.registry import (
43
+ CorpusAvailability,
44
+ CorpusManifest,
45
+ CorpusMetadata,
46
+ CorpusRegistry,
47
+ CorpusSplit,
48
+ CorpusTier,
49
+ LicenseType,
50
+ describe_corpus,
51
+ get_corpus,
52
+ get_corpus_metadata,
53
+ list_available_corpora,
54
+ prepare_corpus,
55
+ )
56
+
57
+ __all__ = [
58
+ "REAL_BINARY_CORPUS_ID",
59
+ "FORECAST_CORPUS_ID",
60
+ "RealBinaryQuestionRecord",
61
+ "RealBinaryCorpusSource",
62
+ "load_real_binary_corpus",
63
+ "load_real_binary_questions",
64
+ "load_real_binary_resolved_outcomes",
65
+ "validate_real_binary_corpus",
66
+ "CorpusRegistry",
67
+ "CorpusAvailability",
68
+ "CorpusMetadata",
69
+ "CorpusManifest",
70
+ "CorpusTier",
71
+ "LicenseType",
72
+ "CorpusSplit",
73
+ "describe_corpus",
74
+ "get_corpus",
75
+ "get_corpus_metadata",
76
+ "list_available_corpora",
77
+ "prepare_corpus",
78
+ ]
79
+
80
+ _COMPAT_EXPORTS = {
81
+ "FORECAST_HF_DATASET": ("xrtm.data.corpora.forecast_importer", "FORECAST_HF_DATASET"),
82
+ "FOReCAstImporter": ("xrtm.data.corpora.forecast_importer", "FOReCAstImporter"),
83
+ "CorpusImporter": ("xrtm.data.corpora.importers", "CorpusImporter"),
84
+ "ImportManifest": ("xrtm.data.corpora.importers", "ImportManifest"),
85
+ "OfflineCorpusCache": ("xrtm.data.corpora.importers", "OfflineCorpusCache"),
86
+ "DeterministicFixtureImporter": ("xrtm.data.corpora.importers", "DeterministicFixtureImporter"),
87
+ "SplitConfig": ("xrtm.data.corpora.splits", "SplitConfig"),
88
+ "CorpusSplitter": ("xrtm.data.corpora.splits", "CorpusSplitter"),
89
+ "SplitAwareCorpusSource": ("xrtm.data.corpora.splits", "SplitAwareCorpusSource"),
90
+ }
91
+
92
def __getattr__(name: str):
    """Resolve a lazy compatibility export on first attribute access.

    Names listed in ``_COMPAT_EXPORTS`` are looked up by importing their home
    module on demand; any other name raises ``AttributeError``.
    """
    target = _COMPAT_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_name, attr_name = target
    return getattr(import_module(module_name), attr_name)
97
+
98
def __dir__() -> list[str]:
    """List the eager public names together with the lazy compatibility exports, sorted."""
    advertised = {*__all__, *_COMPAT_EXPORTS}
    return sorted(advertised)
@@ -0,0 +1,185 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""Concrete built-in corpus registrations.
17
+
18
+ This module keeps registry bootstrap and corpus-specific cache workflows out of
19
+ ``registry.py`` so the registry can stay focused on generic manifest handling.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import os
25
+ import warnings
26
+ from pathlib import Path
27
+ from typing import List, Optional
28
+
29
+ from xrtm.data.core import DataSource
30
+ from xrtm.data.corpora.forecast_importer import FORECAST_CORPUS_ID, FOReCAstImporter
31
+ from xrtm.data.corpora.importers import OfflineCorpusCache
32
+ from xrtm.data.corpora.real_binary import REAL_BINARY_CORPUS_ID, RealBinaryCorpusSource
33
+ from xrtm.data.corpora.registry import (
34
+ CorpusAvailability,
35
+ CorpusManifest,
36
+ CorpusMetadata,
37
+ CorpusSplit,
38
+ CorpusTier,
39
+ LicenseType,
40
+ )
41
+
42
+ _FORECAST_VERSION = "1.0"
43
+
44
def build_builtin_manifests() -> List[CorpusManifest]:
    """Return the manifests registered at bootstrap: real-binary first, then FOReCAst."""
    manifest_builders = (_build_real_binary_manifest, _build_forecast_manifest)
    return [build() for build in manifest_builders]
50
+
51
def _build_real_binary_manifest() -> CorpusManifest:
    """Assemble the manifest for the bundled real-binary corpus.

    The corpus is registered as Tier 1, release-gate approved and bundled, and
    exposes only the FULL split.
    """
    corpus_id = REAL_BINARY_CORPUS_ID
    return CorpusManifest(
        corpus_id=corpus_id,
        metadata=CorpusMetadata(
            corpus_id=corpus_id,
            name="XRTM Real Binary v1",
            tier=CorpusTier.TIER_1,
            license_type=LicenseType.APACHE_2_0,
            description="Minimal deterministic real-world binary question corpus for CI smoke tests",
            version="1.0",
            release_gate_approved=True,
            bundled=True,
            size_estimate=25,
            tags=["binary", "deterministic", "embedded", "seed-corpus"],
            provenance_url="https://github.com/xrtm/xrtm",
            license_url="https://www.apache.org/licenses/LICENSE-2.0",
        ),
        # Kept as a lambda so the source is only instantiated when the loader runs.
        loader_fn=lambda: RealBinaryCorpusSource(),
        available_splits=[CorpusSplit.FULL],
        default_split=CorpusSplit.FULL,
    )
74
+
75
def _build_forecast_manifest() -> CorpusManifest:
    """Assemble the manifest for the external FOReCAst corpus.

    The corpus is registered as Tier 2, not release-gate approved and not
    bundled. The loader, availability and prepare callbacks all read the
    version from the metadata object when they are invoked.
    """
    metadata = CorpusMetadata(
        corpus_id=FORECAST_CORPUS_ID,
        name="FOReCAst (Future Outcome Reasoning and Confidence Assessment)",
        tier=CorpusTier.TIER_2,
        license_type=LicenseType.MIT,
        description=(
            "Academic benchmark dataset for probabilistic forecasting from NeurIPS 2025. "
            "1,390 resolved questions from Metaculus. Evaluation-only until Tier 1 approval."
        ),
        version=_FORECAST_VERSION,
        release_gate_approved=False,
        bundled=False,
        size_estimate=1390,
        tags=["forecast", "external", "evaluation-only", "probabilistic"],
        provenance_url="https://huggingface.co/datasets/MoyYuan/FOReCAst",
        license_url="https://opensource.org/licenses/MIT",
        citation="FOReCAst: Future Outcome Reasoning and Confidence Assessment. NeurIPS 2025 Datasets and Benchmarks Track.",
        extra={
            "tier_status": "evaluation-only",
            "promotion_required": "explicit approval needed for Tier 1 promotion",
            "non_commercial_clause": "pending clarification",
        },
    )

    def _load() -> DataSource:
        # Version is read at call time, not captured at manifest construction.
        return _load_forecast_source(metadata.version)

    def _availability(cache_root):
        return _describe_forecast_corpus(metadata.version, cache_root=cache_root)

    def _prepare(cache_root, refresh, use_hf_datasets):
        return _prepare_forecast_corpus(
            metadata.version,
            cache_root=cache_root,
            refresh=refresh,
            use_hf_datasets=use_hf_datasets,
        )

    return CorpusManifest(
        corpus_id=FORECAST_CORPUS_ID,
        metadata=metadata,
        loader_fn=_load,
        available_splits=[CorpusSplit.FULL, CorpusSplit.TRAIN, CorpusSplit.EVAL],
        default_split=CorpusSplit.FULL,
        importer_module="xrtm.data.corpora.forecast_importer",
        availability_loader=_availability,
        prepare_loader=_prepare,
    )
116
+
117
def _describe_forecast_corpus(
    version: str,
    *,
    cache_root: Optional[Path] = None,
) -> CorpusAvailability:
    """Report what is cached on disk for the FOReCAst corpus at ``version``.

    Args:
        version: Corpus version whose cache entry is inspected.
        cache_root: Cache directory override; resolved via ``_resolve_cache_root``.

    Returns:
        A ``CorpusAvailability`` whose ``source_mode`` is ``"preview"`` when no
        manifest exists or the manifest was produced by the fixture import, and
        ``"external-cache"`` otherwise.
    """
    root = _resolve_cache_root(cache_root)
    cache = OfflineCorpusCache(root)
    manifest = cache.load_manifest(FORECAST_CORPUS_ID, version)
    is_cached = manifest is not None

    import_method = None
    if is_cached:
        import_method = manifest.metadata.get("import_method")

    if import_method in {None, "fixture"}:
        source_mode = "preview"
    else:
        source_mode = "external-cache"

    record_count = manifest.record_count if is_cached else None

    return CorpusAvailability(
        corpus_id=FORECAST_CORPUS_ID,
        version=version,
        source_mode=source_mode,
        bundled=False,
        already_cached=is_cached,
        record_count=record_count,
        import_method=import_method,
        cache_root=root,
        data_dir=cache.get_corpus_dir(FORECAST_CORPUS_ID, version),
        manifest_path=cache.get_manifest_path(FORECAST_CORPUS_ID, version),
    )
140
+
141
def _prepare_forecast_corpus(
    version: str,
    *,
    cache_root: Optional[Path] = None,
    refresh: bool = False,
    use_hf_datasets: bool = True,
) -> CorpusAvailability:
    """Ensure the FOReCAst cache is populated, then describe it.

    Args:
        version: Corpus version to prepare.
        cache_root: Cache directory override; resolved via ``_resolve_cache_root``.
        refresh: Re-import even when the cache already holds this version.
        use_hf_datasets: Forwarded to ``FOReCAstImporter``.

    Returns:
        The availability report produced after any import has been saved.
    """
    root = _resolve_cache_root(cache_root)
    cache = OfflineCorpusCache(root)

    already_cached = cache.is_cached(FORECAST_CORPUS_ID, version)
    if refresh or not already_cached:
        importer = FOReCAstImporter(use_hf_datasets=use_hf_datasets)
        target_dir = cache.get_corpus_dir(FORECAST_CORPUS_ID, version)
        cache.save_manifest(importer.import_corpus(target_dir, version=version))

    return _describe_forecast_corpus(version, cache_root=root)
158
+
159
def _load_forecast_source(version: str) -> DataSource:
    """Load the FOReCAst corpus from the offline cache.

    When no manifest is cached, a warning is issued and the importer (created
    with ``use_hf_datasets=False``) writes one into the cache first. When the
    cached manifest came from the fixture import, only a warning is issued.

    Args:
        version: Corpus version to load.

    Returns:
        The data source built by the importer from the manifest.
    """
    importer = FOReCAstImporter(use_hf_datasets=False)
    cache = OfflineCorpusCache(_resolve_cache_root())
    data_dir = cache.get_corpus_dir(FORECAST_CORPUS_ID, version)
    manifest = cache.load_manifest(FORECAST_CORPUS_ID, version)
    cache_missing = manifest is None

    notice = None
    if cache_missing:
        notice = (
            "FOReCAst full cache not found; using the 3-record deterministic preview. "
            "Prepare the corpus cache first to run large-scale validation."
        )
    elif manifest.metadata.get("import_method") == "fixture":
        notice = (
            "FOReCAst cache currently contains the deterministic preview only. "
            "Refresh the cache with the external dataset before relying on large-scale counts."
        )

    if notice is not None:
        # Warn from this frame (not a helper) so stacklevel=4 keeps its meaning.
        warnings.warn(notice, UserWarning, stacklevel=4)

    if cache_missing:
        manifest = importer.import_corpus(data_dir, version=version)
        cache.save_manifest(manifest)

    return importer.load_from_manifest(manifest, data_dir)
181
+
182
def _resolve_cache_root(cache_root: Optional[Path] = None) -> Path:
    """Pick the corpus cache directory.

    Precedence: an explicit ``cache_root`` argument, then a non-empty
    ``XRTM_CORPUS_CACHE`` environment variable, then
    ``~/.xrtm/corpus-cache``.

    Args:
        cache_root: Explicit cache directory; returned unchanged when given.

    Returns:
        The resolved cache root. No directory is created here.
    """
    if cache_root is not None:
        return cache_root

    configured = os.environ.get("XRTM_CORPUS_CACHE")
    if configured:
        return Path(configured)

    # Only consult the home directory when it is actually needed: Path.home()
    # can raise when no home can be determined, which must not break callers
    # that configured the cache through the environment. An empty variable is
    # treated as unset instead of silently resolving to the current directory.
    return Path.home() / ".xrtm" / "corpus-cache"