xrtm-data 0.2.4__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/PKG-INFO +1 -1
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/pyproject.toml +1 -1
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/__init__.py +4 -1
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/__init__.py +2 -2
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/interfaces.py +26 -5
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/forecast.py +9 -1
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/trade.py +36 -2
- xrtm_data-0.2.6/src/xrtm/data/corpora/__init__.py +99 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/_builtin_corpora.py +185 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/forecast_importer.py +517 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/importers.py +296 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/corpora/real_binary.py +14 -2
- xrtm_data-0.2.6/src/xrtm/data/corpora/registry.py +359 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/splits.py +261 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/local/csv.py +15 -4
- xrtm_data-0.2.6/src/xrtm/data/providers/online/polymarket.py +256 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/subgraph/polymarket.py +61 -7
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/version.py +1 -1
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/PKG-INFO +1 -1
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/SOURCES.txt +9 -0
- xrtm_data-0.2.6/tests/test_corpus_importers.py +270 -0
- xrtm_data-0.2.6/tests/test_corpus_registry.py +281 -0
- xrtm_data-0.2.6/tests/test_corpus_splits.py +278 -0
- xrtm_data-0.2.6/tests/test_forecast_importer.py +302 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_local_datasource.py +14 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_polymarket_source.py +52 -3
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_polymarket_subgraph.py +78 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_schemas.py +50 -2
- xrtm_data-0.2.4/src/xrtm/data/corpora/__init__.py +0 -36
- xrtm_data-0.2.4/src/xrtm/data/providers/online/polymarket.py +0 -167
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/LICENSE +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/README.md +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/setup.cfg +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/cli/__init__.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/__init__.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/prior.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/kit/__init__.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/kit/processors/__init__.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/__init__.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/local/__init__.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/online/__init__.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm/data/providers/subgraph/__init__.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/dependency_links.txt +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/entry_points.txt +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/requires.txt +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/top_level.txt +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_beta_fitter.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_cli_loading.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_cli_ux.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_prior_schemas.py +0 -0
- {xrtm_data-0.2.4 → xrtm_data-0.2.6}/tests/test_real_binary_corpus.py +0 -0
|
@@ -31,7 +31,7 @@ Example:
|
|
|
31
31
|
"""
|
|
32
32
|
|
|
33
33
|
# Core interfaces
|
|
34
|
-
from xrtm.data.core import DataSource
|
|
34
|
+
from xrtm.data.core import DataSource, DataSourceError, SourceFetchError, SourceTemporalIntegrityError
|
|
35
35
|
|
|
36
36
|
# Core schemas (public API)
|
|
37
37
|
from xrtm.data.core.schemas import (
|
|
@@ -46,6 +46,9 @@ from xrtm.data.core.schemas import (
|
|
|
46
46
|
__all__ = [
|
|
47
47
|
# Interfaces
|
|
48
48
|
"DataSource",
|
|
49
|
+
"DataSourceError",
|
|
50
|
+
"SourceFetchError",
|
|
51
|
+
"SourceTemporalIntegrityError",
|
|
49
52
|
# Schemas
|
|
50
53
|
"MetadataBase",
|
|
51
54
|
"ForecastQuestion",
|
|
@@ -21,6 +21,6 @@ providers must implement. The core module is domain-agnostic and MUST NOT
|
|
|
21
21
|
import from kit/ or providers/.
|
|
22
22
|
"""
|
|
23
23
|
|
|
24
|
-
from xrtm.data.core.interfaces import DataSource
|
|
24
|
+
from xrtm.data.core.interfaces import DataSource, DataSourceError, SourceFetchError, SourceTemporalIntegrityError
|
|
25
25
|
|
|
26
|
-
__all__ = ["DataSource"]
|
|
26
|
+
__all__ = ["DataSource", "DataSourceError", "SourceFetchError", "SourceTemporalIntegrityError"]
|
|
@@ -23,18 +23,31 @@ data provider, regardless of the source.
|
|
|
23
23
|
Example:
|
|
24
24
|
>>> from xrtm.data.core import DataSource
|
|
25
25
|
>>> class MySource(DataSource):
|
|
26
|
-
... async def fetch_questions(self, query=None, limit=5):
|
|
26
|
+
... async def fetch_questions(self, query=None, limit=5, *, snapshot_time=None):
|
|
27
27
|
... return []
|
|
28
28
|
... async def get_question_by_id(self, question_id):
|
|
29
29
|
... return None
|
|
30
30
|
"""
|
|
31
31
|
|
|
32
32
|
import abc
|
|
33
|
+
from datetime import datetime
|
|
33
34
|
from typing import List, Optional
|
|
34
35
|
|
|
35
36
|
from xrtm.data.core.schemas.forecast import ForecastQuestion
|
|
36
37
|
|
|
37
38
|
|
|
39
|
+
class DataSourceError(RuntimeError):
    r"""Root of the data-source exception hierarchy; derives from ``RuntimeError``."""


class SourceFetchError(DataSourceError):
    r"""Signals that a provider could not fetch or decode its source data."""


class SourceTemporalIntegrityError(DataSourceError):
    r"""Signals that a provider could not safely satisfy a requested snapshot."""
|
|
49
|
+
|
|
50
|
+
|
|
38
51
|
class DataSource(abc.ABC):
|
|
39
52
|
r"""
|
|
40
53
|
Abstract interface for gathering or streaming forecasting workloads.
|
|
@@ -47,18 +60,22 @@ class DataSource(abc.ABC):
|
|
|
47
60
|
|
|
48
61
|
Example:
|
|
49
62
|
>>> class LocalSource(DataSource):
|
|
50
|
-
... async def fetch_questions(self, query=None, limit=5):
|
|
63
|
+
... async def fetch_questions(self, query=None, limit=5, *, snapshot_time=None):
|
|
51
64
|
... return [ForecastQuestion(id="1", title="Test")]
|
|
52
65
|
"""
|
|
53
66
|
|
|
54
67
|
@abc.abstractmethod
|
|
55
|
-
async def fetch_questions(
|
|
68
|
+
async def fetch_questions(
|
|
69
|
+
self, query: Optional[str] = None, limit: int = 5, *, snapshot_time: Optional[datetime] = None
|
|
70
|
+
) -> List[ForecastQuestion]:
|
|
56
71
|
r"""
|
|
57
72
|
Fetch a list of forecast questions from the data source.
|
|
58
73
|
|
|
59
74
|
Args:
|
|
60
75
|
query: Optional search/filter string.
|
|
61
76
|
limit: Maximum number of questions to return.
|
|
77
|
+
snapshot_time: Optional end-of-history timestamp. Providers that cannot
|
|
78
|
+
satisfy historical snapshots must surface a temporal integrity error.
|
|
62
79
|
|
|
63
80
|
Returns:
|
|
64
81
|
List of ForecastQuestion objects matching the criteria.
|
|
@@ -66,12 +83,16 @@ class DataSource(abc.ABC):
|
|
|
66
83
|
pass
|
|
67
84
|
|
|
68
85
|
@abc.abstractmethod
|
|
69
|
-
async def get_question_by_id(
|
|
86
|
+
async def get_question_by_id(
|
|
87
|
+
self, question_id: str, *, snapshot_time: Optional[datetime] = None
|
|
88
|
+
) -> Optional[ForecastQuestion]:
|
|
70
89
|
r"""
|
|
71
90
|
Retrieve a single question by its unique identifier.
|
|
72
91
|
|
|
73
92
|
Args:
|
|
74
93
|
question_id: The unique identifier of the question.
|
|
94
|
+
snapshot_time: Optional end-of-history timestamp. Providers that cannot
|
|
95
|
+
satisfy historical snapshots must surface a temporal integrity error.
|
|
75
96
|
|
|
76
97
|
Returns:
|
|
77
98
|
The ForecastQuestion if found, None otherwise.
|
|
@@ -79,4 +100,4 @@ class DataSource(abc.ABC):
|
|
|
79
100
|
pass
|
|
80
101
|
|
|
81
102
|
|
|
82
|
-
__all__ = ["DataSource"]
|
|
103
|
+
__all__ = ["DataSource", "DataSourceError", "SourceFetchError", "SourceTemporalIntegrityError"]
|
|
@@ -28,7 +28,7 @@ Example:
|
|
|
28
28
|
from datetime import datetime, timezone
|
|
29
29
|
from typing import Any, Dict, List, Optional
|
|
30
30
|
|
|
31
|
-
from pydantic import AliasChoices, BaseModel, ConfigDict, Field, model_validator
|
|
31
|
+
from pydantic import AliasChoices, BaseModel, ConfigDict, Field, field_validator, model_validator
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
class MetadataBase(BaseModel):
|
|
@@ -66,6 +66,14 @@ class MetadataBase(BaseModel):
|
|
|
66
66
|
source_version: Optional[str] = Field(None, description="Version of the data source")
|
|
67
67
|
raw_data: Optional[Dict[str, Any]] = Field(None, description="Original unprocessed data")
|
|
68
68
|
|
|
69
|
+
@field_validator("created_at", "snapshot_time", mode="after")
@classmethod
def _normalize_temporal_fields(cls, value: Optional[datetime]) -> Optional[datetime]:
    r"""Store temporal boundary fields as timezone-aware UTC datetimes.

    Args:
        value: The already-validated ``created_at`` or ``snapshot_time`` value.

    Returns:
        ``None`` unchanged; a naive datetime with ``tzinfo`` set to UTC (the
        wall-clock fields are kept as-is); an aware datetime converted to UTC.
    """
    # NOTE(review): the field declarations are outside this hunk. If either field
    # is Optional, an explicit ``None`` reaches this after-validator and the
    # ``tzinfo`` lookup below would raise AttributeError -- confirm and keep or
    # drop this guard accordingly.
    if value is None:
        return None
    if value.tzinfo is None:
        # Legacy naive inputs are interpreted as UTC rather than rejected.
        return value.replace(tzinfo=timezone.utc)
    return value.astimezone(timezone.utc)
|
|
76
|
+
|
|
69
77
|
def get(self, key: str, default: Any = None) -> Any:
|
|
70
78
|
r"""Backward compatibility for dict-like access."""
|
|
71
79
|
return getattr(self, key, default)
|
|
@@ -31,10 +31,17 @@ Example:
|
|
|
31
31
|
... )
|
|
32
32
|
"""
|
|
33
33
|
|
|
34
|
-
from datetime import datetime
|
|
34
|
+
from datetime import datetime, timezone
|
|
35
35
|
from typing import Optional
|
|
36
36
|
|
|
37
|
-
from pydantic import BaseModel, Field
|
|
37
|
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _as_utc(value: datetime) -> datetime:
|
|
41
|
+
r"""Normalize datetimes to timezone-aware UTC without rejecting legacy naive inputs."""
|
|
42
|
+
if value.tzinfo is None:
|
|
43
|
+
return value.replace(tzinfo=timezone.utc)
|
|
44
|
+
return value.astimezone(timezone.utc)
|
|
38
45
|
|
|
39
46
|
|
|
40
47
|
class TradeEvent(BaseModel):
|
|
@@ -98,6 +105,12 @@ class TradeEvent(BaseModel):
|
|
|
98
105
|
description="Transaction hash for verification",
|
|
99
106
|
)
|
|
100
107
|
|
|
108
|
+
@field_validator("timestamp", mode="after")
@classmethod
def _normalize_timestamp(cls, value: datetime) -> datetime:
    r"""Convert the trade timestamp to UTC via ``_as_utc``.

    Keeping every timestamp in one timezone keeps comparisons against window
    boundaries stable.
    """
    return _as_utc(value)
|
|
113
|
+
|
|
101
114
|
@property
|
|
102
115
|
def yes_weight(self) -> float:
|
|
103
116
|
r"""Volume-weighted contribution to Yes outcome: price × amount."""
|
|
@@ -149,6 +162,27 @@ class TradeWindow(BaseModel):
|
|
|
149
162
|
description="Identifier for the market these trades belong to",
|
|
150
163
|
)
|
|
151
164
|
|
|
165
|
+
@field_validator("start_time", "end_time", mode="after")
@classmethod
def _normalize_window_boundary(cls, value: datetime) -> datetime:
    r"""Convert a window boundary (``start_time`` or ``end_time``) to UTC via ``_as_utc``.

    Runs as a field validator, i.e. before the model-level leakage check
    compares the boundaries with trade timestamps.
    """
    return _as_utc(value)
|
|
170
|
+
|
|
171
|
+
@model_validator(mode="after")
def _validate_temporal_bounds(self) -> "TradeWindow":
    r"""Reject windows with inverted bounds or with trades outside the bounds.

    Returns:
        The validated instance itself.

    Raises:
        ValueError: If ``end_time`` precedes ``start_time``, or if any trade
            timestamp lies outside ``[start_time, end_time]``.
    """
    window_start, window_end = self.start_time, self.end_time
    if window_end < window_start:
        raise ValueError("end_time must not precede start_time")

    # Both bounds are inclusive: a trade exactly on a boundary is accepted.
    has_leak = any(
        trade.timestamp < window_start or trade.timestamp > window_end
        for trade in self.trades
    )
    if has_leak:
        raise ValueError("trades must fall within [start_time, end_time]")
    return self
|
|
185
|
+
|
|
152
186
|
@property
|
|
153
187
|
def total_volume(self) -> float:
|
|
154
188
|
r"""Total trading volume in the window."""
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
# Copyright 2026 XRTM Team. All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
r"""Public entry points for XRTM corpora.
|
|
17
|
+
|
|
18
|
+
The package-root API intentionally stays focused on the stable registry and the
|
|
19
|
+
embedded real-binary corpus. Less-stable helpers remain available from their
|
|
20
|
+
submodules and are kept here as lazy compatibility exports.
|
|
21
|
+
|
|
22
|
+
Example:
|
|
23
|
+
>>> from xrtm.data.corpora import get_corpus, list_available_corpora
|
|
24
|
+
>>> corpus = get_corpus("xrtm-real-binary-v1")
|
|
25
|
+
>>> metadata_list = list_available_corpora(release_gate_only=True)
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
from importlib import import_module
|
|
31
|
+
|
|
32
|
+
from xrtm.data.corpora.forecast_importer import FORECAST_CORPUS_ID
|
|
33
|
+
from xrtm.data.corpora.real_binary import (
|
|
34
|
+
REAL_BINARY_CORPUS_ID,
|
|
35
|
+
RealBinaryCorpusSource,
|
|
36
|
+
RealBinaryQuestionRecord,
|
|
37
|
+
load_real_binary_corpus,
|
|
38
|
+
load_real_binary_questions,
|
|
39
|
+
load_real_binary_resolved_outcomes,
|
|
40
|
+
validate_real_binary_corpus,
|
|
41
|
+
)
|
|
42
|
+
from xrtm.data.corpora.registry import (
|
|
43
|
+
CorpusAvailability,
|
|
44
|
+
CorpusManifest,
|
|
45
|
+
CorpusMetadata,
|
|
46
|
+
CorpusRegistry,
|
|
47
|
+
CorpusSplit,
|
|
48
|
+
CorpusTier,
|
|
49
|
+
LicenseType,
|
|
50
|
+
describe_corpus,
|
|
51
|
+
get_corpus,
|
|
52
|
+
get_corpus_metadata,
|
|
53
|
+
list_available_corpora,
|
|
54
|
+
prepare_corpus,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
__all__ = [
|
|
58
|
+
"REAL_BINARY_CORPUS_ID",
|
|
59
|
+
"FORECAST_CORPUS_ID",
|
|
60
|
+
"RealBinaryQuestionRecord",
|
|
61
|
+
"RealBinaryCorpusSource",
|
|
62
|
+
"load_real_binary_corpus",
|
|
63
|
+
"load_real_binary_questions",
|
|
64
|
+
"load_real_binary_resolved_outcomes",
|
|
65
|
+
"validate_real_binary_corpus",
|
|
66
|
+
"CorpusRegistry",
|
|
67
|
+
"CorpusAvailability",
|
|
68
|
+
"CorpusMetadata",
|
|
69
|
+
"CorpusManifest",
|
|
70
|
+
"CorpusTier",
|
|
71
|
+
"LicenseType",
|
|
72
|
+
"CorpusSplit",
|
|
73
|
+
"describe_corpus",
|
|
74
|
+
"get_corpus",
|
|
75
|
+
"get_corpus_metadata",
|
|
76
|
+
"list_available_corpora",
|
|
77
|
+
"prepare_corpus",
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
_COMPAT_EXPORTS = {
|
|
81
|
+
"FORECAST_HF_DATASET": ("xrtm.data.corpora.forecast_importer", "FORECAST_HF_DATASET"),
|
|
82
|
+
"FOReCAstImporter": ("xrtm.data.corpora.forecast_importer", "FOReCAstImporter"),
|
|
83
|
+
"CorpusImporter": ("xrtm.data.corpora.importers", "CorpusImporter"),
|
|
84
|
+
"ImportManifest": ("xrtm.data.corpora.importers", "ImportManifest"),
|
|
85
|
+
"OfflineCorpusCache": ("xrtm.data.corpora.importers", "OfflineCorpusCache"),
|
|
86
|
+
"DeterministicFixtureImporter": ("xrtm.data.corpora.importers", "DeterministicFixtureImporter"),
|
|
87
|
+
"SplitConfig": ("xrtm.data.corpora.splits", "SplitConfig"),
|
|
88
|
+
"CorpusSplitter": ("xrtm.data.corpora.splits", "CorpusSplitter"),
|
|
89
|
+
"SplitAwareCorpusSource": ("xrtm.data.corpora.splits", "SplitAwareCorpusSource"),
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
def __getattr__(name: str):
    """Resolve compatibility exports lazily on first attribute access.

    Names listed in ``_COMPAT_EXPORTS`` map to ``(module, attribute)`` pairs; the
    module is imported only when the name is requested. Any other name raises
    ``AttributeError``.
    """
    target = _COMPAT_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    module_name, attr_name = target
    return getattr(import_module(module_name), attr_name)
|
|
97
|
+
|
|
98
|
+
def __dir__() -> list[str]:
    """List the eager public names together with the lazy compatibility exports, sorted."""
    advertised = {*__all__, *_COMPAT_EXPORTS}
    return sorted(advertised)
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
# Copyright 2026 XRTM Team. All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
r"""Concrete built-in corpus registrations.
|
|
17
|
+
|
|
18
|
+
This module keeps registry bootstrap and corpus-specific cache workflows out of
|
|
19
|
+
``registry.py`` so the registry can stay focused on generic manifest handling.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import os
|
|
25
|
+
import warnings
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import List, Optional
|
|
28
|
+
|
|
29
|
+
from xrtm.data.core import DataSource
|
|
30
|
+
from xrtm.data.corpora.forecast_importer import FORECAST_CORPUS_ID, FOReCAstImporter
|
|
31
|
+
from xrtm.data.corpora.importers import OfflineCorpusCache
|
|
32
|
+
from xrtm.data.corpora.real_binary import REAL_BINARY_CORPUS_ID, RealBinaryCorpusSource
|
|
33
|
+
from xrtm.data.corpora.registry import (
|
|
34
|
+
CorpusAvailability,
|
|
35
|
+
CorpusManifest,
|
|
36
|
+
CorpusMetadata,
|
|
37
|
+
CorpusSplit,
|
|
38
|
+
CorpusTier,
|
|
39
|
+
LicenseType,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
_FORECAST_VERSION = "1.0"
|
|
43
|
+
|
|
44
|
+
def build_builtin_manifests() -> List[CorpusManifest]:
    """Return the manifests registered at bootstrap: real-binary first, then FOReCAst."""
    builders = (_build_real_binary_manifest, _build_forecast_manifest)
    return [build() for build in builders]
|
|
50
|
+
|
|
51
|
+
def _build_real_binary_manifest() -> CorpusManifest:
    """Assemble the manifest for the bundled real-binary corpus.

    The metadata marks the corpus as Tier 1, bundled and release-gate approved;
    only the ``FULL`` split is offered. The loader builds a fresh
    ``RealBinaryCorpusSource`` on every call.
    """
    corpus_id = REAL_BINARY_CORPUS_ID
    metadata = CorpusMetadata(
        corpus_id=corpus_id,
        name="XRTM Real Binary v1",
        tier=CorpusTier.TIER_1,
        license_type=LicenseType.APACHE_2_0,
        description="Minimal deterministic real-world binary question corpus for CI smoke tests",
        version="1.0",
        release_gate_approved=True,
        bundled=True,
        size_estimate=25,
        tags=["binary", "deterministic", "embedded", "seed-corpus"],
        provenance_url="https://github.com/xrtm/xrtm",
        license_url="https://www.apache.org/licenses/LICENSE-2.0",
    )
    only_split = CorpusSplit.FULL
    return CorpusManifest(
        corpus_id=corpus_id,
        metadata=metadata,
        loader_fn=lambda: RealBinaryCorpusSource(),
        available_splits=[only_split],
        default_split=only_split,
    )
|
|
74
|
+
|
|
75
|
+
def _build_forecast_manifest() -> CorpusManifest:
    """Assemble the manifest for the externally hosted FOReCAst corpus.

    The metadata marks the corpus as Tier 2, not bundled and not release-gate
    approved. The three callbacks read the version from the metadata object at
    call time and delegate to the module-level FOReCAst helpers.
    """
    metadata = CorpusMetadata(
        corpus_id=FORECAST_CORPUS_ID,
        name="FOReCAst (Future Outcome Reasoning and Confidence Assessment)",
        tier=CorpusTier.TIER_2,
        license_type=LicenseType.MIT,
        description="Academic benchmark dataset for probabilistic forecasting from NeurIPS 2025. "
        "1,390 resolved questions from Metaculus. Evaluation-only until Tier 1 approval.",
        version=_FORECAST_VERSION,
        release_gate_approved=False,
        bundled=False,
        size_estimate=1390,
        tags=["forecast", "external", "evaluation-only", "probabilistic"],
        provenance_url="https://huggingface.co/datasets/MoyYuan/FOReCAst",
        license_url="https://opensource.org/licenses/MIT",
        citation="FOReCAst: Future Outcome Reasoning and Confidence Assessment. NeurIPS 2025 Datasets and Benchmarks Track.",
        extra={
            "tier_status": "evaluation-only",
            "promotion_required": "explicit approval needed for Tier 1 promotion",
            "non_commercial_clause": "pending clarification",
        },
    )

    def _load():
        # Loads from the default cache root; see _load_forecast_source.
        return _load_forecast_source(metadata.version)

    def _availability(cache_root):
        return _describe_forecast_corpus(metadata.version, cache_root=cache_root)

    def _prepare(cache_root, refresh, use_hf_datasets):
        return _prepare_forecast_corpus(
            metadata.version,
            cache_root=cache_root,
            refresh=refresh,
            use_hf_datasets=use_hf_datasets,
        )

    return CorpusManifest(
        corpus_id=FORECAST_CORPUS_ID,
        metadata=metadata,
        loader_fn=_load,
        available_splits=[CorpusSplit.FULL, CorpusSplit.TRAIN, CorpusSplit.EVAL],
        default_split=CorpusSplit.FULL,
        importer_module="xrtm.data.corpora.forecast_importer",
        availability_loader=_availability,
        prepare_loader=_prepare,
    )
|
|
116
|
+
|
|
117
|
+
def _describe_forecast_corpus(
    version: str,
    *,
    cache_root: Optional[Path] = None,
) -> CorpusAvailability:
    """Report the cache state of the FOReCAst corpus for ``version``.

    Args:
        version: Corpus version whose cache entry is inspected.
        cache_root: Cache directory; resolved via ``_resolve_cache_root`` when omitted.

    Returns:
        A ``CorpusAvailability`` whose ``source_mode`` is ``"preview"`` when no
        manifest is cached or its ``import_method`` is missing or ``"fixture"``,
        and ``"external-cache"`` otherwise.
    """
    root = _resolve_cache_root(cache_root)
    cache = OfflineCorpusCache(root)
    manifest = cache.load_manifest(FORECAST_CORPUS_ID, version)

    if manifest is None:
        is_cached, import_method, record_count = False, None, None
    else:
        is_cached = True
        import_method = manifest.metadata.get("import_method")
        record_count = manifest.record_count

    if import_method in {None, "fixture"}:
        source_mode = "preview"
    else:
        source_mode = "external-cache"

    return CorpusAvailability(
        corpus_id=FORECAST_CORPUS_ID,
        version=version,
        source_mode=source_mode,
        bundled=False,
        already_cached=is_cached,
        record_count=record_count,
        import_method=import_method,
        cache_root=root,
        data_dir=cache.get_corpus_dir(FORECAST_CORPUS_ID, version),
        manifest_path=cache.get_manifest_path(FORECAST_CORPUS_ID, version),
    )
|
|
140
|
+
|
|
141
|
+
def _prepare_forecast_corpus(
    version: str,
    *,
    cache_root: Optional[Path] = None,
    refresh: bool = False,
    use_hf_datasets: bool = True,
) -> CorpusAvailability:
    """Ensure the FOReCAst corpus is cached, importing it when needed.

    Args:
        version: Corpus version to prepare.
        cache_root: Cache directory; resolved via ``_resolve_cache_root`` when omitted.
        refresh: Re-import even if the cache already holds this version.
        use_hf_datasets: Forwarded to ``FOReCAstImporter``.

    Returns:
        The availability report produced after the (possibly skipped) import.
    """
    root = _resolve_cache_root(cache_root)
    cache = OfflineCorpusCache(root)

    already_cached = cache.is_cached(FORECAST_CORPUS_ID, version)
    if refresh or not already_cached:
        importer = FOReCAstImporter(use_hf_datasets=use_hf_datasets)
        target_dir = cache.get_corpus_dir(FORECAST_CORPUS_ID, version)
        imported = importer.import_corpus(target_dir, version=version)
        cache.save_manifest(imported)

    return _describe_forecast_corpus(version, cache_root=root)
|
|
158
|
+
|
|
159
|
+
def _load_forecast_source(version: str) -> DataSource:
    """Load the FOReCAst corpus from the default cache root.

    When no manifest is cached, a ``UserWarning`` is emitted, the importer is
    run with ``use_hf_datasets=False`` and the resulting manifest is saved to
    the cache. When the cached manifest's ``import_method`` is ``"fixture"``, a
    ``UserWarning`` is emitted and the cached data is loaded as-is.

    Args:
        version: Corpus version to load.

    Returns:
        The data source built by ``FOReCAstImporter.load_from_manifest``.
    """
    importer = FOReCAstImporter(use_hf_datasets=False)
    cache = OfflineCorpusCache(_resolve_cache_root())
    data_dir = cache.get_corpus_dir(FORECAST_CORPUS_ID, version)
    manifest = cache.load_manifest(FORECAST_CORPUS_ID, version)

    # The warnings stay inline (not in a helper) so stacklevel=4 keeps
    # pointing at the same caller frame.
    if manifest is not None:
        if manifest.metadata.get("import_method") == "fixture":
            warnings.warn(
                "FOReCAst cache currently contains the deterministic preview only. "
                "Refresh the cache with the external dataset before relying on large-scale counts.",
                UserWarning,
                stacklevel=4,
            )
        return importer.load_from_manifest(manifest, data_dir)

    warnings.warn(
        "FOReCAst full cache not found; using the 3-record deterministic preview. "
        "Prepare the corpus cache first to run large-scale validation.",
        UserWarning,
        stacklevel=4,
    )
    manifest = importer.import_corpus(data_dir, version=version)
    cache.save_manifest(manifest)
    return importer.load_from_manifest(manifest, data_dir)
|
|
181
|
+
|
|
182
|
+
def _resolve_cache_root(cache_root: Optional[Path] = None) -> Path:
|
|
183
|
+
if cache_root is not None:
|
|
184
|
+
return cache_root
|
|
185
|
+
return Path(os.environ.get("XRTM_CORPUS_CACHE", Path.home() / ".xrtm" / "corpus-cache"))
|