xrtm-data 0.2.5__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xrtm_data-0.2.5/src/xrtm_data.egg-info → xrtm_data-0.2.6}/PKG-INFO +1 -1
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/pyproject.toml +1 -1
- xrtm_data-0.2.6/src/xrtm/data/corpora/__init__.py +99 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/_builtin_corpora.py +185 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/forecast_importer.py +517 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/importers.py +296 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/corpora/real_binary.py +8 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/registry.py +359 -0
- xrtm_data-0.2.6/src/xrtm/data/corpora/splits.py +261 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/version.py +1 -1
- {xrtm_data-0.2.5 → xrtm_data-0.2.6/src/xrtm_data.egg-info}/PKG-INFO +1 -1
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/SOURCES.txt +9 -0
- xrtm_data-0.2.6/tests/test_corpus_importers.py +270 -0
- xrtm_data-0.2.6/tests/test_corpus_registry.py +281 -0
- xrtm_data-0.2.6/tests/test_corpus_splits.py +278 -0
- xrtm_data-0.2.6/tests/test_forecast_importer.py +302 -0
- xrtm_data-0.2.5/src/xrtm/data/corpora/__init__.py +0 -36
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/LICENSE +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/README.md +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/setup.cfg +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/cli/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/interfaces.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/forecast.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/prior.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/trade.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/kit/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/kit/processors/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/local/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/local/csv.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/online/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/online/polymarket.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/subgraph/__init__.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/subgraph/polymarket.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/dependency_links.txt +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/entry_points.txt +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/requires.txt +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/top_level.txt +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_beta_fitter.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_cli_loading.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_cli_ux.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_local_datasource.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_polymarket_source.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_polymarket_subgraph.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_prior_schemas.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_real_binary_corpus.py +0 -0
- {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_schemas.py +0 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
# Copyright 2026 XRTM Team. All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
r"""Public entry points for XRTM corpora.
|
|
17
|
+
|
|
18
|
+
The package-root API intentionally stays focused on the stable registry and the
|
|
19
|
+
embedded real-binary corpus. Less-stable helpers remain available from their
|
|
20
|
+
submodules and are kept here as lazy compatibility exports.
|
|
21
|
+
|
|
22
|
+
Example:
|
|
23
|
+
>>> from xrtm.data.corpora import get_corpus, list_available_corpora
|
|
24
|
+
>>> corpus = get_corpus("xrtm-real-binary-v1")
|
|
25
|
+
>>> metadata_list = list_available_corpora(release_gate_only=True)
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
from importlib import import_module
|
|
31
|
+
|
|
32
|
+
from xrtm.data.corpora.forecast_importer import FORECAST_CORPUS_ID
|
|
33
|
+
from xrtm.data.corpora.real_binary import (
|
|
34
|
+
REAL_BINARY_CORPUS_ID,
|
|
35
|
+
RealBinaryCorpusSource,
|
|
36
|
+
RealBinaryQuestionRecord,
|
|
37
|
+
load_real_binary_corpus,
|
|
38
|
+
load_real_binary_questions,
|
|
39
|
+
load_real_binary_resolved_outcomes,
|
|
40
|
+
validate_real_binary_corpus,
|
|
41
|
+
)
|
|
42
|
+
from xrtm.data.corpora.registry import (
|
|
43
|
+
CorpusAvailability,
|
|
44
|
+
CorpusManifest,
|
|
45
|
+
CorpusMetadata,
|
|
46
|
+
CorpusRegistry,
|
|
47
|
+
CorpusSplit,
|
|
48
|
+
CorpusTier,
|
|
49
|
+
LicenseType,
|
|
50
|
+
describe_corpus,
|
|
51
|
+
get_corpus,
|
|
52
|
+
get_corpus_metadata,
|
|
53
|
+
list_available_corpora,
|
|
54
|
+
prepare_corpus,
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
# Stable package-root API. Names served lazily through ``_COMPAT_EXPORTS`` are
# intentionally not listed here.
__all__ = [
    # Corpus identifiers
    "REAL_BINARY_CORPUS_ID",
    "FORECAST_CORPUS_ID",
    # Embedded real-binary corpus
    "RealBinaryQuestionRecord",
    "RealBinaryCorpusSource",
    "load_real_binary_corpus",
    "load_real_binary_questions",
    "load_real_binary_resolved_outcomes",
    "validate_real_binary_corpus",
    # Registry types
    "CorpusRegistry",
    "CorpusAvailability",
    "CorpusMetadata",
    "CorpusManifest",
    "CorpusTier",
    "LicenseType",
    "CorpusSplit",
    # Registry entry points
    "describe_corpus",
    "get_corpus",
    "get_corpus_metadata",
    "list_available_corpora",
    "prepare_corpus",
]
|
|
79
|
+
|
|
80
|
+
# Maps each lazily exported name to ``(module path, attribute name)``. Every
# entry is re-exported under its original name, so the table is generated from
# per-module groups; iteration order matches the order written below.
_COMPAT_EXPORTS = {
    export_name: (module_path, export_name)
    for module_path, export_names in (
        (
            "xrtm.data.corpora.forecast_importer",
            ("FORECAST_HF_DATASET", "FOReCAstImporter"),
        ),
        (
            "xrtm.data.corpora.importers",
            (
                "CorpusImporter",
                "ImportManifest",
                "OfflineCorpusCache",
                "DeterministicFixtureImporter",
            ),
        ),
        (
            "xrtm.data.corpora.splits",
            ("SplitConfig", "CorpusSplitter", "SplitAwareCorpusSource"),
        ),
    )
    for export_name in export_names
}
|
|
91
|
+
|
|
92
|
+
def __getattr__(name: str):
    """Resolve a lazy compatibility export on first attribute access.

    Called by Python only when ``name`` is not found among the module's
    regular globals. Names listed in ``_COMPAT_EXPORTS`` are looked up on
    their home submodule, which is imported on demand; the result is not
    stored on this module, so each access repeats the lookup.

    Args:
        name: Attribute requested on this package.

    Returns:
        The attribute fetched from the submodule recorded in ``_COMPAT_EXPORTS``.

    Raises:
        AttributeError: If ``name`` is not a registered compatibility export.
    """
    target = _COMPAT_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_name, attr_name = target
    provider = import_module(module_name)
    return getattr(provider, attr_name)
|
|
97
|
+
|
|
98
|
+
def __dir__() -> list[str]:
    """List the stable exports plus the lazy compatibility names, sorted."""
    visible = set(__all__)
    visible.update(_COMPAT_EXPORTS)  # iterating the dict contributes its keys
    return sorted(visible)
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# coding=utf-8
|
|
2
|
+
# Copyright 2026 XRTM Team. All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
r"""Concrete built-in corpus registrations.
|
|
17
|
+
|
|
18
|
+
This module keeps registry bootstrap and corpus-specific cache workflows out of
|
|
19
|
+
``registry.py`` so the registry can stay focused on generic manifest handling.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import os
|
|
25
|
+
import warnings
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import List, Optional
|
|
28
|
+
|
|
29
|
+
from xrtm.data.core import DataSource
|
|
30
|
+
from xrtm.data.corpora.forecast_importer import FORECAST_CORPUS_ID, FOReCAstImporter
|
|
31
|
+
from xrtm.data.corpora.importers import OfflineCorpusCache
|
|
32
|
+
from xrtm.data.corpora.real_binary import REAL_BINARY_CORPUS_ID, RealBinaryCorpusSource
|
|
33
|
+
from xrtm.data.corpora.registry import (
|
|
34
|
+
CorpusAvailability,
|
|
35
|
+
CorpusManifest,
|
|
36
|
+
CorpusMetadata,
|
|
37
|
+
CorpusSplit,
|
|
38
|
+
CorpusTier,
|
|
39
|
+
LicenseType,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
# Version string recorded in the FOReCAst corpus metadata; the cache helpers in
# this module receive it (via the metadata) when locating the corpus directory
# and manifest.
_FORECAST_VERSION = "1.0"
|
|
43
|
+
|
|
44
|
+
def build_builtin_manifests() -> List[CorpusManifest]:
    """Return freshly built manifests for every built-in corpus.

    The real-binary manifest comes first, followed by the FOReCAst manifest.
    """
    manifest_builders = (
        _build_real_binary_manifest,
        _build_forecast_manifest,
    )
    return [build() for build in manifest_builders]
|
|
50
|
+
|
|
51
|
+
def _build_real_binary_manifest() -> CorpusManifest:
    """Build the manifest for the bundled real-binary corpus.

    The corpus is registered as Tier 1, release-gate approved and bundled, and
    exposes only the ``FULL`` split.
    """
    return CorpusManifest(
        corpus_id=REAL_BINARY_CORPUS_ID,
        metadata=CorpusMetadata(
            corpus_id=REAL_BINARY_CORPUS_ID,
            name="XRTM Real Binary v1",
            tier=CorpusTier.TIER_1,
            license_type=LicenseType.APACHE_2_0,
            description="Minimal deterministic real-world binary question corpus for CI smoke tests",
            version="1.0",
            release_gate_approved=True,
            bundled=True,
            size_estimate=25,
            tags=["binary", "deterministic", "embedded", "seed-corpus"],
            provenance_url="https://github.com/xrtm/xrtm",
            license_url="https://www.apache.org/licenses/LICENSE-2.0",
        ),
        # Kept as a lambda so the source class is looked up when the loader
        # runs rather than when the manifest is built.
        loader_fn=lambda: RealBinaryCorpusSource(),
        available_splits=[CorpusSplit.FULL],
        default_split=CorpusSplit.FULL,
    )
|
|
74
|
+
|
|
75
|
+
def _build_forecast_manifest() -> CorpusManifest:
    """Build the manifest for the external FOReCAst corpus.

    The corpus is registered as Tier 2, not release-gate approved and not
    bundled. Its loader, availability and prepare hooks delegate to the
    cache helpers in this module, each reading the version from the metadata
    object at call time.
    """
    metadata = CorpusMetadata(
        corpus_id=FORECAST_CORPUS_ID,
        name="FOReCAst (Future Outcome Reasoning and Confidence Assessment)",
        tier=CorpusTier.TIER_2,
        license_type=LicenseType.MIT,
        description=(
            "Academic benchmark dataset for probabilistic forecasting from NeurIPS 2025. "
            "1,390 resolved questions from Metaculus. Evaluation-only until Tier 1 approval."
        ),
        version=_FORECAST_VERSION,
        release_gate_approved=False,
        bundled=False,
        size_estimate=1390,
        tags=["forecast", "external", "evaluation-only", "probabilistic"],
        provenance_url="https://huggingface.co/datasets/MoyYuan/FOReCAst",
        license_url="https://opensource.org/licenses/MIT",
        citation="FOReCAst: Future Outcome Reasoning and Confidence Assessment. NeurIPS 2025 Datasets and Benchmarks Track.",
        extra={
            "tier_status": "evaluation-only",
            "promotion_required": "explicit approval needed for Tier 1 promotion",
            "non_commercial_clause": "pending clarification",
        },
    )

    def _load():
        # Loader hook: open the corpus from the offline cache.
        return _load_forecast_source(metadata.version)

    def _availability(cache_root):
        # Availability hook: report cache state without importing anything.
        return _describe_forecast_corpus(metadata.version, cache_root=cache_root)

    def _prepare(cache_root, refresh, use_hf_datasets):
        # Prepare hook: populate (or refresh) the cache, then report its state.
        return _prepare_forecast_corpus(
            metadata.version,
            cache_root=cache_root,
            refresh=refresh,
            use_hf_datasets=use_hf_datasets,
        )

    return CorpusManifest(
        corpus_id=FORECAST_CORPUS_ID,
        metadata=metadata,
        loader_fn=_load,
        available_splits=[CorpusSplit.FULL, CorpusSplit.TRAIN, CorpusSplit.EVAL],
        default_split=CorpusSplit.FULL,
        importer_module="xrtm.data.corpora.forecast_importer",
        availability_loader=_availability,
        prepare_loader=_prepare,
    )
|
|
116
|
+
|
|
117
|
+
def _describe_forecast_corpus(
    version: str,
    *,
    cache_root: Optional[Path] = None,
) -> CorpusAvailability:
    """Report what the offline cache currently holds for FOReCAst.

    Args:
        version: Corpus version used to locate the cache entry.
        cache_root: Cache directory override; resolved through
            ``_resolve_cache_root`` when omitted.

    Returns:
        A ``CorpusAvailability`` whose ``source_mode`` is ``"preview"`` when no
        manifest is cached or the cached manifest's ``import_method`` is
        ``"fixture"``, and ``"external-cache"`` otherwise.
    """
    root = _resolve_cache_root(cache_root)
    corpus_cache = OfflineCorpusCache(root)
    cached_manifest = corpus_cache.load_manifest(FORECAST_CORPUS_ID, version)

    if cached_manifest is None:
        has_manifest = False
        import_method = None
        record_count = None
    else:
        has_manifest = True
        import_method = cached_manifest.metadata.get("import_method")
        record_count = cached_manifest.record_count

    if import_method in {None, "fixture"}:
        source_mode = "preview"
    else:
        source_mode = "external-cache"

    return CorpusAvailability(
        corpus_id=FORECAST_CORPUS_ID,
        version=version,
        source_mode=source_mode,
        bundled=False,
        already_cached=has_manifest,
        record_count=record_count,
        import_method=import_method,
        cache_root=root,
        data_dir=corpus_cache.get_corpus_dir(FORECAST_CORPUS_ID, version),
        manifest_path=corpus_cache.get_manifest_path(FORECAST_CORPUS_ID, version),
    )
|
|
140
|
+
|
|
141
|
+
def _prepare_forecast_corpus(
    version: str,
    *,
    cache_root: Optional[Path] = None,
    refresh: bool = False,
    use_hf_datasets: bool = True,
) -> CorpusAvailability:
    """Ensure the FOReCAst cache entry exists, then describe it.

    Args:
        version: Corpus version to prepare.
        cache_root: Cache directory override; resolved through
            ``_resolve_cache_root`` when omitted.
        refresh: When true, re-import even if the cache already has an entry.
        use_hf_datasets: Forwarded unchanged to ``FOReCAstImporter``.

    Returns:
        The availability report for the cache entry after any import.
    """
    root = _resolve_cache_root(cache_root)
    corpus_cache = OfflineCorpusCache(root)

    reuse_existing = corpus_cache.is_cached(FORECAST_CORPUS_ID, version) and not refresh
    if not reuse_existing:
        forecast_importer = FOReCAstImporter(use_hf_datasets=use_hf_datasets)
        target_dir = corpus_cache.get_corpus_dir(FORECAST_CORPUS_ID, version)
        imported = forecast_importer.import_corpus(target_dir, version=version)
        corpus_cache.save_manifest(imported)

    return _describe_forecast_corpus(version, cache_root=root)
|
|
158
|
+
|
|
159
|
+
def _load_forecast_source(version: str) -> DataSource:
    """Open the FOReCAst corpus from the default offline cache.

    The importer is created with ``use_hf_datasets=False``. When the cache has
    no manifest for ``version``, a warning is emitted, the importer writes its
    data into the cache directory and the resulting manifest is saved. When
    the cached manifest reports ``import_method == "fixture"``, only a warning
    is emitted.

    Args:
        version: Corpus version used to locate the cache entry.

    Returns:
        The data source built by ``FOReCAstImporter.load_from_manifest``.
    """
    preview_importer = FOReCAstImporter(use_hf_datasets=False)
    corpus_cache = OfflineCorpusCache(_resolve_cache_root())
    corpus_dir = corpus_cache.get_corpus_dir(FORECAST_CORPUS_ID, version)
    active_manifest = corpus_cache.load_manifest(FORECAST_CORPUS_ID, version)

    # NOTE: warnings.warn is called directly in this frame (not via a helper)
    # so stacklevel=4 keeps pointing at the same caller frame. Presumably that
    # is the code calling into the registry -- confirm against the call chain.
    if active_manifest is None:
        missing_cache_message = (
            "FOReCAst full cache not found; using the 3-record deterministic preview. "
            "Prepare the corpus cache first to run large-scale validation."
        )
        warnings.warn(missing_cache_message, UserWarning, stacklevel=4)
        active_manifest = preview_importer.import_corpus(corpus_dir, version=version)
        corpus_cache.save_manifest(active_manifest)
    elif active_manifest.metadata.get("import_method") == "fixture":
        preview_only_message = (
            "FOReCAst cache currently contains the deterministic preview only. "
            "Refresh the cache with the external dataset before relying on large-scale counts."
        )
        warnings.warn(preview_only_message, UserWarning, stacklevel=4)

    return preview_importer.load_from_manifest(active_manifest, corpus_dir)
|
|
181
|
+
|
|
182
|
+
def _resolve_cache_root(cache_root: Optional[Path] = None) -> Path:
    """Pick the directory that holds the offline corpus cache.

    Resolution order:

    1. An explicit ``cache_root`` argument, returned unchanged.
    2. A non-empty ``XRTM_CORPUS_CACHE`` environment variable, with a leading
       ``~`` expanded to the user's home directory.
    3. ``.xrtm/corpus-cache`` under the current user's home directory.

    Args:
        cache_root: Caller-supplied cache directory, or ``None`` to fall back
            to the environment variable and then the default location.

    Returns:
        The resolved cache root. No directory is created here.
    """
    if cache_root is not None:
        return cache_root

    configured = os.environ.get("XRTM_CORPUS_CACHE")
    # An empty value is treated as "unset": Path("") would silently resolve to
    # the current working directory and scatter cache files there.
    if configured:
        return Path(configured).expanduser()

    # Evaluated only when needed, so a home-directory lookup failure cannot
    # break callers who configured the cache location explicitly.
    return Path.home() / ".xrtm" / "corpus-cache"
|