xrtm-data 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. {xrtm_data-0.2.5/src/xrtm_data.egg-info → xrtm_data-0.2.6}/PKG-INFO +1 -1
  2. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/pyproject.toml +1 -1
  3. xrtm_data-0.2.6/src/xrtm/data/corpora/__init__.py +99 -0
  4. xrtm_data-0.2.6/src/xrtm/data/corpora/_builtin_corpora.py +185 -0
  5. xrtm_data-0.2.6/src/xrtm/data/corpora/forecast_importer.py +517 -0
  6. xrtm_data-0.2.6/src/xrtm/data/corpora/importers.py +296 -0
  7. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/corpora/real_binary.py +8 -0
  8. xrtm_data-0.2.6/src/xrtm/data/corpora/registry.py +359 -0
  9. xrtm_data-0.2.6/src/xrtm/data/corpora/splits.py +261 -0
  10. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/version.py +1 -1
  11. {xrtm_data-0.2.5 → xrtm_data-0.2.6/src/xrtm_data.egg-info}/PKG-INFO +1 -1
  12. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/SOURCES.txt +9 -0
  13. xrtm_data-0.2.6/tests/test_corpus_importers.py +270 -0
  14. xrtm_data-0.2.6/tests/test_corpus_registry.py +281 -0
  15. xrtm_data-0.2.6/tests/test_corpus_splits.py +278 -0
  16. xrtm_data-0.2.6/tests/test_forecast_importer.py +302 -0
  17. xrtm_data-0.2.5/src/xrtm/data/corpora/__init__.py +0 -36
  18. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/LICENSE +0 -0
  19. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/README.md +0 -0
  20. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/setup.cfg +0 -0
  21. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/__init__.py +0 -0
  22. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/cli/__init__.py +0 -0
  23. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/__init__.py +0 -0
  24. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/interfaces.py +0 -0
  25. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/__init__.py +0 -0
  26. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/forecast.py +0 -0
  27. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/prior.py +0 -0
  28. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/core/schemas/trade.py +0 -0
  29. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/kit/__init__.py +0 -0
  30. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/kit/processors/__init__.py +0 -0
  31. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/__init__.py +0 -0
  32. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/local/__init__.py +0 -0
  33. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/local/csv.py +0 -0
  34. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/online/__init__.py +0 -0
  35. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/online/polymarket.py +0 -0
  36. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/subgraph/__init__.py +0 -0
  37. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm/data/providers/subgraph/polymarket.py +0 -0
  38. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/dependency_links.txt +0 -0
  39. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/entry_points.txt +0 -0
  40. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/requires.txt +0 -0
  41. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/src/xrtm_data.egg-info/top_level.txt +0 -0
  42. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_beta_fitter.py +0 -0
  43. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_cli_loading.py +0 -0
  44. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_cli_ux.py +0 -0
  45. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_local_datasource.py +0 -0
  46. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_polymarket_source.py +0 -0
  47. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_polymarket_subgraph.py +0 -0
  48. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_prior_schemas.py +0 -0
  49. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_real_binary_corpus.py +0 -0
  50. {xrtm_data-0.2.5 → xrtm_data-0.2.6}/tests/test_schemas.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xrtm-data
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: The Snapshot Vault for XRTM.
5
5
  Author-email: XRTM Team <moy@xrtm.org>
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "xrtm-data"
7
- version = "0.2.5"
7
+ version = "0.2.6"
8
8
  description = "The Snapshot Vault for XRTM."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11,<3.13"
@@ -0,0 +1,99 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""Public entry points for XRTM corpora.
17
+
18
+ The package-root API intentionally stays focused on the stable registry and the
19
+ embedded real-binary corpus. Less-stable helpers remain available from their
20
+ submodules and are kept here as lazy compatibility exports.
21
+
22
+ Example:
23
+ >>> from xrtm.data.corpora import get_corpus, list_available_corpora
24
+ >>> corpus = get_corpus("xrtm-real-binary-v1")
25
+ >>> metadata_list = list_available_corpora(release_gate_only=True)
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ from importlib import import_module
31
+
32
+ from xrtm.data.corpora.forecast_importer import FORECAST_CORPUS_ID
33
+ from xrtm.data.corpora.real_binary import (
34
+ REAL_BINARY_CORPUS_ID,
35
+ RealBinaryCorpusSource,
36
+ RealBinaryQuestionRecord,
37
+ load_real_binary_corpus,
38
+ load_real_binary_questions,
39
+ load_real_binary_resolved_outcomes,
40
+ validate_real_binary_corpus,
41
+ )
42
+ from xrtm.data.corpora.registry import (
43
+ CorpusAvailability,
44
+ CorpusManifest,
45
+ CorpusMetadata,
46
+ CorpusRegistry,
47
+ CorpusSplit,
48
+ CorpusTier,
49
+ LicenseType,
50
+ describe_corpus,
51
+ get_corpus,
52
+ get_corpus_metadata,
53
+ list_available_corpora,
54
+ prepare_corpus,
55
+ )
56
+
57
# Stable package-root API: every name below is imported eagerly at the top of
# this module from ``forecast_importer``, ``real_binary`` or ``registry``.
__all__ = [
    # Corpus identifiers
    "REAL_BINARY_CORPUS_ID",
    "FORECAST_CORPUS_ID",
    # Embedded real-binary corpus: record type, source, loaders, validation
    "RealBinaryQuestionRecord",
    "RealBinaryCorpusSource",
    "load_real_binary_corpus",
    "load_real_binary_questions",
    "load_real_binary_resolved_outcomes",
    "validate_real_binary_corpus",
    # Registry types
    "CorpusRegistry",
    "CorpusAvailability",
    "CorpusMetadata",
    "CorpusManifest",
    "CorpusTier",
    "LicenseType",
    "CorpusSplit",
    # Registry convenience functions
    "describe_corpus",
    "get_corpus",
    "get_corpus_metadata",
    "list_available_corpora",
    "prepare_corpus",
]
79
+
80
# Lazily resolved compatibility exports. Each key is the name requested from
# the package root and maps to a ``(module path, attribute name)`` pair; the
# entries are grouped here by the submodule that defines them, and every
# export keeps the same name it has in that submodule.
_COMPAT_EXPORTS = {
    export: (module_path, export)
    for module_path, exports in (
        (
            "xrtm.data.corpora.forecast_importer",
            ("FORECAST_HF_DATASET", "FOReCAstImporter"),
        ),
        (
            "xrtm.data.corpora.importers",
            (
                "CorpusImporter",
                "ImportManifest",
                "OfflineCorpusCache",
                "DeterministicFixtureImporter",
            ),
        ),
        (
            "xrtm.data.corpora.splits",
            ("SplitConfig", "CorpusSplitter", "SplitAwareCorpusSource"),
        ),
    )
    for export in exports
}
91
+
92
def __getattr__(name: str):
    """Resolve a lazy compatibility export requested from the package root.

    Looks ``name`` up in ``_COMPAT_EXPORTS``, imports the submodule recorded
    there and returns the recorded attribute from it. The result is not stored
    on this module, so each access goes through ``import_module`` again.

    Args:
        name: Attribute name being accessed on the package.

    Returns:
        The attribute fetched from the owning submodule.

    Raises:
        AttributeError: If ``name`` is not a registered compatibility export.
    """
    target = _COMPAT_EXPORTS.get(name)
    if target is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    module_name, attr_name = target
    owning_module = import_module(module_name)
    return getattr(owning_module, attr_name)
97
+
98
def __dir__() -> list[str]:
    """Return the sorted, de-duplicated names of stable and lazy exports."""
    advertised = {*__all__, *_COMPAT_EXPORTS}
    return sorted(advertised)
@@ -0,0 +1,185 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 XRTM Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ r"""Concrete built-in corpus registrations.
17
+
18
+ This module keeps registry bootstrap and corpus-specific cache workflows out of
19
+ ``registry.py`` so the registry can stay focused on generic manifest handling.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import os
25
+ import warnings
26
+ from pathlib import Path
27
+ from typing import List, Optional
28
+
29
+ from xrtm.data.core import DataSource
30
+ from xrtm.data.corpora.forecast_importer import FORECAST_CORPUS_ID, FOReCAstImporter
31
+ from xrtm.data.corpora.importers import OfflineCorpusCache
32
+ from xrtm.data.corpora.real_binary import REAL_BINARY_CORPUS_ID, RealBinaryCorpusSource
33
+ from xrtm.data.corpora.registry import (
34
+ CorpusAvailability,
35
+ CorpusManifest,
36
+ CorpusMetadata,
37
+ CorpusSplit,
38
+ CorpusTier,
39
+ LicenseType,
40
+ )
41
+
42
# Version string recorded in the FOReCAst corpus metadata; the manifest's
# loader callbacks pass that metadata version on to the cache helpers below.
_FORECAST_VERSION = "1.0"
43
+
44
def build_builtin_manifests() -> List[CorpusManifest]:
    """Return freshly built manifests for the built-in corpora.

    The real-binary manifest comes first, followed by the FOReCAst manifest.
    """
    manifest_builders = (_build_real_binary_manifest, _build_forecast_manifest)
    return [build() for build in manifest_builders]
50
+
51
def _build_real_binary_manifest() -> CorpusManifest:
    """Build the manifest for the bundled, release-gate-approved real-binary corpus.

    The corpus is registered as Tier 1 under Apache-2.0 and only exposes the
    ``FULL`` split; its loader instantiates ``RealBinaryCorpusSource`` on demand.
    """
    return CorpusManifest(
        corpus_id=REAL_BINARY_CORPUS_ID,
        metadata=CorpusMetadata(
            corpus_id=REAL_BINARY_CORPUS_ID,
            name="XRTM Real Binary v1",
            tier=CorpusTier.TIER_1,
            license_type=LicenseType.APACHE_2_0,
            description="Minimal deterministic real-world binary question corpus for CI smoke tests",
            version="1.0",
            release_gate_approved=True,
            bundled=True,
            size_estimate=25,
            tags=["binary", "deterministic", "embedded", "seed-corpus"],
            provenance_url="https://github.com/xrtm/xrtm",
            license_url="https://www.apache.org/licenses/LICENSE-2.0",
        ),
        # Deferred so the source is only constructed when the corpus is loaded.
        loader_fn=lambda: RealBinaryCorpusSource(),
        available_splits=[CorpusSplit.FULL],
        default_split=CorpusSplit.FULL,
    )
74
+
75
def _build_forecast_manifest() -> CorpusManifest:
    """Build the manifest for the external FOReCAst corpus.

    The corpus is registered as Tier 2, MIT-licensed, not bundled and not
    release-gate approved. Loading, availability checks and cache preparation
    are delegated to the module-level FOReCAst helpers, each of which receives
    the version currently stored on the metadata object.
    """
    metadata = CorpusMetadata(
        corpus_id=FORECAST_CORPUS_ID,
        name="FOReCAst (Future Outcome Reasoning and Confidence Assessment)",
        tier=CorpusTier.TIER_2,
        license_type=LicenseType.MIT,
        description=(
            "Academic benchmark dataset for probabilistic forecasting from NeurIPS 2025. "
            "1,390 resolved questions from Metaculus. Evaluation-only until Tier 1 approval."
        ),
        version=_FORECAST_VERSION,
        release_gate_approved=False,
        bundled=False,
        size_estimate=1390,
        tags=["forecast", "external", "evaluation-only", "probabilistic"],
        provenance_url="https://huggingface.co/datasets/MoyYuan/FOReCAst",
        license_url="https://opensource.org/licenses/MIT",
        citation="FOReCAst: Future Outcome Reasoning and Confidence Assessment. NeurIPS 2025 Datasets and Benchmarks Track.",
        extra={
            "tier_status": "evaluation-only",
            "promotion_required": "explicit approval needed for Tier 1 promotion",
            "non_commercial_clause": "pending clarification",
        },
    )

    # The callbacks read ``metadata.version`` when invoked, not when defined.
    def _load():
        return _load_forecast_source(metadata.version)

    def _availability(cache_root):
        return _describe_forecast_corpus(metadata.version, cache_root=cache_root)

    def _prepare(cache_root, refresh, use_hf_datasets):
        return _prepare_forecast_corpus(
            metadata.version,
            cache_root=cache_root,
            refresh=refresh,
            use_hf_datasets=use_hf_datasets,
        )

    return CorpusManifest(
        corpus_id=FORECAST_CORPUS_ID,
        metadata=metadata,
        loader_fn=_load,
        available_splits=[CorpusSplit.FULL, CorpusSplit.TRAIN, CorpusSplit.EVAL],
        default_split=CorpusSplit.FULL,
        importer_module="xrtm.data.corpora.forecast_importer",
        availability_loader=_availability,
        prepare_loader=_prepare,
    )
116
+
117
def _describe_forecast_corpus(
    version: str,
    *,
    cache_root: Optional[Path] = None,
) -> CorpusAvailability:
    """Report what is currently cached for the FOReCAst corpus.

    Args:
        version: Corpus version whose cache entry is inspected.
        cache_root: Cache directory to inspect; resolved through
            ``_resolve_cache_root`` when omitted.

    Returns:
        A ``CorpusAvailability`` built from the cached manifest, if any. The
        source mode is ``"preview"`` when no manifest exists or its recorded
        import method is ``"fixture"``, and ``"external-cache"`` otherwise.
    """
    root = _resolve_cache_root(cache_root)
    cache = OfflineCorpusCache(root)
    manifest = cache.load_manifest(FORECAST_CORPUS_ID, version)

    if manifest is None:
        is_cached = False
        import_method = None
        record_count = None
    else:
        is_cached = True
        import_method = manifest.metadata.get("import_method")
        record_count = manifest.record_count

    preview_methods = {None, "fixture"}
    if import_method in preview_methods:
        source_mode = "preview"
    else:
        source_mode = "external-cache"

    return CorpusAvailability(
        corpus_id=FORECAST_CORPUS_ID,
        version=version,
        source_mode=source_mode,
        bundled=False,
        already_cached=is_cached,
        record_count=record_count,
        import_method=import_method,
        cache_root=root,
        data_dir=cache.get_corpus_dir(FORECAST_CORPUS_ID, version),
        manifest_path=cache.get_manifest_path(FORECAST_CORPUS_ID, version),
    )
140
+
141
def _prepare_forecast_corpus(
    version: str,
    *,
    cache_root: Optional[Path] = None,
    refresh: bool = False,
    use_hf_datasets: bool = True,
) -> CorpusAvailability:
    """Ensure the FOReCAst corpus is cached, then describe the cache.

    Args:
        version: Corpus version to prepare.
        cache_root: Cache directory to use; resolved through
            ``_resolve_cache_root`` when omitted.
        refresh: Re-import even when a cache entry already exists.
        use_hf_datasets: Forwarded to ``FOReCAstImporter``.

    Returns:
        The availability reported by ``_describe_forecast_corpus`` after any
        import has been written to the cache.
    """
    root = _resolve_cache_root(cache_root)
    cache = OfflineCorpusCache(root)

    already_cached = cache.is_cached(FORECAST_CORPUS_ID, version)
    if refresh or not already_cached:
        importer = FOReCAstImporter(use_hf_datasets=use_hf_datasets)
        target_dir = cache.get_corpus_dir(FORECAST_CORPUS_ID, version)
        imported_manifest = importer.import_corpus(target_dir, version=version)
        cache.save_manifest(imported_manifest)

    return _describe_forecast_corpus(version, cache_root=root)
158
+
159
def _load_forecast_source(version: str) -> DataSource:
    """Load the FOReCAst corpus from the default cache location.

    When no manifest is cached, a ``UserWarning`` is emitted, the importer
    (created with ``use_hf_datasets=False``) writes its data into the cache
    directory and the resulting manifest is saved. When the cached manifest
    records the ``"fixture"`` import method, a ``UserWarning`` is emitted and
    the cached data is used as-is.

    Args:
        version: Corpus version to load.

    Returns:
        The data source produced by ``FOReCAstImporter.load_from_manifest``.
    """
    importer = FOReCAstImporter(use_hf_datasets=False)
    cache = OfflineCorpusCache(_resolve_cache_root())
    data_dir = cache.get_corpus_dir(FORECAST_CORPUS_ID, version)
    manifest = cache.load_manifest(FORECAST_CORPUS_ID, version)

    cache_missing = manifest is None
    if cache_missing:
        notice = (
            "FOReCAst full cache not found; using the 3-record deterministic preview. "
            "Prepare the corpus cache first to run large-scale validation."
        )
    elif manifest.metadata.get("import_method") == "fixture":
        notice = (
            "FOReCAst cache currently contains the deterministic preview only. "
            "Refresh the cache with the external dataset before relying on large-scale counts."
        )
    else:
        notice = None

    if notice is not None:
        # Issued from this frame so the hard-coded stacklevel keeps its meaning.
        warnings.warn(notice, UserWarning, stacklevel=4)

    if cache_missing:
        manifest = importer.import_corpus(data_dir, version=version)
        cache.save_manifest(manifest)

    return importer.load_from_manifest(manifest, data_dir)
181
+
182
def _resolve_cache_root(cache_root: Optional[Path] = None) -> Path:
    """Resolve the directory used for the offline corpus cache.

    Resolution order:

    1. An explicit ``cache_root`` argument, returned unchanged.
    2. The ``XRTM_CORPUS_CACHE`` environment variable, when set to a non-empty
       value; a leading ``~`` is expanded to the user's home directory.
    3. ``~/.xrtm/corpus-cache`` under the current user's home directory.

    Args:
        cache_root: Caller-supplied cache directory, or ``None`` to fall back
            to the environment variable and then the default location.

    Returns:
        The cache root as a ``Path``.
    """
    if cache_root is not None:
        return cache_root

    # An empty variable is treated as unset: ``Path("")`` is the current
    # working directory, which would scatter cache files wherever the process
    # happens to run.
    configured = os.environ.get("XRTM_CORPUS_CACHE")
    if configured:
        return Path(configured).expanduser()

    # Only consult the home directory when it is actually needed, so a
    # configured cache path keeps working where no home can be resolved.
    return Path.home() / ".xrtm" / "corpus-cache"