colliderml 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- colliderml-0.1.1/LICENSE +21 -0
- colliderml-0.1.1/MANIFEST.in +8 -0
- colliderml-0.1.1/PKG-INFO +138 -0
- colliderml-0.1.1/README.md +94 -0
- colliderml-0.1.1/colliderml/__init__.py +8 -0
- colliderml-0.1.1/colliderml/cli.py +108 -0
- colliderml-0.1.1/colliderml/core/__init__.py +7 -0
- colliderml-0.1.1/colliderml/core/data/__init__.py +21 -0
- colliderml-0.1.1/colliderml/core/data/config.py +104 -0
- colliderml-0.1.1/colliderml/core/data/dataset.py +45 -0
- colliderml-0.1.1/colliderml/core/data/manifest.py +139 -0
- colliderml-0.1.1/colliderml/core/get.py +45 -0
- colliderml-0.1.1/colliderml/core/io/__init__.py +5 -0
- colliderml-0.1.1/colliderml/core/io/downloader.py +226 -0
- colliderml-0.1.1/colliderml/utils/__init__.py +3 -0
- colliderml-0.1.1/colliderml.egg-info/PKG-INFO +138 -0
- colliderml-0.1.1/colliderml.egg-info/SOURCES.txt +41 -0
- colliderml-0.1.1/colliderml.egg-info/dependency_links.txt +1 -0
- colliderml-0.1.1/colliderml.egg-info/entry_points.txt +2 -0
- colliderml-0.1.1/colliderml.egg-info/requires.txt +14 -0
- colliderml-0.1.1/colliderml.egg-info/top_level.txt +1 -0
- colliderml-0.1.1/docs/.vitepress/cache/deps/chunk-VJWGEPT5.js +12542 -0
- colliderml-0.1.1/docs/.vitepress/cache/deps/chunk-VJWGEPT5.js.map +7 -0
- colliderml-0.1.1/docs/.vitepress/cache/deps/vitepress___@vue_devtools-api.js +163 -0
- colliderml-0.1.1/docs/.vitepress/cache/deps/vitepress___@vue_devtools-api.js.map +7 -0
- colliderml-0.1.1/docs/.vitepress/cache/deps/vitepress___@vueuse_core.js +9188 -0
- colliderml-0.1.1/docs/.vitepress/cache/deps/vitepress___@vueuse_core.js.map +7 -0
- colliderml-0.1.1/docs/.vitepress/cache/deps/vue.js +343 -0
- colliderml-0.1.1/docs/.vitepress/cache/deps/vue.js.map +7 -0
- colliderml-0.1.1/docs/.vitepress/components/AboutData.vue +88 -0
- colliderml-0.1.1/docs/.vitepress/components/DataConfig.vue +862 -0
- colliderml-0.1.1/docs/.vitepress/config.ts +78 -0
- colliderml-0.1.1/docs/.vitepress/shims-vue.d.ts +5 -0
- colliderml-0.1.1/docs/.vitepress/theme/custom.css +21 -0
- colliderml-0.1.1/docs/.vitepress/theme/index.ts +14 -0
- colliderml-0.1.1/docs/guide/introduction.md +48 -0
- colliderml-0.1.1/docs/index.md +60 -0
- colliderml-0.1.1/setup.cfg +33 -0
- colliderml-0.1.1/setup.py +50 -0
- colliderml-0.1.1/tests/core/io/test_downloader.py +145 -0
- colliderml-0.1.1/tests/core/io/test_downloader_integration.py +119 -0
- colliderml-0.1.1/tests/test_downloader.py +65 -0
colliderml-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 ColliderML Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
|
+
Name: colliderml
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: A modern machine learning library for high-energy physics data analysis
|
|
5
|
+
Home-page: https://github.com/murnanedaniel/colliderml
|
|
6
|
+
Author: Daniel Murnane
|
|
7
|
+
Author-email: dtmurnane@lbl.gov
|
|
8
|
+
Project-URL: Documentation, https://murnanedaniel.github.io/colliderml
|
|
9
|
+
Project-URL: Source, https://github.com/murnanedaniel/colliderml
|
|
10
|
+
Project-URL: Issues, https://github.com/murnanedaniel/colliderml/issues
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Science/Research
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Physics
|
|
17
|
+
Requires-Python: >=3.10,<3.12
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: requests>=2.31.0
|
|
21
|
+
Requires-Dist: tqdm>=4.66.0
|
|
22
|
+
Requires-Dist: numpy>=1.24.0
|
|
23
|
+
Requires-Dist: pydantic>=2.5.0
|
|
24
|
+
Requires-Dist: h5py>=3.10.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
28
|
+
Requires-Dist: black>=23.11.0; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff>=0.1.6; extra == "dev"
|
|
30
|
+
Requires-Dist: mypy>=1.7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: mkdocs-material>=9.4.0; extra == "dev"
|
|
32
|
+
Requires-Dist: mkdocstrings[python]>=0.24.0; extra == "dev"
|
|
33
|
+
Dynamic: author
|
|
34
|
+
Dynamic: author-email
|
|
35
|
+
Dynamic: classifier
|
|
36
|
+
Dynamic: description
|
|
37
|
+
Dynamic: description-content-type
|
|
38
|
+
Dynamic: home-page
|
|
39
|
+
Dynamic: project-url
|
|
40
|
+
Dynamic: provides-extra
|
|
41
|
+
Dynamic: requires-dist
|
|
42
|
+
Dynamic: requires-python
|
|
43
|
+
Dynamic: summary
|
|
44
|
+
|
|
45
|
+
# ColliderML
|
|
46
|
+
|
|
47
|
+
[Tests](https://github.com/murnanedaniel/colliderml/actions/workflows/tests.yml)
|
|
48
|
+

|
|
49
|
+
[Python 3.10–3.11](https://www.python.org/downloads/)
|
|
50
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
51
|
+
|
|
52
|
+
A modern machine learning library for high-energy physics data analysis.
|
|
53
|
+
|
|
54
|
+
## Features
|
|
55
|
+
|
|
56
|
+
- Efficient parallel data downloading with resume capability
|
|
57
|
+
- Support for common HEP data formats
|
|
58
|
+
- Machine learning utilities for particle physics
|
|
59
|
+
- Visualization tools for physics data
|
|
60
|
+
|
|
61
|
+
## Installation
|
|
62
|
+
|
|
63
|
+
### For Users
|
|
64
|
+
```bash
|
|
65
|
+
# Create and activate environment
|
|
66
|
+
conda create -n collider-env python=3.11 # 3.10 or 3.11 recommended
|
|
67
|
+
conda activate collider-env
|
|
68
|
+
|
|
69
|
+
# Install package
|
|
70
|
+
pip install colliderml
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### For Developers
|
|
74
|
+
```bash
|
|
75
|
+
# Create and activate environment
|
|
76
|
+
conda create -n collider-dev python=3.11 # 3.10 or 3.11 recommended
|
|
77
|
+
conda activate collider-dev
|
|
78
|
+
|
|
79
|
+
# Clone repository
|
|
80
|
+
git clone https://github.com/murnanedaniel/colliderml.git
|
|
81
|
+
cd colliderml
|
|
82
|
+
|
|
83
|
+
# Install in development mode with extra dependencies
|
|
84
|
+
pip install -e ".[dev]"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Quick Start
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from colliderml.core.data.manifest import ManifestClient
|
|
91
|
+
from colliderml.core.io import DataDownloader
|
|
92
|
+
|
|
93
|
+
manifest = ManifestClient()
|
|
94
|
+
files = manifest.select_files(campaign=None, datasets=["ttbar"], objects=["tracks"], max_events=1000)
|
|
95
|
+
|
|
96
|
+
downloader = DataDownloader()
|
|
97
|
+
results = downloader.download_files([f.path for f in files], local_dir="data", max_workers=4, resume=True)
|
|
98
|
+
|
|
99
|
+
for path, result in results.items():
|
|
100
|
+
print(path, result.success, result.error)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Features
|
|
104
|
+
|
|
105
|
+
- **Manifest-driven**: Always selects files from the latest portal manifest
|
|
106
|
+
- **Parallel Downloads**: Download multiple files concurrently
|
|
107
|
+
- **Resume Capability**: Optionally resume interrupted downloads
|
|
108
|
+
- **Progress Tracking**: Real-time progress bars
|
|
109
|
+
- **Clear Errors**: Helpful failure messages and HEAD checks
|
|
110
|
+
|
|
111
|
+
## Development
|
|
112
|
+
|
|
113
|
+
1. Activate your environment:
|
|
114
|
+
```bash
|
|
115
|
+
conda activate collider-dev
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
2. Run tests:
|
|
119
|
+
```bash
|
|
120
|
+
# Run unit tests only
|
|
121
|
+
pytest -v -m "not integration"
|
|
122
|
+
|
|
123
|
+
# Run all tests including integration tests
|
|
124
|
+
pytest -v
|
|
125
|
+
|
|
126
|
+
# Run with coverage report
|
|
127
|
+
pytest --cov=colliderml
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
3. Build documentation:
|
|
131
|
+
```bash
|
|
132
|
+
mkdocs build
|
|
133
|
+
mkdocs serve # View at http://127.0.0.1:8000
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
[MIT License](LICENSE)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# ColliderML
|
|
2
|
+
|
|
3
|
+
[Tests](https://github.com/murnanedaniel/colliderml/actions/workflows/tests.yml)
|
|
4
|
+

|
|
5
|
+
[Python 3.10–3.11](https://www.python.org/downloads/)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
A modern machine learning library for high-energy physics data analysis.
|
|
9
|
+
|
|
10
|
+
## Features
|
|
11
|
+
|
|
12
|
+
- Efficient parallel data downloading with resume capability
|
|
13
|
+
- Support for common HEP data formats
|
|
14
|
+
- Machine learning utilities for particle physics
|
|
15
|
+
- Visualization tools for physics data
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
### For Users
|
|
20
|
+
```bash
|
|
21
|
+
# Create and activate environment
|
|
22
|
+
conda create -n collider-env python=3.11 # 3.10 or 3.11 recommended
|
|
23
|
+
conda activate collider-env
|
|
24
|
+
|
|
25
|
+
# Install package
|
|
26
|
+
pip install colliderml
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### For Developers
|
|
30
|
+
```bash
|
|
31
|
+
# Create and activate environment
|
|
32
|
+
conda create -n collider-dev python=3.11 # 3.10 or 3.11 recommended
|
|
33
|
+
conda activate collider-dev
|
|
34
|
+
|
|
35
|
+
# Clone repository
|
|
36
|
+
git clone https://github.com/murnanedaniel/colliderml.git
|
|
37
|
+
cd colliderml
|
|
38
|
+
|
|
39
|
+
# Install in development mode with extra dependencies
|
|
40
|
+
pip install -e ".[dev]"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## Quick Start
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
from colliderml.core.data.manifest import ManifestClient
|
|
47
|
+
from colliderml.core.io import DataDownloader
|
|
48
|
+
|
|
49
|
+
manifest = ManifestClient()
|
|
50
|
+
files = manifest.select_files(campaign=None, datasets=["ttbar"], objects=["tracks"], max_events=1000)
|
|
51
|
+
|
|
52
|
+
downloader = DataDownloader()
|
|
53
|
+
results = downloader.download_files([f.path for f in files], local_dir="data", max_workers=4, resume=True)
|
|
54
|
+
|
|
55
|
+
for path, result in results.items():
|
|
56
|
+
print(path, result.success, result.error)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Features
|
|
60
|
+
|
|
61
|
+
- **Manifest-driven**: Always selects files from the latest portal manifest
|
|
62
|
+
- **Parallel Downloads**: Download multiple files concurrently
|
|
63
|
+
- **Resume Capability**: Optionally resume interrupted downloads
|
|
64
|
+
- **Progress Tracking**: Real-time progress bars
|
|
65
|
+
- **Clear Errors**: Helpful failure messages and HEAD checks
|
|
66
|
+
|
|
67
|
+
## Development
|
|
68
|
+
|
|
69
|
+
1. Activate your environment:
|
|
70
|
+
```bash
|
|
71
|
+
conda activate collider-dev
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
2. Run tests:
|
|
75
|
+
```bash
|
|
76
|
+
# Run unit tests only
|
|
77
|
+
pytest -v -m "not integration"
|
|
78
|
+
|
|
79
|
+
# Run all tests including integration tests
|
|
80
|
+
pytest -v
|
|
81
|
+
|
|
82
|
+
# Run with coverage report
|
|
83
|
+
pytest --cov=colliderml
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
3. Build documentation:
|
|
87
|
+
```bash
|
|
88
|
+
mkdocs build
|
|
89
|
+
mkdocs serve # View at http://127.0.0.1:8000
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## License
|
|
93
|
+
|
|
94
|
+
[MIT License](LICENSE)
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""ColliderML command line interface."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from colliderml.core.io import DataDownloader
|
|
8
|
+
from colliderml.core.data.manifest import ManifestClient
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _split_csv(raw):
    """Split a comma-separated CLI value into trimmed, non-empty names.

    Returns ``None`` when *raw* is empty/missing or contains no usable names,
    which the caller treats as "no filter".
    """
    if not raw:
        return None
    names = [piece.strip() for piece in raw.split(',')]
    # Drop blanks produced by stray commas or spaces (e.g. "ttbar, ,qcd,").
    return [name for name in names if name] or None


def get(args):
    """Handle the get command (manifest-driven).

    Selects files from the manifest according to the parsed CLI options,
    prints the effective configuration, downloads the selected files and
    prints a summary.

    Args:
        args: Parsed argparse namespace providing ``campaign``, ``datasets``,
            ``objects``, ``events``, ``version``, ``output_dir``, ``workers``
            and ``no_resume``.

    Exits the process with status 1 if manifest selection raises or if any
    download fails, and with status 0 if no files match the selection.
    """
    downloader = DataDownloader()
    manifest = ManifestClient()

    # "default" (or an empty value) means: let the manifest pick the campaign.
    campaign = args.campaign if args.campaign and args.campaign != "default" else None

    # Whitespace around names is trimmed so "ttbar, qcd" behaves like "ttbar,qcd".
    datasets = _split_csv(args.datasets)
    objects = _split_csv(args.objects)

    # Select files
    try:
        files = manifest.select_files(
            campaign=campaign,
            datasets=datasets,
            objects=objects,
            max_events=args.events,
            version=args.version,
        )
    except Exception as e:
        # Top-level CLI boundary: report the problem and exit non-zero.
        print(f"\nError reading manifest: {e}")
        sys.exit(1)

    if not files:
        print("No files matched the selection from the manifest.")
        sys.exit(0)

    print("\nGet Configuration:")
    print(f"Campaign: {args.campaign or 'default'}")
    print(f"Version: {args.version or 'dataset defaults'}")
    print(f"Datasets: {', '.join(datasets) if datasets else 'ALL'}")
    print(f"Objects: {', '.join(objects) if objects else 'ALL'}")
    print(f"Requested events: {args.events if args.events else 'ALL'}")
    print(f"Output directory: {args.output_dir}")

    # Download
    remote_paths = [f.path for f in files]
    results = downloader.download_files(
        remote_paths=remote_paths,
        local_dir=args.output_dir,
        max_workers=args.workers,
        resume=not args.no_resume,
    )

    successful = [r for r in results.values() if r.success]
    failed = [r for r in results.values() if not r.success]

    print("\nGet Summary:")
    print(f"Total files: {len(results)}")
    print(f"Successful: {len(successful)}")
    print(f"Failed: {len(failed)}")
    if failed:
        print("\nFailed downloads:")
        for path, result in results.items():
            if not result.success:
                print(f"✗ {path}: {result.error}")
        # Signal partial/total failure to the calling shell.
        sys.exit(1)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def main(argv=None):
    """Main entry point.

    Builds the argument parser, parses the command line and dispatches to
    the selected sub-command.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default,
            used by the console script), argparse reads ``sys.argv[1:]``.

    Exits with status 1 after printing help when no sub-command is given.
    """
    parser = argparse.ArgumentParser(description="ColliderML command line interface")
    subparsers = parser.add_subparsers(dest='command', help='Command to run')

    # Get command (manifest-driven)
    get_parser = subparsers.add_parser('get', help='Get files using manifest selection')
    get_parser.add_argument('-c', '--campaign', type=str, default='default',
                            help='Campaign name (or "default" to use manifest default)')
    get_parser.add_argument('-d', '--datasets', type=str,
                            help='Comma-separated list of datasets (e.g. ttbar,qcd)')
    get_parser.add_argument('-o', '--objects', type=str,
                            help='Comma-separated list of objects (e.g. tracks,hits)')
    get_parser.add_argument('-e', '--events', type=int, default=None,
                            help='Max number of events to download (across selection)')
    # Both hyphenated and underscored spellings are accepted for convenience.
    get_parser.add_argument('-O', '--output-dir', '--output_dir', dest='output_dir', type=str, default='data',
                            help='Directory to save downloaded files')
    get_parser.add_argument('-w', '--workers', type=int, default=4,
                            help='Number of parallel downloads')
    get_parser.add_argument('--no-resume', '--no_resume', dest='no_resume', action='store_true',
                            help='Disable resuming partial downloads')
    get_parser.add_argument('-v', '--version', type=str, default=None,
                            help='Dataset version to use (overrides dataset default_version)')

    args = parser.parse_args(argv)

    if args.command == 'get':
        get(args)
    else:
        # No (or unknown) sub-command: show usage and report failure.
        parser.print_help()
        sys.exit(1)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
if __name__ == '__main__':
|
|
106
|
+
main()
|
|
107
|
+
|
|
108
|
+
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Data handling functionality for ColliderML."""
|
|
2
|
+
|
|
3
|
+
from .dataset import Dataset
|
|
4
|
+
from .config import (
|
|
5
|
+
PileupLevel,
|
|
6
|
+
DataType,
|
|
7
|
+
OBJECT_CONFIGS,
|
|
8
|
+
VALID_PROCESSES,
|
|
9
|
+
BASE_URL,
|
|
10
|
+
MANIFEST_URL,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"Dataset",
|
|
15
|
+
"PileupLevel",
|
|
16
|
+
"DataType",
|
|
17
|
+
"OBJECT_CONFIGS",
|
|
18
|
+
"VALID_PROCESSES",
|
|
19
|
+
"BASE_URL",
|
|
20
|
+
"MANIFEST_URL",
|
|
21
|
+
]
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Configuration and lightweight enums/constants for ColliderML.
|
|
2
|
+
|
|
3
|
+
This module provides base URLs and minimal legacy constants while we migrate
|
|
4
|
+
to a manifest-driven data selection model.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import Dict, Set
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# Base URL configuration
|
|
14
|
+
BASE_URL: str = "https://portal.nersc.gov/cfs/m4958/ColliderML/"
|
|
15
|
+
MANIFEST_URL: str = f"{BASE_URL}manifest.json"
|
|
16
|
+
|
|
17
|
+
# Downloader base URLs list (kept for backward compatibility)
|
|
18
|
+
DEFAULT_URLS = [BASE_URL]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Legacy-style enums (kept to avoid breaking existing imports/tests)
|
|
22
|
+
class PileupLevel(str, Enum):
    """Legacy pileup-level identifiers, kept so existing imports keep working.

    Each member is also a ``str`` whose value is the identifier shown below.
    """

    # Single-particle samples
    SINGLE = "single-particle"
    # Pileup of 10
    LOW = "pileup-10"
    # Pileup of 200
    HIGH = "pileup-200"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DataType(str, Enum):
    """Legacy data-type identifiers used as path segments for object files.

    Each member is also a ``str`` whose value is the identifier shown below.
    """

    # Reconstructed objects
    RECO = "reco"
    # Truth-level objects
    TRUTH = "truth"
    # Measurement-level objects
    MEASUREMENTS = "measurements"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# Minimal object configuration placeholder to satisfy existing imports
|
|
35
|
+
class _ObjectConfig:
    """Minimal per-object configuration placeholder.

    Holds only the data type an object belongs to; it exists to satisfy
    existing imports that expect ``OBJECT_CONFIGS`` entries with a
    ``data_type`` attribute.
    """

    def __init__(self, data_type: DataType):
        """Store the data type associated with an object.

        Args:
            data_type: The data type this object is filed under.
        """
        self.data_type = data_type

    def __repr__(self) -> str:
        """Return a readable representation (useful when printing OBJECT_CONFIGS)."""
        return f"{type(self).__name__}(data_type={self.data_type!r})"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Representative objects; the authoritative list comes from the manifest
|
|
41
|
+
OBJECT_CONFIGS: Dict[str, _ObjectConfig] = {
|
|
42
|
+
"tracks": _ObjectConfig(DataType.RECO),
|
|
43
|
+
"particle_flow": _ObjectConfig(DataType.RECO),
|
|
44
|
+
"particles": _ObjectConfig(DataType.TRUTH),
|
|
45
|
+
"tracker_hits": _ObjectConfig(DataType.MEASUREMENTS),
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Representative physics processes; authoritative set comes from the manifest
|
|
50
|
+
VALID_PROCESSES: Set[str] = {"ttbar", "wjets", "zjets", "susy", "higgs", "qcd", "exotics"}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Legacy constants used by older CLI/tests; superseded by manifest data
|
|
54
|
+
EVENTS_PER_FILE: int = 1000
|
|
55
|
+
|
|
56
|
+
# Optional dataset sizes for summary planning (will be superseded by manifest totals)
|
|
57
|
+
DATASET_SIZES: Dict[str, int] = {
|
|
58
|
+
"ttbar": 100_000,
|
|
59
|
+
"qcd": 100_000,
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def get_object_path(
    pileup: PileupLevel,
    process: str,
    object_name: str,
    start_event: int,
    end_event: int,
) -> str:
    """Return the legacy-style relative path of an object file.

    Retained only for backward compatibility with existing tests and
    scripts; new code should take paths from the manifest.

    Args:
        pileup: Pileup level; a ``PileupLevel`` contributes its value, any
            other object is converted with ``str()``.
        process: Physics process name; must be in ``VALID_PROCESSES``.
        object_name: Object name; must be a key of ``OBJECT_CONFIGS``.
        start_event: First event number embedded in the file name.
        end_event: Last event number embedded in the file name.

    Returns:
        ``<pileup>/<process>/v1/<data_type>/<object>/<filename>.h5`` where the
        file name repeats those segments dot-separated plus the event range.

    Raises:
        ValueError: If ``process`` or ``object_name`` is not recognised.
    """
    # Guard clauses: deliberately lightweight validation.
    if process not in VALID_PROCESSES:
        raise ValueError("Invalid process")
    if object_name not in OBJECT_CONFIGS:
        raise ValueError("Invalid object type")

    # The version segment is fixed for the legacy path layout.
    version = "v1"
    data_type = OBJECT_CONFIGS[object_name].data_type.value

    if isinstance(pileup, PileupLevel):
        pileup_str = pileup.value
    else:
        pileup_str = str(pileup)

    directory = f"{pileup_str}/{process}/{version}/{data_type}/{object_name}"
    stem = f"{pileup_str}.{process}.{version}.{data_type}.{object_name}"
    return f"{directory}/{stem}.events{start_event}-{end_event}.h5"
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
__all__ = [
|
|
92
|
+
"BASE_URL",
|
|
93
|
+
"MANIFEST_URL",
|
|
94
|
+
"DEFAULT_URLS",
|
|
95
|
+
"PileupLevel",
|
|
96
|
+
"DataType",
|
|
97
|
+
"OBJECT_CONFIGS",
|
|
98
|
+
"VALID_PROCESSES",
|
|
99
|
+
"EVENTS_PER_FILE",
|
|
100
|
+
"DATASET_SIZES",
|
|
101
|
+
"get_object_path",
|
|
102
|
+
]
|
|
103
|
+
|
|
104
|
+
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Dataset class for handling HEP data."""
|
|
2
|
+
|
|
3
|
+
from typing import Optional, List, Dict, Any
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
class Dataset:
    """Base class for handling HEP datasets.

    A dataset is a named collection of file paths plus a free-form metadata
    dictionary.
    """

    def __init__(self, name: str, files: List[str]):
        """Initialize a dataset.

        Args:
            name: Name of the dataset.
            files: File paths in the dataset. Any iterable of strings is
                accepted; it is copied into a new list so later changes to
                the caller's sequence do not affect the dataset.
        """
        self.name = name
        # Copy to avoid aliasing the caller's list and to support len() on
        # inputs such as tuples or generators.
        self.files = list(files)
        self._metadata: Dict[str, Any] = {}

    def __len__(self) -> int:
        """Get the number of files in the dataset."""
        return len(self.files)

    def __repr__(self) -> str:
        """Return a short description with the name and file count."""
        return f"{type(self).__name__}(name={self.name!r}, n_files={len(self.files)})"

    def add_metadata(self, key: str, value: Any) -> None:
        """Add metadata to the dataset.

        An existing entry under the same key is overwritten.

        Args:
            key: Metadata key.
            value: Metadata value.
        """
        self._metadata[key] = value

    def get_metadata(self, key: str) -> Any:
        """Get metadata from the dataset.

        Args:
            key: Metadata key.

        Returns:
            The metadata value.

        Raises:
            KeyError: If the key doesn't exist.
        """
        return self._metadata[key]
|