lcmd-db 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lcmd_db/__init__.py +22 -0
- lcmd_db/client.py +189 -0
- lcmd_db/exceptions.py +13 -0
- lcmd_db/py.typed +0 -0
- lcmd_db/types.py +5 -0
- lcmd_db-0.1.0.dist-info/METADATA +78 -0
- lcmd_db-0.1.0.dist-info/RECORD +9 -0
- lcmd_db-0.1.0.dist-info/WHEEL +4 -0
- lcmd_db-0.1.0.dist-info/licenses/LICENSE +21 -0
lcmd_db/__init__.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""LCMD-DB - Python client for the LCMD molecular database.

This package module re-exports the public API defined in the sibling
modules (``load_dataset``, ``clear_cache``, ``DataFormat`` and the exception
classes) and exposes the installed distribution version as ``__version__``.
"""

from importlib.metadata import PackageNotFoundError, version

from .client import clear_cache, load_dataset
from .exceptions import DatasetNotFoundError, DownloadError, LCMDError
from .types import DataFormat

try:
    # Read the version from the installed "lcmd-db" distribution metadata.
    __version__ = version("lcmd-db")
except PackageNotFoundError:
    # No distribution metadata is available for "lcmd-db": use a placeholder.
    __version__ = "0.0.0"

# Names exported by ``from lcmd_db import *``.
__all__ = [
    "__version__",
    "load_dataset",
    "clear_cache",
    "DataFormat",
    "LCMDError",
    "DatasetNotFoundError",
    "DownloadError",
]
|
lcmd_db/client.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""LCMD-DB client for downloading datasets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import platformdirs
|
|
9
|
+
import polars as pl
|
|
10
|
+
import pooch
|
|
11
|
+
|
|
12
|
+
from .exceptions import DatasetNotFoundError, DownloadError
|
|
13
|
+
from .types import DataFormat
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
BASE_URL = "https://lcmd-app.epfl.ch/api/v1/molecules/download/zip"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _get_default_cache_dir() -> Path:
    """Return the default cache directory used by the LCMD-DB client.

    The directory is the ``client`` sub-folder of the per-user cache
    directory that ``platformdirs`` reports for the ``"lcmd-db"`` application.
    """
    user_cache_root = Path(platformdirs.user_cache_dir("lcmd-db"))
    return user_cache_root / "client"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def load_dataset(
    subset: str,
    *,
    data_format: DataFormat = "parquet",
    include_structures: bool = False,
    cache_dir: str | Path | None = None,
    force_download: bool = False,
) -> pl.DataFrame:
    """Load a dataset from the LCMD database.

    Downloads the dataset archive, extracts it into the cache directory and
    reads the tabular data file. Subsequent calls reuse the cached archive
    unless ``force_download`` is True, in which case the cached archive is
    removed first so that it is fetched again.

    Args:
        subset: The subset slug (e.g., "qm9", "spahm_l11").
        data_format: Format for tabular data (csv, tsv, xlsx, parquet, json).
            Defaults to "parquet" for best performance.
        include_structures: Whether to download XYZ structure files.
            When True, a "structure_path" column is added to the DataFrame
            containing the full path to each molecule's XYZ file.
        cache_dir: Custom cache directory. Defaults to the "client" folder
            inside the platform-specific user cache directory for "lcmd-db"
            (as reported by ``platformdirs``).
        force_download: If True, re-download even if cached. Defaults to False.

    Returns:
        Polars DataFrame containing the molecule data with all properties.
        If include_structures=True, includes a "structure_path" column.

    Raises:
        DatasetNotFoundError: If the subset doesn't exist.
        DownloadError: If the download fails, if a stale cached archive
            cannot be removed, or if nothing was extracted.

    Examples:
        >>> from lcmd_db import load_dataset
        >>> df = load_dataset("spahm_l11")
        >>> print(df.head())

        >>> # With structures - adds structure_path column
        >>> df = load_dataset("qm9", include_structures=True)
        >>> print(df["structure_path"][0])  # /path/to/cache/structures/123.xyz

        >>> # Force re-download
        >>> df = load_dataset("spahm_l11", force_download=True)
    """
    # Function-scope import, matching the local-import style of clear_cache.
    from urllib.parse import urlencode

    cache_dir = Path(cache_dir) if cache_dir is not None else _get_default_cache_dir()

    # Encode the query so that unusual characters in the slug cannot corrupt
    # the request URL.
    query = urlencode(
        {
            "subset": subset,
            "data_format": data_format,
            "include_structures": str(include_structures).lower(),
        }
    )
    url = f"{BASE_URL}?{query}"

    # Name of the cached archive; the structures variant is cached separately.
    variant_suffix = "_with_structures" if include_structures else ""
    fname = f"{subset}_{data_format}{variant_suffix}.zip"

    if force_download:
        # With known_hash=None pooch reuses an existing file without contacting
        # the server, so the cached archive must be removed to force a refetch.
        archive_path = cache_dir / fname
        logger.info("Forcing re-download of %s", archive_path)
        try:
            archive_path.unlink(missing_ok=True)
        except OSError as e:
            raise DownloadError(
                f"Failed to remove cached archive '{archive_path}': {e}"
            ) from e

    try:
        file_paths = pooch.retrieve(
            url,
            known_hash=None,
            fname=fname,
            path=cache_dir,
            progressbar=True,
            processor=pooch.Unzip(extract_dir=f"{subset}_{data_format}"),
        )
    except Exception as e:
        # Prefer the HTTP status code when the exception carries a response
        # object; fall back to inspecting the message text otherwise.
        status_code = getattr(getattr(e, "response", None), "status_code", None)
        error_msg = str(e).lower()
        if status_code == 404 or "404" in error_msg or "not found" in error_msg:
            raise DatasetNotFoundError(f"Subset '{subset}' not found") from e
        raise DownloadError(f"Failed to download dataset: {e}") from e

    if not file_paths:
        raise DownloadError(f"No files were extracted for subset '{subset}'")

    # The data file is looked up next to the first extracted file.
    data_dir = Path(file_paths[0]).parent

    structures_dir = data_dir / "structures" if include_structures else None
    return _load_dataframe(data_dir, data_format, structures_dir)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def _load_dataframe(
    data_dir: Path,
    data_format: DataFormat,
    structures_dir: Path | None = None,
) -> pl.DataFrame:
    """Load a DataFrame from the extracted data directory.

    Args:
        data_dir: Directory expected to contain ``molecules.<data_format>``.
        data_format: One of "parquet", "csv", "tsv", "xlsx" or "json"; it
            selects both the file extension and the Polars reader.
        structures_dir: Directory holding the XYZ files. When given and
            present on disk, a "structure_path" column is added with the
            value ``<structures_dir>/<id>.xyz`` for every row.

    Returns:
        The loaded DataFrame, with the optional "structure_path" column.

    Raises:
        KeyError: If ``data_format`` is not one of the supported formats.
        DownloadError: If the expected data file does not exist.
    """
    import os

    # The format name doubles as the file extension.
    readers = {
        "parquet": pl.read_parquet,
        "csv": pl.read_csv,
        "tsv": lambda source: pl.read_csv(source, separator="\t"),
        "xlsx": pl.read_excel,
        "json": pl.read_json,
    }
    read_table = readers[data_format]

    data_file = data_dir / f"molecules.{data_format}"
    if not data_file.exists():
        raise DownloadError(f"Data file not found: {data_file}")

    df = read_table(data_file)

    if structures_dir is None:
        return df

    if not structures_dir.exists():
        # Structures were requested but are not on disk; the column is left
        # out, so make that visible instead of skipping silently.
        logger.warning(
            "Structures directory %s not found; 'structure_path' column not added",
            structures_dir,
        )
        return df

    # Build the paths with a native Polars string expression rather than a
    # per-row Python callback; null ids stay null.
    path_prefix = f"{structures_dir}{os.sep}"
    return df.with_columns(
        pl.concat_str(
            [pl.lit(path_prefix), pl.col("id").cast(pl.Utf8), pl.lit(".xyz")]
        ).alias("structure_path")
    )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def clear_cache(
    subset: str | None = None,
    cache_dir: str | Path | None = None,
) -> None:
    """Clear the local cache.

    Args:
        subset: If provided, only clear cache for this subset.
            If None, clear the entire cache (the cache directory itself is
            removed).
        cache_dir: Custom cache directory. Defaults to the client's default
            cache directory.

    Only entries that belong to exactly this subset are removed: names of the
    form ``<subset>_<format>`` optionally followed by ``_with_structures``
    and/or ``.zip``, where ``<format>`` is one of csv, tsv, xlsx, parquet or
    json. Entries of other subsets whose slug merely starts with
    ``<subset>_`` are left untouched. Does nothing if the cache directory
    does not exist.
    """
    import re
    import shutil

    cache_dir = Path(cache_dir) if cache_dir is not None else _get_default_cache_dir()

    if not cache_dir.exists():
        return

    if subset is None:
        shutil.rmtree(cache_dir)
        return

    # A plain "<subset>_" prefix test would also hit e.g. "qm9_extended_*"
    # when clearing "qm9", so match the full cache-entry name instead.
    entry_pattern = re.compile(
        re.escape(subset)
        + r"_(?:csv|tsv|xlsx|parquet|json)(?:_with_structures)?(?:\.zip)?"
    )

    for item in cache_dir.iterdir():
        if entry_pattern.fullmatch(item.name) is None:
            continue
        if item.is_dir():
            shutil.rmtree(item)
        else:
            item.unlink()
|
lcmd_db/exceptions.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Exceptions for LCMD-DB client."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class LCMDError(Exception):
    """Common base class for the exceptions defined by the LCMD client."""
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DatasetNotFoundError(LCMDError):
    """Signals that the requested dataset/subset does not exist."""
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DownloadError(LCMDError):
    """Signals that downloading a dataset failed."""
|
lcmd_db/py.typed
ADDED
|
File without changes
|
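lcmd_db/types.py
ADDED
|
[the 5 added lines of lcmd_db/types.py are not reproduced in this rendering]
|
lcmd_db-0.1.0.dist-info/METADATA
ADDED
|
@@ -0,0 +1,78 @@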
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lcmd-db
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python client for the LCMD molecular database
|
|
5
|
+
Project-URL: Homepage, https://lcmd-app.epfl.ch
|
|
6
|
+
Project-URL: Documentation, https://lcmd-app.epfl.ch/docs
|
|
7
|
+
Project-URL: Source, https://github.com/lcmd-epfl/db
|
|
8
|
+
Project-URL: Changelog, https://github.com/lcmd-epfl/db/blob/master/CHANGELOG.md
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/lcmd-epfl/db/issues
|
|
10
|
+
Author-email: Romain Graux <author@romaingrx.com>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: chemistry,lcmd,molecular-database,quantum-chemistry
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Science/Research
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Scientific/Engineering :: Chemistry
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Requires-Dist: platformdirs>=4.0.0
|
|
27
|
+
Requires-Dist: polars>=0.20.0
|
|
28
|
+
Requires-Dist: pooch>=1.8.0
|
|
29
|
+
Requires-Dist: pyarrow>=14.0.0
|
|
30
|
+
Requires-Dist: tqdm>=4.67.1
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# LCMD-DB
|
|
34
|
+
|
|
35
|
+
Python client for the [LCMD molecular database](https://lcmd-app.epfl.ch).
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uv add lcmd-db
|
|
41
|
+
# or
|
|
42
|
+
pip install lcmd-db
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Usage
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from lcmd_db import load_dataset
|
|
49
|
+
|
|
50
|
+
# Load a dataset
|
|
51
|
+
df = load_dataset("spahm_l11")
|
|
52
|
+
|
|
53
|
+
# Load with XYZ structures (adds structure_path column)
|
|
54
|
+
df = load_dataset("spahm_l11", include_structures=True)
|
|
55
|
+
print(df["structure_path"][0])
|
|
56
|
+
|
|
57
|
+
# Force re-download (bypass cache)
|
|
58
|
+
df = load_dataset("spahm_l11", force_download=True)
|
|
59
|
+
|
|
60
|
+
# Clear cache
|
|
61
|
+
from lcmd_db import clear_cache
|
|
62
|
+
clear_cache() # Clear all
|
|
63
|
+
clear_cache("spahm_l11") # Clear specific dataset
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Options
|
|
67
|
+
|
|
68
|
+
| Parameter | Description | Default |
|
|
69
|
+
|-----------|-------------|---------|
|
|
70
|
+
| `subset` | Dataset slug (e.g., "spahm_l11", "qm9") | required |
|
|
71
|
+
| `data_format` | Output format: "parquet", "csv", "json", "xlsx", "tsv" | "parquet" |
|
|
72
|
+
| `include_structures` | Download XYZ files and add `structure_path` column | False |
|
|
73
|
+
| `cache_dir` | Custom cache directory | OS-dependent |
|
|
74
|
+
| `force_download` | Bypass cache and re-download | False |
|
|
75
|
+
|
|
76
|
+
## Available Datasets
|
|
77
|
+
|
|
78
|
+
Browse datasets at [lcmd-app.epfl.ch](https://lcmd-app.epfl.ch).
|
lcmd_db-0.1.0.dist-info/RECORD
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
lcmd_db/__init__.py,sha256=4HrlGQZcpHPl-N05sdrlviwYtAnZH0DPiBDtZxGamFc,528
|
|
2
|
+
lcmd_db/client.py,sha256=5yVA6wvvH1D9h0Z99NIeK7wDfKboX-cSUthgpzK94Z8,5940
|
|
3
|
+
lcmd_db/exceptions.py,sha256=ApDnwJXsOIlyTZ0ONrz45Bf6hYXhLgf7KiIJgx222ow,276
|
|
4
|
+
lcmd_db/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
lcmd_db/types.py,sha256=QzrNDik97yie7EmZGkrTlj-uWVl1zCYJpN8E2cSmItw,134
|
|
6
|
+
lcmd_db-0.1.0.dist-info/METADATA,sha256=9y75BTTHqnUjyS6xTN32_Njrv1AUmzkLJevpxUP5Ee8,2458
|
|
7
|
+
lcmd_db-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
8
|
+
lcmd_db-0.1.0.dist-info/licenses/LICENSE,sha256=QNqPk1KEz4WO2_FwG54Vu4-LA98Y89imxiw8-tiR7lA,1081
|
|
9
|
+
lcmd_db-0.1.0.dist-info/RECORD,,
|
lcmd_db-0.1.0.dist-info/licenses/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Romain Graux, LCMD, EPFL
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|