lcmd-db 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lcmd_db/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ """LCMD-DB - Python client for the LCMD molecular database."""
2
+
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
# Start from the placeholder version and overwrite it when the metadata of an
# installed "lcmd-db" distribution can be read.
__version__ = "0.0.0"
try:
    __version__ = version("lcmd-db")
except PackageNotFoundError:
    # No installed distribution with that name: keep the placeholder.
    pass
9
+
10
+ from .client import clear_cache, load_dataset
11
+ from .exceptions import DatasetNotFoundError, DownloadError, LCMDError
12
+ from .types import DataFormat
13
+
14
+ __all__ = [
15
+ "__version__",
16
+ "load_dataset",
17
+ "clear_cache",
18
+ "DataFormat",
19
+ "LCMDError",
20
+ "DatasetNotFoundError",
21
+ "DownloadError",
22
+ ]
lcmd_db/client.py ADDED
@@ -0,0 +1,189 @@
1
+ """LCMD-DB client for downloading datasets."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from pathlib import Path
7
+
8
+ import platformdirs
9
+ import polars as pl
10
+ import pooch
11
+
12
+ from .exceptions import DatasetNotFoundError, DownloadError
13
+ from .types import DataFormat
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ BASE_URL = "https://lcmd-app.epfl.ch/api/v1/molecules/download/zip"
18
+
19
+
20
def _get_default_cache_dir() -> Path:
    """Return the cache location used when the caller passes no ``cache_dir``.

    The result is the ``client`` subfolder of the user cache directory that
    ``platformdirs`` reports for the ``lcmd-db`` application.
    """
    user_cache_root = platformdirs.user_cache_dir("lcmd-db")
    return Path(user_cache_root, "client")
23
+
24
+
25
def load_dataset(
    subset: str,
    *,
    data_format: DataFormat = "parquet",
    include_structures: bool = False,
    cache_dir: str | Path | None = None,
    force_download: bool = False,
) -> pl.DataFrame:
    """Load a dataset from the LCMD database.

    Downloads the dataset archive, extracts it into the cache directory and
    reads the tabular file into a DataFrame. Subsequent calls reuse the
    cached archive unless ``force_download`` is True.

    Args:
        subset: The subset slug (e.g., "qm9", "spahm_l11").
        data_format: Format for tabular data (csv, tsv, xlsx, parquet, json).
            Defaults to "parquet".
        include_structures: Whether to download XYZ structure files.
            When True, a "structure_path" column is added to the DataFrame
            containing the full path to each molecule's XYZ file.
        cache_dir: Custom cache directory. Defaults to the ``client``
            subfolder of the platform's user cache directory for ``lcmd-db``
            (e.g. ``~/.cache/lcmd-db/client`` on Linux).
        force_download: If True, discard the cached archive for this
            subset/format combination and download it again.

    Returns:
        Polars DataFrame containing the molecule data with all properties.
        If include_structures=True, includes a "structure_path" column.

    Raises:
        DatasetNotFoundError: If the subset doesn't exist.
        DownloadError: If the download fails or the archive yields no files.

    Examples:
        >>> from lcmd_db import load_dataset
        >>> df = load_dataset("spahm_l11")
        >>> print(df.head())

        >>> # With structures - adds structure_path column
        >>> df = load_dataset("qm9", include_structures=True)
        >>> print(df["structure_path"][0])  # /path/to/cache/structures/123.xyz

        >>> # Force re-download
        >>> df = load_dataset("spahm_l11", force_download=True)
    """
    from urllib.parse import urlencode

    cache_dir = Path(_get_default_cache_dir() if cache_dir is None else cache_dir)

    # Encode the query so that slugs containing reserved characters cannot
    # corrupt the request; plain slugs produce the same URL as before.
    query = urlencode(
        {
            "subset": subset,
            "data_format": data_format,
            "include_structures": str(include_structures).lower(),
        }
    )
    url = f"{BASE_URL}?{query}"

    # Archive name inside the cache; it encodes every request parameter.
    structures_suffix = "_with_structures" if include_structures else ""
    fname = f"{subset}_{data_format}{structures_suffix}.zip"

    try:
        if force_download:
            # With known_hash=None pooch reuses any file that already exists
            # under this name, so the only way to force a fresh download is to
            # remove the cached archive first. Pooch then reports a
            # "download" action and the Unzip processor re-extracts it.
            (cache_dir / fname).unlink(missing_ok=True)

        file_paths = pooch.retrieve(
            url,
            known_hash=None,
            fname=fname,
            path=cache_dir,
            progressbar=True,
            processor=pooch.Unzip(extract_dir=f"{subset}_{data_format}"),
        )
    except Exception as e:
        # The underlying error type depends on the downloader, so a missing
        # subset is recognised from the error text.
        error_msg = str(e).lower()
        if "404" in error_msg or "not found" in error_msg:
            raise DatasetNotFoundError(f"Subset '{subset}' not found") from e
        raise DownloadError(f"Failed to download dataset: {e}") from e

    if not file_paths:
        raise DownloadError(f"No files were extracted for subset '{subset}'")

    # The data file is looked up next to the first extracted file.
    data_dir = Path(file_paths[0]).parent

    structures_dir = data_dir / "structures" if include_structures else None
    return _load_dataframe(data_dir, data_format, structures_dir)
115
+
116
+
117
def _load_dataframe(
    data_dir: Path,
    data_format: DataFormat,
    structures_dir: Path | None = None,
) -> pl.DataFrame:
    """Read ``molecules.<data_format>`` from ``data_dir`` into a DataFrame.

    Args:
        data_dir: Directory holding the extracted archive contents.
        data_format: Key selecting both the file extension and the reader.
        structures_dir: Directory of XYZ files. When it is given and exists,
            a "structure_path" column is derived from the "id" column.

    Returns:
        The loaded DataFrame, with "structure_path" appended when applicable.

    Raises:
        KeyError: If ``data_format`` is not one of the supported formats.
        DownloadError: If the expected data file is missing.
    """
    parsers = {
        "parquet": lambda path: pl.read_parquet(path),
        "csv": lambda path: pl.read_csv(path),
        "tsv": lambda path: pl.read_csv(path, separator="\t"),
        "xlsx": lambda path: pl.read_excel(path),
        "json": lambda path: pl.read_json(path),
    }
    # Look the reader up first so an unsupported format fails before the
    # filesystem is consulted. The format key doubles as the file extension.
    parse = parsers[data_format]

    table_path = data_dir / f"molecules.{data_format}"
    if not table_path.exists():
        raise DownloadError(f"Data file not found: {table_path}")

    frame = parse(table_path)

    # Without an existing structures directory the table is returned as read.
    if structures_dir is None or not structures_dir.exists():
        return frame

    def _xyz_path(mol_id) -> str:
        # Each molecule's structure file is named after its "id" value.
        return str(structures_dir / f"{mol_id}.xyz")

    path_column = pl.col("id").map_elements(_xyz_path, return_dtype=pl.Utf8)
    return frame.with_columns(path_column.alias("structure_path"))
158
+
159
+
160
def clear_cache(
    subset: str | None = None,
    cache_dir: str | Path | None = None,
) -> None:
    """Clear the local cache.

    Args:
        subset: If provided, only clear cache entries belonging to exactly
            this subset (every data format, with or without structures).
            If None, remove the entire cache directory.
        cache_dir: Custom cache directory. Defaults to the directory used by
            ``load_dataset`` when no ``cache_dir`` is given.

    Returns:
        None. A cache directory that does not exist is left untouched.
    """
    import re
    import shutil

    if cache_dir is None:
        cache_dir = _get_default_cache_dir()
    cache_dir = Path(cache_dir)

    if not cache_dir.exists():
        return

    if subset is None:
        shutil.rmtree(cache_dir)
        return

    # Cache entries are named "<subset>_<format>" (extracted directory),
    # "<subset>_<format>.zip" and "<subset>_<format>_with_structures.zip".
    # Format names contain no underscore or dot, so requiring a single such
    # token after the slug keeps clear_cache("spahm") from also deleting the
    # entries of a different subset such as "spahm_l11".
    entry_pattern = re.compile(
        rf"{re.escape(subset)}_[^_.]+(?:(?:_with_structures)?\.zip)?"
    )

    for item in cache_dir.iterdir():
        if entry_pattern.fullmatch(item.name) is None:
            continue
        if item.is_dir():
            shutil.rmtree(item)
        else:
            item.unlink()
lcmd_db/exceptions.py ADDED
@@ -0,0 +1,13 @@
1
+ """Exceptions for LCMD-DB client."""
2
+
3
+
4
class LCMDError(Exception):
    """Root of the LCMD client exception hierarchy.

    Catching this type also catches every more specific error that derives
    from it.
    """
6
+
7
+
8
class DatasetNotFoundError(LCMDError):
    """Signals that the requested dataset/subset does not exist."""
10
+
11
+
12
class DownloadError(LCMDError):
    """Signals that a download did not succeed."""
lcmd_db/py.typed ADDED
File without changes
lcmd_db/types.py ADDED
@@ -0,0 +1,5 @@
1
+ """Type definitions for LCMD-DB client."""
2
+
3
+ from typing import Literal
4
+
5
# Closed set of tabular file format names, expressed as a Literal type.
DataFormat = Literal[
    "csv",
    "tsv",
    "xlsx",
    "parquet",
    "json",
]
@@ -0,0 +1,78 @@
1
+ Metadata-Version: 2.4
2
+ Name: lcmd-db
3
+ Version: 0.1.0
4
+ Summary: Python client for the LCMD molecular database
5
+ Project-URL: Homepage, https://lcmd-app.epfl.ch
6
+ Project-URL: Documentation, https://lcmd-app.epfl.ch/docs
7
+ Project-URL: Source, https://github.com/lcmd-epfl/db
8
+ Project-URL: Changelog, https://github.com/lcmd-epfl/db/blob/master/CHANGELOG.md
9
+ Project-URL: Bug Tracker, https://github.com/lcmd-epfl/db/issues
10
+ Author-email: Romain Graux <author@romaingrx.com>
11
+ License: MIT
12
+ License-File: LICENSE
13
+ Keywords: chemistry,lcmd,molecular-database,quantum-chemistry
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Science/Research
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.9
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
24
+ Classifier: Typing :: Typed
25
+ Requires-Python: >=3.9
26
+ Requires-Dist: platformdirs>=4.0.0
27
+ Requires-Dist: polars>=0.20.0
28
+ Requires-Dist: pooch>=1.8.0
29
+ Requires-Dist: pyarrow>=14.0.0
30
+ Requires-Dist: tqdm>=4.67.1
31
+ Description-Content-Type: text/markdown
32
+
33
+ # LCMD-DB
34
+
35
+ Python client for the [LCMD molecular database](https://lcmd-app.epfl.ch).
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ uv add lcmd-db
41
+ # or
42
+ pip install lcmd-db
43
+ ```
44
+
45
+ ## Usage
46
+
47
+ ```python
48
+ from lcmd_db import load_dataset
49
+
50
+ # Load a dataset
51
+ df = load_dataset("spahm_l11")
52
+
53
+ # Load with XYZ structures (adds structure_path column)
54
+ df = load_dataset("spahm_l11", include_structures=True)
55
+ print(df["structure_path"][0])
56
+
57
+ # Force re-download (bypass cache)
58
+ df = load_dataset("spahm_l11", force_download=True)
59
+
60
+ # Clear cache
61
+ from lcmd_db import clear_cache
62
+ clear_cache() # Clear all
63
+ clear_cache("spahm_l11") # Clear specific dataset
64
+ ```
65
+
66
+ ## Options
67
+
68
+ | Parameter | Description | Default |
69
+ |-----------|-------------|---------|
70
+ | `subset` | Dataset slug (e.g., "spahm_l11", "qm9") | required |
71
+ | `data_format` | Output format: "parquet", "csv", "json", "xlsx", "tsv" | "parquet" |
72
+ | `include_structures` | Download XYZ files and add `structure_path` column | False |
73
+ | `cache_dir` | Custom cache directory | OS-dependent |
74
+ | `force_download` | Bypass cache and re-download | False |
75
+
76
+ ## Available Datasets
77
+
78
+ Browse datasets at [lcmd-app.epfl.ch](https://lcmd-app.epfl.ch).
@@ -0,0 +1,9 @@
1
+ lcmd_db/__init__.py,sha256=4HrlGQZcpHPl-N05sdrlviwYtAnZH0DPiBDtZxGamFc,528
2
+ lcmd_db/client.py,sha256=5yVA6wvvH1D9h0Z99NIeK7wDfKboX-cSUthgpzK94Z8,5940
3
+ lcmd_db/exceptions.py,sha256=ApDnwJXsOIlyTZ0ONrz45Bf6hYXhLgf7KiIJgx222ow,276
4
+ lcmd_db/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ lcmd_db/types.py,sha256=QzrNDik97yie7EmZGkrTlj-uWVl1zCYJpN8E2cSmItw,134
6
+ lcmd_db-0.1.0.dist-info/METADATA,sha256=9y75BTTHqnUjyS6xTN32_Njrv1AUmzkLJevpxUP5Ee8,2458
7
+ lcmd_db-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
8
+ lcmd_db-0.1.0.dist-info/licenses/LICENSE,sha256=QNqPk1KEz4WO2_FwG54Vu4-LA98Y89imxiw8-tiR7lA,1081
9
+ lcmd_db-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Romain Graux, LCMD, EPFL
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.