hscida 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hscida/__init__.py +77 -0
- hscida-0.1.0.dist-info/METADATA +53 -0
- hscida-0.1.0.dist-info/RECORD +4 -0
- hscida-0.1.0.dist-info/WHEEL +4 -0
hscida/__init__.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from dataclasses import dataclass, field
|
|
2
|
+
from typing import Any, cast
|
|
3
|
+
from hereutil import here
|
|
4
|
+
import narwhals as nw
|
|
5
|
+
import duckdb
|
|
6
|
+
import os
|
|
7
|
+
import polars as pl
|
|
8
|
+
|
|
9
|
+
from dotenv import dotenv_values
|
|
10
|
+
|
|
11
|
+
# DuckDB settings used whenever no explicit ``duckdb_config`` is supplied.
_DEFAULT_DUCKDB_CONFIG = {
    "parquet_metadata_cache": "true",
    "preserve_insertion_order": "false",
    "enable_fsst_vectors": "true",
}

# Value returned by ``hereutil.here()``, kept as a plain string.
_PROJROOT = str(here())
|
|
13
|
+
|
|
14
|
+
@dataclass
class DataAccessConfig:
    """Settings consumed by ``DataAccess``."""

    # Text appended to ``"FROM "`` and executed as SQL to list the files of a
    # dataset; formatted with ``str.format(dataset=..., projroot=...)`` first.
    glob_pattern: str
    # SQL executed once on the freshly opened connection.
    init_sql: str
    # Passed as ``config`` to ``duckdb.connect``. The factory returns a copy so
    # that mutating one instance's settings never alters the module defaults
    # (or any other instance created without an explicit value).
    duckdb_config: dict[str, Any] = field(
        default_factory=lambda: dict(_DEFAULT_DUCKDB_CONFIG)
    )
    # Substituted for ``{projroot}`` in ``glob_pattern``.
    projroot: str = _PROJROOT
|
|
20
|
+
|
|
21
|
+
def config_from_env() -> DataAccessConfig:
    """Build a ``DataAccessConfig`` from ``.env`` values and ``os.environ``.

    Process environment variables take precedence over ``.env`` entries.
    Recognised keys:

    * ``GLOB_PATTERN`` -- defaults to ``''``.
    * ``INIT_SQL`` -- defaults to ``''``.
    * ``DUCKDB_CONFIG`` -- comma-separated ``key=value`` pairs; when absent,
      a copy of the module default configuration is used.
    * ``PROJROOT`` -- defaults to the module-level project root.

    Returns:
        The assembled configuration.

    Raises:
        ValueError: If a non-empty ``DUCKDB_CONFIG`` entry has no ``=``.
    """
    # ``dotenv_values`` maps keys declared without a value to ``None``; drop
    # them so the ``.get`` defaults below apply instead of leaking ``None``.
    settings = {
        key: value for key, value in dotenv_values().items() if value is not None
    }
    settings.update(os.environ)

    if 'DUCKDB_CONFIG' in settings:
        duckdb_config: dict[str, Any] = {}
        for pair in settings['DUCKDB_CONFIG'].split(','):
            if not pair.strip():
                # Tolerate an empty value and stray/trailing commas.
                continue
            # Split on the first ``=`` only, so values may contain ``=``.
            key, separator, value = pair.partition('=')
            if not separator:
                raise ValueError(
                    f"Malformed DUCKDB_CONFIG entry {pair!r}: expected key=value"
                )
            duckdb_config[key.strip()] = value.strip()
    else:
        # Copy so callers can mutate the result without touching the defaults.
        duckdb_config = dict(_DEFAULT_DUCKDB_CONFIG)

    return DataAccessConfig(
        glob_pattern=settings.get('GLOB_PATTERN', ''),
        init_sql=settings.get('INIT_SQL', ''),
        duckdb_config=duckdb_config,
        projroot=settings.get('PROJROOT', _PROJROOT),
    )
|
|
32
|
+
|
|
33
|
+
class DataAccess:
    """Expose file-backed datasets as Narwhals lazy frames over DuckDB views.

    Attributes:
        con: The DuckDB connection opened with ``config.duckdb_config``.
        datasets: Cache of lazy frames already created by :meth:`f`, keyed by
            dataset name.
        config: The configuration this instance was built with.
    """

    def __init__(self, config: DataAccessConfig | None = None) -> None:
        """Open a DuckDB connection and run the configured init SQL.

        Args:
            config: Settings to use. When omitted, ``config_from_env()`` is
                called at construction time.
        """
        # Resolve the default here rather than in the signature: a call in a
        # default argument runs once at import, which would freeze the
        # environment seen by every later instance and share one config object.
        if config is None:
            config = config_from_env()
        self.con = duckdb.connect(config=config.duckdb_config)
        self.con.sql(config.init_sql)
        self.datasets: dict[str, nw.LazyFrame[duckdb.DuckDBPyRelation]] = {}
        self.config = config

    def register_files_as_view(self, table_name: str, *paths: str, replace: bool = False) -> None:
        """Create a view named ``table_name`` that reads the given files.

        The reader is ``read_parquet`` when the first path ends with
        ``.parquet`` and ``read_csv`` otherwise; ``hive_partitioning`` is
        enabled in both cases.

        Args:
            table_name: Name of the view; interpolated into the SQL as-is.
            *paths: Files to read. At least one is required.
            replace: Use ``CREATE OR REPLACE VIEW`` instead of
                ``CREATE VIEW IF NOT EXISTS``.

        Raises:
            IndexError: If no paths are given.
        """
        reader = 'parquet' if paths[0].endswith('.parquet') else 'csv'
        # Double embedded single quotes so such paths stay valid SQL literals.
        file_list = ', '.join("'" + path.replace("'", "''") + "'" for path in paths)
        create = 'CREATE OR REPLACE VIEW' if replace else 'CREATE VIEW IF NOT EXISTS'
        self.con.sql(
            f"{create} {table_name} AS FROM read_{reader}([{file_list}], hive_partitioning=true);"
        )

    def f(self, dataset: str, *paths: str, replace: bool = False, debug: bool = False) -> nw.LazyFrame[duckdb.DuckDBPyRelation]:
        """Return the lazy frame for ``dataset``, registering it on first use.

        Args:
            dataset: Dataset name; also used as the view name.
            *paths: Files backing the dataset. When omitted, they are looked
                up by running ``"FROM " + glob_pattern`` (formatted with
                ``dataset`` and ``projroot``) and taking the first column of
                every row.
            replace: Re-register the view even if the dataset is cached.
            debug: Print the paths found by the lookup.

        Returns:
            The cached or newly created lazy frame. NOTE: when the lookup
            finds no files a message is printed and ``None`` is returned,
            despite the annotated return type.
        """
        if dataset in self.datasets and not replace:
            return self.datasets[dataset]

        if not paths:
            # Format once; the result is used for both the query and the message.
            listing = self.config.glob_pattern.format(
                dataset=dataset, projroot=self.config.projroot
            )
            paths = tuple(row[0] for row in self.con.sql("FROM " + listing).fetchall())
            if debug:
                print(f"DEBUG: Found paths for dataset {dataset}: {paths}")
            if not paths:
                print(f"No files found for dataset {dataset} in {listing}")
                return cast(nw.LazyFrame[duckdb.DuckDBPyRelation], None)

        self.register_files_as_view(dataset, *paths, replace=replace)
        self.datasets[dataset] = nw.from_native(self.con.sql(f'FROM {dataset}'))
        return self.datasets[dataset]
|
|
55
|
+
|
|
56
|
+
# Short aliases for Narwhals expression builders: ``c`` is ``nw.col`` and
# ``l`` is ``nw.lit``.
c, l = nw.col, nw.lit  # noqa: E741
|
|
58
|
+
|
|
59
|
+
def to_narwhals(duckdb_relation: duckdb.DuckDBPyRelation) -> nw.LazyFrame[duckdb.DuckDBPyRelation]:
    """Wrap ``duckdb_relation`` with ``nw.from_native`` and return the result."""
    wrapped = nw.from_native(duckdb_relation)
    return wrapped


# One-letter alias for ``to_narwhals``.
n = to_narwhals
|
|
63
|
+
|
|
64
|
+
def to_duckdb(lnf: nw.LazyFrame[duckdb.DuckDBPyRelation]) -> duckdb.DuckDBPyRelation:
    """Return the native object behind ``lnf`` (the result of ``to_native()``)."""
    native_relation = lnf.to_native()
    return native_relation


# One-letter alias for ``to_duckdb``.
d = to_duckdb
|
|
68
|
+
|
|
69
|
+
def to_polars(lnf: nw.LazyFrame[duckdb.DuckDBPyRelation]) -> pl.DataFrame:
    """Collect ``lnf`` with the ``'polars'`` backend and return the native frame."""
    collected = lnf.collect(backend='polars')
    return collected.to_native()


# One-letter alias for ``to_polars``.
p = to_polars
|
|
73
|
+
|
|
74
|
+
def to_pandas(lnf: nw.LazyFrame[duckdb.DuckDBPyRelation]):
    """Unwrap ``lnf`` to its native relation and return the result of its ``.df()``."""
    relation = to_duckdb(lnf)
    return relation.df()
|
|
76
|
+
|
|
77
|
+
# Names exported by ``from hscida import *``. ``DataAccessConfig`` and
# ``config_from_env`` are included so a custom configuration can be built and
# passed to ``DataAccess`` without a separate explicit import.
__all__ = [
    "DataAccess",
    "DataAccessConfig",
    "config_from_env",
    "c",
    "l",
    "to_narwhals",
    "n",
    "to_duckdb",
    "d",
    "to_polars",
    "p",
    "to_pandas",
]
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hscida
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: HSCI data access provider
|
|
5
|
+
Requires-Python: >=3.12
|
|
6
|
+
Requires-Dist: dotenv>=0.9.9
|
|
7
|
+
Requires-Dist: duckdb>=1.5.2
|
|
8
|
+
Requires-Dist: hereutil>=0.1.5
|
|
9
|
+
Requires-Dist: narwhals>=2.21.0
|
|
10
|
+
Requires-Dist: pandas>=3.0.3
|
|
11
|
+
Requires-Dist: polars>=1.40.1
|
|
12
|
+
Requires-Dist: pyarrow>=22.0.0
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# hsci-da
|
|
16
|
+
|
|
17
|
+
Installable Python package for HSCI data access utilities built on DuckDB and Narwhals.
|
|
18
|
+
|
|
19
|
+
## Install
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
pip install .
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Run tests
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pytest
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Publish To PyPI (Local)
|
|
32
|
+
|
|
33
|
+
Set your token once in your shell profile:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
export UV_PUBLISH_TOKEN=pypi-xxxx
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Release flow from your machine:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# bump version in pyproject.toml first
|
|
43
|
+
uv sync --dev
|
|
44
|
+
uv run pytest -q
|
|
45
|
+
uv build
|
|
46
|
+
uv publish
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Dry-run against TestPyPI:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
uv publish --publish-url https://test.pypi.org/legacy/
|
|
53
|
+
```
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
hscida/__init__.py,sha256=EQwvSz5e12kfWw72dFQdqy8ETLmls3-D82deLz1-824,3178
|
|
2
|
+
hscida-0.1.0.dist-info/METADATA,sha256=lOWaogoQIHHc_4_uw6AfHlceWOesSnwWET3-OvgKCYk,883
|
|
3
|
+
hscida-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
4
|
+
hscida-0.1.0.dist-info/RECORD,,
|