hscida 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hscida/__init__.py ADDED
@@ -0,0 +1,77 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Any, cast
3
+ from hereutil import here
4
+ import narwhals as nw
5
+ import duckdb
6
+ import os
7
+ import polars as pl
8
+
9
+ from dotenv import dotenv_values
10
+
11
# DuckDB settings used whenever no explicit configuration is supplied.
_DEFAULT_DUCKDB_CONFIG = {
    "parquet_metadata_cache": "true",
    "preserve_insertion_order": "false",
    "enable_fsst_vectors": "true",
}
# Project root as resolved by hereutil's here(), kept as a plain string.
_PROJROOT = str(here())
13
+
14
@dataclass
class DataAccessConfig:
    """Settings that control how ``DataAccess`` connects and locates files.

    Each instance receives its own copy of the default DuckDB settings, so
    mutating one configuration never leaks into the module-level defaults or
    into other instances.
    """

    # Template formatted with ``dataset`` and ``projroot``; the result is
    # appended to "FROM " and executed to list the files of a dataset.
    glob_pattern: str
    # SQL executed once on the connection right after it is opened.
    init_sql: str
    # Mapping passed as ``config=`` to ``duckdb.connect``.
    duckdb_config: dict[str, Any] = field(
        default_factory=lambda: dict(_DEFAULT_DUCKDB_CONFIG)
    )
    # Value substituted for ``{projroot}`` in ``glob_pattern``.
    projroot: str = _PROJROOT
20
+
21
def _parse_duckdb_config(raw: str) -> dict[str, str]:
    """Parse a ``key=value,key=value`` string into a DuckDB config mapping.

    Only the first ``=`` of each entry separates key from value, so values may
    themselves contain ``=``. Whitespace around keys and values is stripped and
    empty entries (empty input, trailing commas) are ignored. Values cannot
    contain commas because the comma is the entry separator.

    Raises:
        ValueError: If a non-empty entry has no ``=`` or an empty key.
    """
    parsed: dict[str, str] = {}
    for pair in raw.split(','):
        if not pair.strip():
            continue
        key, sep, value = pair.partition('=')
        if not sep or not key.strip():
            raise ValueError(
                f"Invalid DUCKDB_CONFIG entry {pair!r}; expected key=value"
            )
        parsed[key.strip()] = value.strip()
    return parsed


def config_from_env() -> DataAccessConfig:
    """Build a ``DataAccessConfig`` from ``.env`` values and the environment.

    Values returned by ``dotenv_values()`` are overlaid with ``os.environ``,
    so real environment variables win. Recognised keys are ``GLOB_PATTERN``,
    ``INIT_SQL``, ``DUCKDB_CONFIG`` (``key=value`` pairs separated by commas)
    and ``PROJROOT``.

    Returns:
        The assembled configuration. When ``DUCKDB_CONFIG`` is absent, a fresh
        copy of the default DuckDB settings is used.

    Raises:
        ValueError: If ``DUCKDB_CONFIG`` contains a malformed entry.
    """
    settings = {
        **dotenv_values(),
        **os.environ,
    }
    # A key present in .env without a value is reported as None; treat that
    # the same as an absent key instead of failing on ``None.split``.
    raw_duckdb_config = settings.get('DUCKDB_CONFIG')
    if raw_duckdb_config is None:
        # Copy so callers can mutate their config without touching defaults.
        duckdb_config = dict(_DEFAULT_DUCKDB_CONFIG)
    else:
        duckdb_config = _parse_duckdb_config(raw_duckdb_config)
    return DataAccessConfig(
        glob_pattern=settings.get('GLOB_PATTERN', ''),
        init_sql=settings.get('INIT_SQL', ''),
        duckdb_config=duckdb_config,
        projroot=settings.get('PROJROOT', _PROJROOT),
    )
32
+
33
class DataAccess:
    """DuckDB-backed access to file datasets, exposed as Narwhals frames.

    Datasets are registered as DuckDB views over their files and the
    resulting frames are cached per dataset name in ``self.datasets``.
    """

    def __init__(self, config: DataAccessConfig | None = None) -> None:
        """Open a DuckDB connection and run the configured init SQL.

        Args:
            config: Settings to use. When omitted, the configuration is read
                via ``config_from_env()`` at call time, so the environment is
                not read as a side effect of importing the module and each
                instance gets its own configuration object.
        """
        if config is None:
            config = config_from_env()
        self.con = duckdb.connect(config=config.duckdb_config)
        self.con.sql(config.init_sql)
        self.datasets: dict[str, nw.LazyFrame[duckdb.DuckDBPyRelation]] = {}
        self.config = config

    def register_files_as_view(self, table_name: str, *paths: str, replace: bool = False) -> None:
        """Create a DuckDB view named ``table_name`` over the given files.

        The reader is chosen from the first path only: ``read_parquet`` when
        it ends with ``.parquet``, otherwise ``read_csv``. Hive partitioning
        is enabled for both.

        Args:
            table_name: Name of the view; interpolated into the SQL as-is.
            *paths: Files to read. At least one is required.
            replace: When true, use ``CREATE OR REPLACE VIEW``; otherwise
                ``CREATE VIEW IF NOT EXISTS``.

        Raises:
            IndexError: If no paths are given.
        """
        reader = 'read_parquet' if paths[0].endswith('.parquet') else 'read_csv'
        # Double single quotes so that a path containing an apostrophe stays
        # a valid SQL string literal.
        file_list = ', '.join("'" + path.replace("'", "''") + "'" for path in paths)
        if replace:
            create_clause = f"CREATE OR REPLACE VIEW {table_name}"
        else:
            create_clause = f"CREATE VIEW IF NOT EXISTS {table_name}"
        self.con.sql(f"{create_clause} AS FROM {reader}([{file_list}], hive_partitioning=true);")

    def f(self, dataset: str, *paths: str, replace: bool = False, debug: bool = False) -> nw.LazyFrame[duckdb.DuckDBPyRelation]:
        """Return the frame for ``dataset``, registering it on first use.

        Args:
            dataset: Dataset name; also used as the view name.
            *paths: Explicit files backing the dataset. When omitted, the
                files are listed by running "FROM " followed by the
                configured glob pattern, formatted with ``dataset`` and
                ``projroot``.
            replace: Re-register the view and refresh the cached frame even
                if the dataset is already cached.
            debug: Print the discovered paths.

        Returns:
            The cached frame for the dataset. If no paths were given and the
            listing finds no files, a message is printed and ``None`` is
            returned despite the annotation.
        """
        if replace or dataset not in self.datasets:
            # NOTE(review): the nesting below was reconstructed from a view of
            # the source that had lost its indentation; confirm that the debug
            # print belongs to the discovery branch.
            if not paths:
                source = self.config.glob_pattern.format(dataset=dataset, projroot=self.config.projroot)
                paths = tuple(row[0] for row in self.con.sql("FROM " + source).fetchall())
                if debug:
                    print(f"DEBUG: Found paths for dataset {dataset}: {paths}")
                if not paths:
                    print(f"No files found for dataset {dataset} in {source}")
                    return cast(nw.LazyFrame[duckdb.DuckDBPyRelation], None)
            self.register_files_as_view(dataset, *paths, replace=replace)
            self.datasets[dataset] = nw.from_native(self.con.sql(f'FROM {dataset}'))
        return self.datasets[dataset]
55
+
56
# One-letter shorthands for Narwhals' column and literal expression builders.
c, l = nw.col, nw.lit
58
+
59
def to_narwhals(duckdb_relation: duckdb.DuckDBPyRelation) -> nw.LazyFrame[duckdb.DuckDBPyRelation]:
    """Wrap a DuckDB relation in a Narwhals frame using ``nw.from_native``."""
    wrapped = nw.from_native(duckdb_relation)
    return wrapped


# One-letter alias for to_narwhals.
n = to_narwhals
63
+
64
def to_duckdb(lnf: nw.LazyFrame[duckdb.DuckDBPyRelation]) -> duckdb.DuckDBPyRelation:
    """Unwrap a Narwhals frame back to its native DuckDB relation."""
    relation = lnf.to_native()
    return relation


# One-letter alias for to_duckdb.
d = to_duckdb
68
+
69
def to_polars(lnf: nw.LazyFrame[duckdb.DuckDBPyRelation]) -> pl.DataFrame:
    """Collect a Narwhals frame with the Polars backend and return the native result."""
    collected = lnf.collect(backend='polars')
    return collected.to_native()


# One-letter alias for to_polars.
p = to_polars
73
+
74
def to_pandas(lnf: nw.LazyFrame[duckdb.DuckDBPyRelation]):
    """Convert a Narwhals frame to pandas via its DuckDB relation's ``df()``."""
    relation = d(lnf)
    return relation.df()
76
+
77
# Public API. The configuration type and its environment-based factory are
# exported as well, since DataAccess accepts a DataAccessConfig.
__all__ = [
    "DataAccess",
    "DataAccessConfig",
    "config_from_env",
    "c",
    "l",
    "to_narwhals",
    "n",
    "to_duckdb",
    "d",
    "to_polars",
    "p",
    "to_pandas",
]
@@ -0,0 +1,53 @@
1
+ Metadata-Version: 2.4
2
+ Name: hscida
3
+ Version: 0.1.0
4
+ Summary: HSCI data access provider
5
+ Requires-Python: >=3.12
6
+ Requires-Dist: dotenv>=0.9.9
7
+ Requires-Dist: duckdb>=1.5.2
8
+ Requires-Dist: hereutil>=0.1.5
9
+ Requires-Dist: narwhals>=2.21.0
10
+ Requires-Dist: pandas>=3.0.3
11
+ Requires-Dist: polars>=1.40.1
12
+ Requires-Dist: pyarrow>=22.0.0
13
+ Description-Content-Type: text/markdown
14
+
15
+ # hsci-da
16
+
17
+ Installable Python package for HSCI data access utilities built on DuckDB and Narwhals.
18
+
19
+ ## Install
20
+
21
+ ```bash
22
+ pip install .
23
+ ```
24
+
25
+ ## Run tests
26
+
27
+ ```bash
28
+ pytest
29
+ ```
30
+
31
+ ## Publish To PyPI (Local)
32
+
33
+ Set your token once in your shell profile:
34
+
35
+ ```bash
36
+ export UV_PUBLISH_TOKEN=pypi-xxxx
37
+ ```
38
+
39
+ Release flow from your machine:
40
+
41
+ ```bash
42
+ # bump version in pyproject.toml first
43
+ uv sync --dev
44
+ uv run pytest -q
45
+ uv build
46
+ uv publish
47
+ ```
48
+
49
+ Dry-run against TestPyPI:
50
+
51
+ ```bash
52
+ uv publish --publish-url https://test.pypi.org/legacy/
53
+ ```
@@ -0,0 +1,4 @@
1
+ hscida/__init__.py,sha256=EQwvSz5e12kfWw72dFQdqy8ETLmls3-D82deLz1-824,3178
2
+ hscida-0.1.0.dist-info/METADATA,sha256=lOWaogoQIHHc_4_uw6AfHlceWOesSnwWET3-OvgKCYk,883
3
+ hscida-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
4
+ hscida-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any