PyPI - datapond - Versions diffs - 0.1.0__tar.gz - Mend

datapond 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

datapond-0.1.0/.gitignore +6 -0
datapond-0.1.0/LICENSE +21 -0
datapond-0.1.0/PKG-INFO +149 -0
datapond-0.1.0/README.md +119 -0
datapond-0.1.0/pyproject.toml +44 -0
datapond-0.1.0/src/datapond/__init__.py +42 -0
datapond-0.1.0/src/datapond/cli.py +168 -0
datapond-0.1.0/src/datapond/connection.py +84 -0
datapond-0.1.0/src/datapond/describe.py +144 -0
datapond-0.1.0/src/datapond/download.py +176 -0
datapond-0.1.0/src/datapond/registry.py +81 -0

datapond-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,6 @@
+__pycache__/
+*.pyc
+*.egg-info/
+dist/
+build/
+.eggs/

datapond-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 datapond-db
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

datapond-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,149 @@
+Metadata-Version: 2.4
+Name: datapond
+Version: 0.1.0
+Summary: Instantly connect to curated DuckDB databases built from public data
+Project-URL: Homepage, https://datapond-db.github.io/website
+Project-URL: Repository, https://github.com/datapond-db/datapond-python
+Project-URL: Registry, https://github.com/datapond-db/registry
+Author: Ian Nason
+License-Expression: MIT
+License-File: LICENSE
+Keywords: data,database,duckdb,government-data,public-data
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Database
+Classifier: Topic :: Scientific/Engineering
+Requires-Python: >=3.9
+Requires-Dist: duckdb>=0.9.0
+Requires-Dist: requests
+Provides-Extra: download
+Requires-Dist: huggingface-hub; extra == 'download'
+Description-Content-Type: text/markdown
+# datapond
+**Public data, instantly queryable.**
+datapond gives you instant SQL access to curated DuckDB databases built from public data sources -- no downloads, no API keys, no setup.
+## Install
+```bash
+pip install datapond
+```
+For faster downloads from Hugging Face:
+```bash
+pip install datapond[download]
+```
+## Quick start
+### Browse available databases
+```python
+import datapond
+# See what's available
+datapond.list()
+# Get details about a specific database
+datapond.info("eoir")
+```
+### Connect and query
+```python
+import datapond
+con = datapond.connect("eoir")
+con.sql("SHOW TABLES").show()
+con.sql("SELECT * FROM cases LIMIT 10").show()
+```
+The connection is a standard [duckdb.DuckDBPyConnection](https://duckdb.org/docs/api/python/overview) -- use it however you normally use DuckDB, including with pandas and Polars.
+```python
+df = con.sql("SELECT * FROM cases LIMIT 1000").df()  # pandas
+pl = con.sql("SELECT * FROM cases LIMIT 1000").pl()   # polars
+```
+### Download for offline use
+```python
+datapond.download("eoir")
+# Later, connect locally
+con = datapond.connect("eoir", local=True)
+```
+### Update a local database
+```python
+datapond.update("eoir")
+```
+## Multi-database queries
+Attach multiple databases at once and query across them:
+```python
+con = datapond.connect(["eoir", "foia"])
+# Tables are namespaced by database ID
+con.sql("SELECT * FROM eoir.cases LIMIT 5").show()
+con.sql("SELECT * FROM foia.requests LIMIT 5").show()
+```
+## CLI
+datapond also includes a command-line interface:
+```bash
+# List available databases
+datapond list
+# Show database details
+datapond info eoir
+# Download a database
+datapond download eoir --path ./data/
+# Open an interactive SQL session
+datapond connect eoir
+```
+## How it works
+datapond connects to read-only DuckDB files hosted remotely via the [httpfs extension](https://duckdb.org/docs/extensions/httpfs/overview). The [registry](https://github.com/datapond-db/registry) maintains a catalog of available databases with their URLs and metadata.
+When you call `datapond.connect()`, it:
+1. Looks up the database in the registry
+2. Installs and loads the httpfs extension
+3. Attaches the remote DuckDB file as read-only
+4. Returns a connection ready for queries
+No data is downloaded unless you explicitly call `datapond.download()`.
+## Links
+- [Website](https://datapond-db.github.io/website)
+- [Registry](https://github.com/datapond-db/registry) -- catalog of available databases
+- [Source](https://github.com/datapond-db/datapond-python)
+## Contributing
+Contributions are welcome. To add a new database to datapond, submit a pull request to the [registry](https://github.com/datapond-db/registry) repository.
+## License
+MIT

datapond-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,119 @@
+# datapond
+**Public data, instantly queryable.**
+datapond gives you instant SQL access to curated DuckDB databases built from public data sources -- no downloads, no API keys, no setup.
+## Install
+```bash
+pip install datapond
+```
+For faster downloads from Hugging Face:
+```bash
+pip install datapond[download]
+```
+## Quick start
+### Browse available databases
+```python
+import datapond
+# See what's available
+datapond.list()
+# Get details about a specific database
+datapond.info("eoir")
+```
+### Connect and query
+```python
+import datapond
+con = datapond.connect("eoir")
+con.sql("SHOW TABLES").show()
+con.sql("SELECT * FROM cases LIMIT 10").show()
+```
+The connection is a standard [duckdb.DuckDBPyConnection](https://duckdb.org/docs/api/python/overview) -- use it however you normally use DuckDB, including with pandas and Polars.
+```python
+df = con.sql("SELECT * FROM cases LIMIT 1000").df()  # pandas
+pl = con.sql("SELECT * FROM cases LIMIT 1000").pl()   # polars
+```
+### Download for offline use
+```python
+datapond.download("eoir")
+# Later, connect locally
+con = datapond.connect("eoir", local=True)
+```
+### Update a local database
+```python
+datapond.update("eoir")
+```
+## Multi-database queries
+Attach multiple databases at once and query across them:
+```python
+con = datapond.connect(["eoir", "foia"])
+# Tables are namespaced by database ID
+con.sql("SELECT * FROM eoir.cases LIMIT 5").show()
+con.sql("SELECT * FROM foia.requests LIMIT 5").show()
+```
+## CLI
+datapond also includes a command-line interface:
+```bash
+# List available databases
+datapond list
+# Show database details
+datapond info eoir
+# Download a database
+datapond download eoir --path ./data/
+# Open an interactive SQL session
+datapond connect eoir
+```
+## How it works
+datapond connects to read-only DuckDB files hosted remotely via the [httpfs extension](https://duckdb.org/docs/extensions/httpfs/overview). The [registry](https://github.com/datapond-db/registry) maintains a catalog of available databases with their URLs and metadata.
+When you call `datapond.connect()`, it:
+1. Looks up the database in the registry
+2. Installs and loads the httpfs extension
+3. Attaches the remote DuckDB file as read-only
+4. Returns a connection ready for queries
+No data is downloaded unless you explicitly call `datapond.download()`.
+## Links
+- [Website](https://datapond-db.github.io/website)
+- [Registry](https://github.com/datapond-db/registry) -- catalog of available databases
+- [Source](https://github.com/datapond-db/datapond-python)
+## Contributing
+Contributions are welcome. To add a new database to datapond, submit a pull request to the [registry](https://github.com/datapond-db/registry) repository.
+## License
+MIT

datapond-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,44 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "datapond"
+version = "0.1.0"
+description = "Instantly connect to curated DuckDB databases built from public data"
+readme = "README.md"
+license = "MIT"
+requires-python = ">=3.9"
+authors = [
+    { name = "Ian Nason" },
+]
+keywords = ["duckdb", "data", "public-data", "government-data", "database"]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering",
+]
+dependencies = [
+    "duckdb>=0.9.0",
+    "requests",
+]
+[project.optional-dependencies]
+download = ["huggingface_hub"]
+[project.urls]
+Homepage = "https://datapond-db.github.io/website"
+Repository = "https://github.com/datapond-db/datapond-python"
+Registry = "https://github.com/datapond-db/registry"
+[project.scripts]
+datapond = "datapond.cli:main"

datapond-0.1.0/src/datapond/__init__.py ADDED Viewed

@@ -0,0 +1,42 @@
+"""
+datapond - Public data, instantly queryable.
+Instantly connect to curated DuckDB databases built from public data.
+"""
+__version__ = "0.1.0"
+from datapond.registry import list_databases, get_database, get_registry
+from datapond.connection import connect
+from datapond.download import download, update
+from datapond.describe import describe
def list():
    """List the IDs of every database published in the registry.

    Thin public alias for ``list_databases``. The name matches the
    documented ``datapond.list()`` API, which means it shadows the ``list``
    builtin inside this module.
    """
    database_ids = list_databases()
    return database_ids
def info(db_id: str):
    """Print a short, human-readable summary of one registry entry.

    Args:
        db_id: ID of the database to look up in the registry.

    The row count is abbreviated (``1.2M`` / ``3.4K``). The GitHub,
    Hugging Face, license and updated lines are printed only when the
    entry carries a non-empty value for them.
    """
    entry = get_database(db_id)

    row_count = entry.get("rows", 0)
    if row_count >= 1_000_000:
        rows_label = f"{row_count / 1_000_000:.1f}M"
    elif row_count >= 1_000:
        rows_label = f"{row_count / 1_000:.1f}K"
    else:
        rows_label = str(row_count)

    table_count = entry.get('tables', '?')
    size_gb = entry.get('size_gb', '?')
    print(f"  {entry['name']}")
    print(f"  {rows_label} rows | {table_count} tables | {size_gb} GB")
    print(f"  Source: {entry.get('source', 'Unknown')}")

    # Repository links are shown without the scheme to keep the lines short.
    for key, label in (("github", "GitHub"), ("huggingface", "Hugging Face")):
        if entry.get(key):
            print(f"  {label}: {entry[key].replace('https://', '')}")

    for key, label in (("license", "License"), ("updated", "Updated")):
        if entry.get(key):
            print(f"  {label}: {entry[key]}")

datapond-0.1.0/src/datapond/cli.py ADDED Viewed

@@ -0,0 +1,168 @@
+"""
+Command-line interface for datapond.
+"""
+import argparse
+import shutil
+import subprocess
+import sys
def main(argv=None):
    """Parse command-line arguments and dispatch to the matching subcommand.

    Args:
        argv: Arguments to parse, without the program name. ``None`` (the
            default, which is what the ``datapond`` console script uses)
            makes argparse read ``sys.argv[1:]``; passing a list allows the
            CLI to be driven programmatically.

    Raises:
        SystemExit: With status 1, after printing the help text, when no
            subcommand is given. argparse itself exits on invalid arguments.
    """
    parser = argparse.ArgumentParser(
        prog="datapond",
        description="Public data, instantly queryable.",
    )
    subparsers = parser.add_subparsers(dest="command")

    # datapond list
    subparsers.add_parser("list", help="List all available databases")

    # datapond info <db_id>
    info_parser = subparsers.add_parser("info", help="Show database details")
    info_parser.add_argument("db_id", help="Database ID")

    # datapond download <db_id> [--path PATH]
    dl_parser = subparsers.add_parser("download", help="Download a database")
    dl_parser.add_argument("db_id", help="Database ID")
    dl_parser.add_argument("--path", default=None, help="Destination path")

    # datapond describe <db_id> [--table TABLE] [--search PATTERN]
    desc_parser = subparsers.add_parser(
        "describe", help="Describe tables, columns, and join keys"
    )
    desc_parser.add_argument("db_id", help="Database ID")
    desc_parser.add_argument("--table", default=None, help="Show columns for a specific table")
    desc_parser.add_argument("--search", default=None, help="Search column names")

    # datapond connect <db_id>
    connect_parser = subparsers.add_parser(
        "connect", help="Open an interactive session with a database"
    )
    connect_parser.add_argument("db_id", help="Database ID")

    args = parser.parse_args(argv)

    # The subcommand is optional for argparse, so a bare `datapond` lands here.
    if args.command is None:
        parser.print_help()
        sys.exit(1)

    if args.command == "list":
        _cmd_list()
    elif args.command == "info":
        _cmd_info(args.db_id)
    elif args.command == "download":
        _cmd_download(args.db_id, args.path)
    elif args.command == "describe":
        _cmd_describe(args.db_id, args.table, args.search)
    elif args.command == "connect":
        _cmd_connect(args.db_id)
def _cmd_list():
    """Print every database ID in the registry, one per line.

    Prints a notice instead when the registry lists no databases.
    """
    from datapond.registry import list_databases

    db_ids = list_databases()
    if db_ids:
        for db_id in db_ids:
            print(db_id)
    else:
        print("No databases found in registry.")
def _cmd_info(db_id):
    """Print the registry summary for ``db_id``.

    Failures (for example an ID that is not in the registry, or a registry
    that cannot be fetched) are reported on stderr and end the process with
    status 1, matching the download and describe subcommands, instead of
    surfacing as a traceback.
    """
    import datapond

    try:
        datapond.info(db_id)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
def _cmd_download(db_id, path):
    """Download ``db_id`` to ``path``; report any failure on stderr and exit 1."""
    from datapond.download import download

    try:
        download(db_id, path=path)
    except Exception as err:
        sys.stderr.write(f"Error: {err}\n")
        raise SystemExit(1)
def _cmd_describe(db_id, table, search):
    """Describe ``db_id`` (optionally one table or a column search).

    Any failure is reported on stderr and ends the process with status 1.
    """
    from datapond.describe import describe

    try:
        describe(db_id, table=table, search=search)
    except Exception as err:
        sys.stderr.write(f"Error: {err}\n")
        raise SystemExit(1)
+def _cmd_connect(db_id):
+    from datapond.registry import get_database
+    db = get_database(db_id)
+    attach_url = db["attach_url"]
+    # Try the duckdb CLI first
+    duckdb_bin = shutil.which("duckdb")
+    if duckdb_bin:
+        init_sql = (
+            f"INSTALL httpfs; LOAD httpfs; "
+            f"ATTACH '{attach_url}' AS {db_id} (READ_ONLY); "
+            f"USE {db_id};"
+        )
+        try:
+            subprocess.run([duckdb_bin, "-cmd", init_sql], check=True)
+        except subprocess.CalledProcessError as e:
+            sys.exit(e.returncode)
+        return
+    # Fallback: Python REPL
+    print(f"duckdb CLI not found. Starting Python REPL for {db_id}...")
+    print("Type SQL queries, or 'exit' to quit.\n")
+    from datapond.connection import connect
+    con = connect(db_id)
+    while True:
+        try:
+            query = input(f"{db_id}> ")
+        except (EOFError, KeyboardInterrupt):
+            print()
+            break
+        query = query.strip()
+        if not query:
+            continue
+        if query.lower() in ("exit", "quit", ".exit", ".quit"):
+            break
+        try:
+            result = con.execute(query)
+            rows = result.fetchall()
+            if rows:
+                columns = [desc[0] for desc in result.description]
+                col_widths = [len(c) for c in columns]
+                for row in rows:
+                    for i, val in enumerate(row):
+                        col_widths[i] = max(col_widths[i], len(str(val)))
+                header = " | ".join(
+                    c.ljust(col_widths[i]) for i, c in enumerate(columns)
+                )
+                print(header)
+                print("-+-".join("-" * w for w in col_widths))
+                for row in rows:
+                    line = " | ".join(
+                        str(v).ljust(col_widths[i]) for i, v in enumerate(row)
+                    )
+                    print(line)
+                print(f"({len(rows)} rows)")
+            else:
+                print("OK")
+        except Exception as e:
+            print(f"Error: {e}")
+if __name__ == "__main__":
+    main()

datapond-0.1.0/src/datapond/connection.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""
+Database connection management for datapond.
+Provides DuckDB connections to remote or local databases.
+"""
+from pathlib import Path
+from typing import Union
+import duckdb
+from datapond.registry import get_database
def connect(db_id: Union[str, list], local: bool = False):
    """Open a DuckDB connection with one or several datapond databases attached.

    Args:
        db_id: Either one database ID (``str``) or a ``list`` of IDs.
        local: When True, attach the already-downloaded
            ``~/.datapond/{db_id}.duckdb`` files rather than the remote URLs.

    Returns:
        The DuckDB connection with the requested database(s) attached.

    Raises:
        TypeError: If ``db_id`` is neither a string nor a list.
    """
    if isinstance(db_id, str):
        connector = _connect_single
    elif isinstance(db_id, list):
        connector = _connect_multi
    else:
        raise TypeError(f"db_id must be a string or list, got {type(db_id).__name__}")
    return connector(db_id, local=local)
+def _connect_single(db_id: str, local: bool = False):
+    """Connect to a single database, attaching it and setting it as default."""
+    db = get_database(db_id)
+    con = duckdb.connect()
+    if local:
+        path = _local_path(db_id)
+        con.execute(f"ATTACH '{path}' AS {db_id} (READ_ONLY)")
+    else:
+        attach_url = db["attach_url"]
+        con.install_extension("httpfs")
+        con.load_extension("httpfs")
+        con.execute(f"ATTACH '{attach_url}' AS {db_id} (READ_ONLY)")
+    con.execute(f"USE {db_id}")
+    return con
+def _connect_multi(db_ids: list, local: bool = False):
+    """Connect to multiple databases, attaching each under its own schema name."""
+    if not db_ids:
+        raise ValueError("db_id list must not be empty")
+    con = duckdb.connect()
+    installed_httpfs = False
+    for db_id in db_ids:
+        db = get_database(db_id)
+        if local:
+            path = _local_path(db_id)
+            con.execute(f"ATTACH '{path}' AS {db_id} (READ_ONLY)")
+        else:
+            if not installed_httpfs:
+                con.install_extension("httpfs")
+                con.load_extension("httpfs")
+                installed_httpfs = True
+            attach_url = db["attach_url"]
+            con.execute(f"ATTACH '{attach_url}' AS {db_id} (READ_ONLY)")
+    return con
+def _local_path(db_id: str) -> str:
+    """Return the expected local path for a downloaded database file."""
+    path = Path.home() / ".datapond" / f"{db_id}.duckdb"
+    if not path.exists():
+        raise FileNotFoundError(
+            f"Local database file not found: {path}\n"
+            f"Download it first with: datapond.download('{db_id}')"
+        )
+    return str(path)

datapond-0.1.0/src/datapond/describe.py ADDED Viewed

@@ -0,0 +1,144 @@
+"""
+Describe databases, tables, and columns using the _columns data dictionary.
+"""
+from datapond.connection import connect
def describe(db_id: str, table: str = None, search: str = None):
    """Describe a database's tables and columns.

    Args:
        db_id: The database ID.
        table: If provided, show columns for this specific table.
        search: If provided, search column names across all tables. Takes
            precedence over ``table`` when both are given.

    The connection opened for the report is closed before returning, also
    when printing the report raises.
    """
    con = connect(db_id)
    try:
        if search:
            _search_columns(con, search)
        elif table:
            _describe_table(con, table)
        else:
            _describe_database(con)
    finally:
        con.close()
+def _describe_database(con):
+    """Print all tables with row counts."""
+    try:
+        rows = con.execute(
+            "SELECT table_name, row_count, description "
+            "FROM _metadata "
+            "WHERE table_name NOT IN ('_metadata', '_columns') "
+            "ORDER BY table_name"
+        ).fetchall()
+    except Exception:
+        # Fall back to information_schema if _metadata is missing
+        rows = con.execute(
+            "SELECT table_name, NULL, NULL "
+            "FROM information_schema.tables "
+            "WHERE table_schema NOT IN ('information_schema', 'pg_catalog') "
+            "  AND table_name NOT IN ('_metadata', '_columns') "
+            "ORDER BY table_name"
+        ).fetchall()
+    if not rows:
+        print("  No tables found.")
+        return
+    # Calculate column widths
+    name_w = max(len(r[0]) for r in rows)
+    name_w = max(name_w, 5)
+    for table_name, row_count, description in rows:
+        count_str = f"{row_count:>12,}" if row_count else "            "
+        desc_str = f"  {description}" if description else ""
+        print(f"  {table_name:<{name_w}}  {count_str} rows{desc_str}")
+def _describe_table(con, table):
+    """Print all columns in a table with types, null%, examples, join hints."""
+    try:
+        cols = con.execute(
+            "SELECT column_name, data_type, null_pct, example_value, join_hint "
+            "FROM _columns WHERE table_name = ? ORDER BY rowid",
+            [table],
+        ).fetchall()
+    except Exception:
+        # Fall back to information_schema
+        cols = con.execute(
+            "SELECT column_name, data_type, NULL, NULL, NULL "
+            "FROM information_schema.columns "
+            "WHERE table_name = ? ORDER BY ordinal_position",
+            [table],
+        ).fetchall()
+    if not cols:
+        print(f"  Table '{table}' not found.")
+        return
+    # Get row count
+    try:
+        meta = con.execute(
+            "SELECT row_count FROM _metadata WHERE table_name = ?", [table]
+        ).fetchone()
+        if meta and meta[0]:
+            print(f"  {table} ({meta[0]:,} rows)")
+        else:
+            print(f"  {table}")
+    except Exception:
+        print(f"  {table}")
+    print()
+    # Calculate column widths
+    name_w = max(max(len(c[0]) for c in cols), 6)
+    type_w = max(max(len(c[1]) for c in cols), 4)
+    header = f"  {'Column':<{name_w}}  {'Type':<{type_w}}  {'Nulls':>6}  {'Example':<40}  Join"
+    print(header)
+    print(f"  {'-' * name_w}  {'-' * type_w}  {'-' * 6}  {'-' * 40}  {'-' * 4}")
+    for col_name, dtype, null_pct, example, join_hint in cols:
+        null_str = f"{null_pct:5.1f}%" if null_pct is not None else "      "
+        ex_str = (example[:40] if example else "")
+        join_str = join_hint if join_hint else ""
+        print(f"  {col_name:<{name_w}}  {dtype:<{type_w}}  {null_str}  {ex_str:<40}  {join_str}")
+def _search_columns(con, pattern):
+    """Search column names across all tables."""
+    pattern_upper = pattern.upper()
+    try:
+        cols = con.execute(
+            "SELECT table_name, column_name, data_type, join_hint "
+            "FROM _columns "
+            "WHERE UPPER(column_name) LIKE '%' || ? || '%' "
+            "ORDER BY table_name, column_name",
+            [pattern_upper],
+        ).fetchall()
+    except Exception:
+        cols = con.execute(
+            "SELECT table_name, column_name, data_type, NULL "
+            "FROM information_schema.columns "
+            "WHERE table_schema NOT IN ('information_schema', 'pg_catalog') "
+            "  AND UPPER(column_name) LIKE '%' || ? || '%' "
+            "ORDER BY table_name, column_name",
+            [pattern_upper],
+        ).fetchall()
+    if not cols:
+        print(f"  No columns matching '{pattern}'.")
+        return
+    print(f"  {len(cols)} columns matching '{pattern}':")
+    print()
+    tbl_w = max(len(c[0]) for c in cols)
+    name_w = max(len(c[1]) for c in cols)
+    type_w = max(len(c[2]) for c in cols)
+    for table_name, col_name, dtype, join_hint in cols:
+        join_str = f"  ({join_hint})" if join_hint else ""
+        print(f"  {table_name:<{tbl_w}}  {col_name:<{name_w}}  {dtype:<{type_w}}{join_str}")

datapond-0.1.0/src/datapond/download.py ADDED Viewed

@@ -0,0 +1,176 @@
+"""
+Download management for datapond databases.
+Supports downloading via huggingface_hub (preferred) or requests (fallback).
+"""
+import os
+import shutil
+from datetime import datetime, timezone
+from pathlib import Path
+import requests
+from datapond.registry import get_database
+DATAPOND_DIR = Path.home() / ".datapond"
+def download(db_id: str, path: str = None) -> Path:
+    """Download a database file.
+    Args:
+        db_id: The database ID to download.
+        path: Destination path. Defaults to ~/.datapond/{db_id}.duckdb.
+              If path is a directory, saves as {path}/{db_id}.duckdb.
+    Returns:
+        The path to the downloaded file.
+    """
+    db = get_database(db_id)
+    filename = f"{db_id}.duckdb"
+    if path is None:
+        dest = DATAPOND_DIR / filename
+    else:
+        dest = Path(path)
+        if dest.is_dir():
+            dest = dest / filename
+    dest.parent.mkdir(parents=True, exist_ok=True)
+    hf_url = db.get("huggingface")
+    if hf_url:
+        repo_id = _extract_hf_repo_id(hf_url)
+        if _try_hf_download(repo_id, filename, dest):
+            print(f"Downloaded {db_id} to {dest}")
+            return dest
+    # Fallback: download via requests from the attach_url or a direct link
+    download_url = db.get("download_url") or db.get("attach_url")
+    if not download_url:
+        raise ValueError(f"No download URL available for '{db_id}'")
+    _download_with_requests(download_url, dest, db_id)
+    print(f"Downloaded {db_id} to {dest}")
+    return dest
+def update(db_id: str) -> Path:
+    """Re-download a database if the remote version is newer.
+    Args:
+        db_id: The database ID to update.
+    Returns:
+        The path to the local file.
+    """
+    db = get_database(db_id)
+    local_path = DATAPOND_DIR / f"{db_id}.duckdb"
+    if not local_path.exists():
+        print(f"No local copy found. Downloading {db_id}...")
+        return download(db_id)
+    remote_updated = db.get("updated")
+    if remote_updated:
+        remote_dt = datetime.fromisoformat(remote_updated)
+        if remote_dt.tzinfo is None:
+            remote_dt = remote_dt.replace(tzinfo=timezone.utc)
+        local_mtime = datetime.fromtimestamp(
+            local_path.stat().st_mtime, tz=timezone.utc
+        )
+        if local_mtime >= remote_dt:
+            print(f"{db_id} is already up to date.")
+            return local_path
+    print(f"Updating {db_id}...")
+    return download(db_id)
+def _extract_hf_repo_id(hf_url: str) -> str:
+    """Extract the repo ID from a Hugging Face URL.
+    Example: "https://huggingface.co/datasets/Nason/eoir-database"
+             -> "Nason/eoir-database"
+    """
+    parts = hf_url.rstrip("/").split("/")
+    # URL format: https://huggingface.co/datasets/{org}/{repo}
+    # We want the last two path segments
+    return "/".join(parts[-2:])
+def _try_hf_download(repo_id: str, filename: str, dest: Path) -> bool:
+    """Try downloading via huggingface_hub. Returns True on success."""
+    try:
+        from huggingface_hub import hf_hub_download
+    except ImportError:
+        return False
+    try:
+        cached_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            repo_type="dataset",
+        )
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        shutil.copy2(cached_path, dest)
+        return True
+    except Exception as e:
+        print(f"huggingface_hub download failed ({e}), falling back to requests...")
+        return False
def _download_with_requests(url: str, dest: Path, db_id: str):
    """Stream ``url`` to ``dest`` with a progress indicator.

    The body is written to a sibling ``*.part`` file and moved over ``dest``
    only once the transfer has finished. A failed or interrupted download
    therefore never leaves a truncated file at ``dest``. The HTTP response
    is closed when the transfer ends, successfully or not.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            # 0 means "size unknown" to both progress writers.
            total = int(resp.headers.get("content-length", 0))

            # Try to use tqdm for a nice progress bar
            try:
                import tqdm  # noqa: F401  (availability probe only)
            except ImportError:
                _download_with_print(resp, partial, total, db_id)
            else:
                _download_with_tqdm(resp, partial, total, db_id)
        os.replace(partial, dest)
    except BaseException:
        # Also covers Ctrl-C: remove the incomplete file, then re-raise.
        partial.unlink(missing_ok=True)
        raise
def _download_with_tqdm(resp, dest: Path, total: int, db_id: str):
    """Write the streamed response body to ``dest`` behind a tqdm progress bar.

    ``total`` is the expected size in bytes; 0 makes the bar open-ended.
    """
    from tqdm import tqdm

    bar_total = total or None
    with open(dest, "wb") as out_file, tqdm(
        total=bar_total,
        unit="B",
        unit_scale=True,
        desc=db_id,
    ) as progress:
        for piece in resp.iter_content(chunk_size=8192):
            out_file.write(piece)
            progress.update(len(piece))
+def _download_with_print(resp, dest: Path, total: int, db_id: str):
+    """Download with simple printed progress."""
+    downloaded = 0
+    last_pct = -1
+    with open(dest, "wb") as f:
+        for chunk in resp.iter_content(chunk_size=8192):
+            f.write(chunk)
+            downloaded += len(chunk)
+            if total > 0:
+                pct = int(downloaded * 100 / total)
+                if pct != last_pct and pct % 10 == 0:
+                    print(f"  {db_id}: {pct}%")
+                    last_pct = pct
+    if total > 0:
+        size_mb = total / (1024 * 1024)
+        print(f"  {db_id}: complete ({size_mb:.1f} MB)")
+    else:
+        size_mb = downloaded / (1024 * 1024)
+        print(f"  {db_id}: complete ({size_mb:.1f} MB)")

datapond-0.1.0/src/datapond/registry.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""
+Registry client for datapond.
+Fetches and caches the database registry from GitHub.
+"""
+import json
+import os
+import time
+from pathlib import Path
+import requests
+REGISTRY_URL = (
+    "https://raw.githubusercontent.com/datapond-db/registry/main/registry.json"
+)
+CACHE_DIR = Path.home() / ".datapond"
+CACHE_FILE = CACHE_DIR / "registry.json"
+CACHE_TTL = 3600  # 1 hour in seconds
def _cache_is_fresh() -> bool:
    """Report whether the cached registry file exists and is younger than CACHE_TTL."""
    if CACHE_FILE.exists():
        now = time.time()
        modified_at = CACHE_FILE.stat().st_mtime
        return (now - modified_at) < CACHE_TTL
    return False
def _fetch_registry() -> dict:
    """Fetch the registry from GitHub and cache it locally.

    If the request fails or the body is not valid JSON, the existing cache
    file (however old) is returned instead.

    Raises:
        ConnectionError: If the fetch fails and no cache file exists.
    """
    try:
        resp = requests.get(REGISTRY_URL, timeout=15)
        resp.raise_for_status()
        # Decode inside the try: a garbled body should also fall back to
        # the stale cache rather than escape as a bare decoding error.
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # If we have a stale cache, use it rather than failing
        if CACHE_FILE.exists():
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        raise ConnectionError(
            f"Failed to fetch registry and no local cache available: {e}"
        ) from e

    # Write to a temp file and rename, so a reader never sees a half-written
    # cache. Caching is best-effort: the fetched data is returned even when
    # the cache location cannot be written.
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    try:
        CACHE_DIR.mkdir(parents=True, exist_ok=True)
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2)
        os.replace(tmp_file, CACHE_FILE)
    except OSError:
        pass
    return data
def get_registry() -> dict:
    """Return the parsed registry dict, using the cache if it is fresh.

    A fresh cache file that cannot be read or parsed is ignored and the
    registry is fetched again instead of raising.
    """
    if _cache_is_fresh():
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError):
            # Unreadable or corrupt cache: fall through to a fresh fetch,
            # which rewrites the file on success.
            pass
    return _fetch_registry()
def get_database(db_id: str) -> dict:
    """Return a single database entry from the registry.

    Args:
        db_id: The ``id`` value of the entry to return.

    Raises:
        ValueError: If the database ID is not found. The message lists the
            IDs that are available.
    """
    registry = get_registry()
    databases = registry.get("databases", [])
    for db in databases:
        if db.get("id") == db_id:
            return db

    # Entries without an "id" are skipped so that building the message
    # cannot itself fail with a TypeError and hide the real error.
    available = [str(db["id"]) for db in databases if db.get("id") is not None]
    raise ValueError(
        f"Database '{db_id}' not found in registry. "
        f"Available databases: {', '.join(available)}"
    )
def list_databases() -> list:
    """Collect the ``id`` of every entry under the registry's ``databases`` key."""
    entries = get_registry().get("databases", [])
    ids = []
    for entry in entries:
        ids.append(entry["id"])
    return ids