datapond 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ __pycache__/
2
+ *.pyc
3
+ *.egg-info/
4
+ dist/
5
+ build/
6
+ .eggs/
datapond-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 datapond-db
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,149 @@
1
+ Metadata-Version: 2.4
2
+ Name: datapond
3
+ Version: 0.1.0
4
+ Summary: Instantly connect to curated DuckDB databases built from public data
5
+ Project-URL: Homepage, https://datapond-db.github.io/website
6
+ Project-URL: Repository, https://github.com/datapond-db/datapond-python
7
+ Project-URL: Registry, https://github.com/datapond-db/registry
8
+ Author: Ian Nason
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: data,database,duckdb,government-data,public-data
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Database
23
+ Classifier: Topic :: Scientific/Engineering
24
+ Requires-Python: >=3.9
25
+ Requires-Dist: duckdb>=0.9.0
26
+ Requires-Dist: requests
27
+ Provides-Extra: download
28
+ Requires-Dist: huggingface-hub; extra == 'download'
29
+ Description-Content-Type: text/markdown
30
+
31
+ # datapond
32
+
33
+ **Public data, instantly queryable.**
34
+
35
+ datapond gives you instant SQL access to curated DuckDB databases built from public data sources -- no downloads, no API keys, no setup.
36
+
37
+ ## Install
38
+
39
+ ```bash
40
+ pip install datapond
41
+ ```
42
+
43
+ For faster downloads from Hugging Face:
44
+
45
+ ```bash
46
+ pip install "datapond[download]"
47
+ ```
48
+
49
+ ## Quick start
50
+
51
+ ### Browse available databases
52
+
53
+ ```python
54
+ import datapond
55
+
56
+ # See what's available
57
+ datapond.list()
58
+
59
+ # Get details about a specific database
60
+ datapond.info("eoir")
61
+ ```
62
+
63
+ ### Connect and query
64
+
65
+ ```python
66
+ import datapond
67
+
68
+ con = datapond.connect("eoir")
69
+ con.sql("SHOW TABLES").show()
70
+ con.sql("SELECT * FROM cases LIMIT 10").show()
71
+ ```
72
+
73
+ The connection is a standard [`duckdb.DuckDBPyConnection`](https://duckdb.org/docs/api/python/overview) -- use it however you normally use DuckDB, including with pandas and Polars.
74
+
75
+ ```python
76
+ df = con.sql("SELECT * FROM cases LIMIT 1000").df() # pandas
77
+ pl = con.sql("SELECT * FROM cases LIMIT 1000").pl() # polars
78
+ ```
79
+
80
+ ### Download for offline use
81
+
82
+ ```python
83
+ datapond.download("eoir")
84
+
85
+ # Later, connect locally
86
+ con = datapond.connect("eoir", local=True)
87
+ ```
88
+
89
+ ### Update a local database
90
+
91
+ ```python
92
+ datapond.update("eoir")
93
+ ```
94
+
95
+ ## Multi-database queries
96
+
97
+ Attach multiple databases at once and query across them:
98
+
99
+ ```python
100
+ con = datapond.connect(["eoir", "foia"])
101
+
102
+ # Tables are namespaced by database ID
103
+ con.sql("SELECT * FROM eoir.cases LIMIT 5").show()
104
+ con.sql("SELECT * FROM foia.requests LIMIT 5").show()
105
+ ```
106
+
107
+ ## CLI
108
+
109
+ datapond also includes a command-line interface:
110
+
111
+ ```bash
112
+ # List available databases
113
+ datapond list
114
+
115
+ # Show database details
116
+ datapond info eoir
117
+
118
+ # Download a database
119
+ datapond download eoir --path ./data/
120
+
121
+ # Open an interactive SQL session
122
+ datapond connect eoir
123
+ ```
124
+
125
+ ## How it works
126
+
127
+ datapond connects to read-only DuckDB files hosted remotely via the [httpfs extension](https://duckdb.org/docs/extensions/httpfs/overview). The [registry](https://github.com/datapond-db/registry) maintains a catalog of available databases with their URLs and metadata.
128
+
129
+ When you call `datapond.connect()`, it:
130
+ 1. Looks up the database in the registry
131
+ 2. Installs and loads the httpfs extension
132
+ 3. Attaches the remote DuckDB file as read-only
133
+ 4. Returns a connection ready for queries
134
+
135
+ No data is downloaded unless you explicitly call `datapond.download()`.
136
+
137
+ ## Links
138
+
139
+ - [Website](https://datapond-db.github.io/website)
140
+ - [Registry](https://github.com/datapond-db/registry) -- catalog of available databases
141
+ - [Source](https://github.com/datapond-db/datapond-python)
142
+
143
+ ## Contributing
144
+
145
+ Contributions are welcome. To add a new database to datapond, submit a pull request to the [registry](https://github.com/datapond-db/registry) repository.
146
+
147
+ ## License
148
+
149
+ MIT
@@ -0,0 +1,119 @@
1
+ # datapond
2
+
3
+ **Public data, instantly queryable.**
4
+
5
+ datapond gives you instant SQL access to curated DuckDB databases built from public data sources -- no downloads, no API keys, no setup.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install datapond
11
+ ```
12
+
13
+ For faster downloads from Hugging Face:
14
+
15
+ ```bash
16
+ pip install "datapond[download]"
17
+ ```
18
+
19
+ ## Quick start
20
+
21
+ ### Browse available databases
22
+
23
+ ```python
24
+ import datapond
25
+
26
+ # See what's available
27
+ datapond.list()
28
+
29
+ # Get details about a specific database
30
+ datapond.info("eoir")
31
+ ```
32
+
33
+ ### Connect and query
34
+
35
+ ```python
36
+ import datapond
37
+
38
+ con = datapond.connect("eoir")
39
+ con.sql("SHOW TABLES").show()
40
+ con.sql("SELECT * FROM cases LIMIT 10").show()
41
+ ```
42
+
43
+ The connection is a standard [`duckdb.DuckDBPyConnection`](https://duckdb.org/docs/api/python/overview) -- use it however you normally use DuckDB, including with pandas and Polars.
44
+
45
+ ```python
46
+ df = con.sql("SELECT * FROM cases LIMIT 1000").df() # pandas
47
+ pl = con.sql("SELECT * FROM cases LIMIT 1000").pl() # polars
48
+ ```
49
+
50
+ ### Download for offline use
51
+
52
+ ```python
53
+ datapond.download("eoir")
54
+
55
+ # Later, connect locally
56
+ con = datapond.connect("eoir", local=True)
57
+ ```
58
+
59
+ ### Update a local database
60
+
61
+ ```python
62
+ datapond.update("eoir")
63
+ ```
64
+
65
+ ## Multi-database queries
66
+
67
+ Attach multiple databases at once and query across them:
68
+
69
+ ```python
70
+ con = datapond.connect(["eoir", "foia"])
71
+
72
+ # Tables are namespaced by database ID
73
+ con.sql("SELECT * FROM eoir.cases LIMIT 5").show()
74
+ con.sql("SELECT * FROM foia.requests LIMIT 5").show()
75
+ ```
76
+
77
+ ## CLI
78
+
79
+ datapond also includes a command-line interface:
80
+
81
+ ```bash
82
+ # List available databases
83
+ datapond list
84
+
85
+ # Show database details
86
+ datapond info eoir
87
+
88
+ # Download a database
89
+ datapond download eoir --path ./data/
90
+
91
+ # Open an interactive SQL session
92
+ datapond connect eoir
93
+ ```
94
+
95
+ ## How it works
96
+
97
+ datapond connects to read-only DuckDB files hosted remotely via the [httpfs extension](https://duckdb.org/docs/extensions/httpfs/overview). The [registry](https://github.com/datapond-db/registry) maintains a catalog of available databases with their URLs and metadata.
98
+
99
+ When you call `datapond.connect()`, it:
100
+ 1. Looks up the database in the registry
101
+ 2. Installs and loads the httpfs extension
102
+ 3. Attaches the remote DuckDB file as read-only
103
+ 4. Returns a connection ready for queries
104
+
105
+ No data is downloaded unless you explicitly call `datapond.download()`.
106
+
107
+ ## Links
108
+
109
+ - [Website](https://datapond-db.github.io/website)
110
+ - [Registry](https://github.com/datapond-db/registry) -- catalog of available databases
111
+ - [Source](https://github.com/datapond-db/datapond-python)
112
+
113
+ ## Contributing
114
+
115
+ Contributions are welcome. To add a new database to datapond, submit a pull request to the [registry](https://github.com/datapond-db/registry) repository.
116
+
117
+ ## License
118
+
119
+ MIT
@@ -0,0 +1,44 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "datapond"
7
+ version = "0.1.0"
8
+ description = "Instantly connect to curated DuckDB databases built from public data"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.9"
12
+ authors = [
13
+ { name = "Ian Nason" },
14
+ ]
15
+ keywords = ["duckdb", "data", "public-data", "government-data", "database"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Science/Research",
19
+ "Intended Audience :: Developers",
20
+ "License :: OSI Approved :: MIT License",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Programming Language :: Python :: 3.13",
27
+ "Topic :: Database",
28
+ "Topic :: Scientific/Engineering",
29
+ ]
30
+ dependencies = [
31
+ "duckdb>=0.9.0",
32
+ "requests",
33
+ ]
34
+
35
+ [project.optional-dependencies]
36
+ download = ["huggingface_hub"]
37
+
38
+ [project.urls]
39
+ Homepage = "https://datapond-db.github.io/website"
40
+ Repository = "https://github.com/datapond-db/datapond-python"
41
+ Registry = "https://github.com/datapond-db/registry"
42
+
43
+ [project.scripts]
44
+ datapond = "datapond.cli:main"
@@ -0,0 +1,42 @@
1
+ """
2
+ datapond - Public data, instantly queryable.
3
+
4
+ Instantly connect to curated DuckDB databases built from public data.
5
+ """
6
+
7
+ __version__ = "0.1.0"
8
+
9
+ from datapond.registry import list_databases, get_database, get_registry
10
+ from datapond.connection import connect
11
+ from datapond.download import download, update
12
+ from datapond.describe import describe
13
+
14
+
15
def list():
    """Return the IDs of every database in the registry.

    This is a thin public alias for ``list_databases()``. Note that the name
    shadows the ``list`` builtin inside this module.
    """
    database_ids = list_databases()
    return database_ids
18
+
19
+
20
def info(db_id: str):
    """Print a short, human-readable summary of one registry entry.

    The summary shows the database name, an abbreviated row count, the table
    count and size, the data source, and -- when present in the entry -- the
    GitHub and Hugging Face links, the license, and the last-updated value.

    Args:
        db_id: The database ID to look up in the registry.

    Raises:
        ValueError: Propagated from ``get_database`` when the ID is unknown.
        KeyError: If the registry entry has no ``name`` field.
    """
    db = get_database(db_id)

    # ``or 0`` also covers an explicit ``"rows": null`` in the registry, which
    # ``db.get("rows", 0)`` would hand back as None and crash the comparison.
    rows = db.get("rows") or 0
    if rows >= 1_000_000:
        rows_str = f"{rows / 1_000_000:.1f}M"
    elif rows >= 1_000:
        rows_str = f"{rows / 1_000:.1f}K"
    else:
        rows_str = str(rows)

    print(f" {db['name']}")
    print(f" {rows_str} rows | {db.get('tables', '?')} tables | {db.get('size_gb', '?')} GB")
    print(f" Source: {db.get('source', 'Unknown')}")

    # Optional fields are printed only when the entry carries a truthy value.
    # Links are shown without their scheme to keep the line short.
    if db.get("github"):
        print(f" GitHub: {db['github'].replace('https://', '')}")
    if db.get("huggingface"):
        print(f" Hugging Face: {db['huggingface'].replace('https://', '')}")
    if db.get("license"):
        print(f" License: {db['license']}")
    if db.get("updated"):
        print(f" Updated: {db['updated']}")
@@ -0,0 +1,168 @@
1
+ """
2
+ Command-line interface for datapond.
3
+ """
4
+
5
+ import argparse
6
+ import shutil
7
+ import subprocess
8
+ import sys
9
+
10
+
11
def main(argv=None):
    """Entry point for the ``datapond`` command-line interface.

    Parses the sub-command (``list``, ``info``, ``download``, ``describe`` or
    ``connect``) and dispatches to the matching ``_cmd_*`` helper. When no
    sub-command is given, the help text is printed and the process exits with
    status 1.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is what the console-script
            entry point relies on; passing a list allows programmatic use.
    """
    parser = argparse.ArgumentParser(
        prog="datapond",
        description="Public data, instantly queryable.",
    )
    subparsers = parser.add_subparsers(dest="command")

    # datapond list
    subparsers.add_parser("list", help="List all available databases")

    # datapond info <db_id>
    info_parser = subparsers.add_parser("info", help="Show database details")
    info_parser.add_argument("db_id", help="Database ID")

    # datapond download <db_id> [--path PATH]
    dl_parser = subparsers.add_parser("download", help="Download a database")
    dl_parser.add_argument("db_id", help="Database ID")
    dl_parser.add_argument("--path", default=None, help="Destination path")

    # datapond describe <db_id> [--table TABLE] [--search PATTERN]
    desc_parser = subparsers.add_parser(
        "describe", help="Describe tables, columns, and join keys"
    )
    desc_parser.add_argument("db_id", help="Database ID")
    desc_parser.add_argument("--table", default=None, help="Show columns for a specific table")
    desc_parser.add_argument("--search", default=None, help="Search column names")

    # datapond connect <db_id>
    connect_parser = subparsers.add_parser(
        "connect", help="Open an interactive session with a database"
    )
    connect_parser.add_argument("db_id", help="Database ID")

    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        sys.exit(1)

    if args.command == "list":
        _cmd_list()
    elif args.command == "info":
        _cmd_info(args.db_id)
    elif args.command == "download":
        _cmd_download(args.db_id, args.path)
    elif args.command == "describe":
        _cmd_describe(args.db_id, args.table, args.search)
    elif args.command == "connect":
        _cmd_connect(args.db_id)
+
61
+
62
def _cmd_list():
    """Print every database ID in the registry, one per line.

    Prints a short notice instead when the registry lists no databases.
    """
    from datapond.registry import list_databases

    db_ids = list_databases()
    if db_ids:
        for entry in db_ids:
            print(entry)
    else:
        print("No databases found in registry.")
+
72
+
73
def _cmd_info(db_id):
    """Print the registry summary for ``db_id``.

    Failures (for example an unknown database ID) are reported on stderr and
    end the process with status 1, matching the other ``_cmd_*`` handlers
    instead of surfacing a traceback.
    """
    import datapond

    try:
        datapond.info(db_id)
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
+
78
+
79
def _cmd_download(db_id, path):
    """Download ``db_id`` to ``path``, reporting any failure on stderr.

    Any exception raised by the download is printed as ``Error: ...`` and the
    process exits with status 1.
    """
    from datapond.download import download as fetch_database

    try:
        fetch_database(db_id, path=path)
    except Exception as err:
        message = f"Error: {err}"
        print(message, file=sys.stderr)
        sys.exit(1)
+
88
+
89
def _cmd_describe(db_id, table, search):
    """Describe ``db_id``, optionally narrowed to one table or a column search.

    Any exception raised while describing is printed as ``Error: ...`` on
    stderr and the process exits with status 1.
    """
    from datapond.describe import describe as describe_database

    try:
        describe_database(db_id, table=table, search=search)
    except Exception as err:
        message = f"Error: {err}"
        print(message, file=sys.stderr)
        sys.exit(1)
+
98
+
99
def _print_result_table(columns, rows):
    """Print ``rows`` as a plain-text table under ``columns``, then a row count."""
    col_widths = [len(c) for c in columns]
    for row in rows:
        for i, val in enumerate(row):
            col_widths[i] = max(col_widths[i], len(str(val)))

    print(" | ".join(c.ljust(col_widths[i]) for i, c in enumerate(columns)))
    print("-+-".join("-" * w for w in col_widths))
    for row in rows:
        print(" | ".join(str(v).ljust(col_widths[i]) for i, v in enumerate(row)))
    print(f"({len(rows)} rows)")


def _cmd_connect(db_id):
    """Open an interactive SQL session against a registry database.

    If a ``duckdb`` executable is on PATH it is launched with an init command
    that loads httpfs, attaches the database read-only and makes it the
    default. Otherwise a minimal read-eval-print loop runs in-process on a
    connection from ``datapond.connection.connect``.

    Args:
        db_id: The database ID to open.
    """
    from datapond.registry import get_database

    db = get_database(db_id)

    # Try the duckdb CLI first
    duckdb_bin = shutil.which("duckdb")
    if duckdb_bin:
        # Double any embedded quote so neither value can end its SQL literal or
        # identifier early; quoting the alias also lets IDs containing
        # characters such as '-' parse as a single identifier.
        url_literal = "'" + str(db["attach_url"]).replace("'", "''") + "'"
        alias = '"' + db_id.replace('"', '""') + '"'
        init_sql = (
            "INSTALL httpfs; LOAD httpfs; "
            f"ATTACH {url_literal} AS {alias} (READ_ONLY); "
            f"USE {alias};"
        )
        try:
            subprocess.run([duckdb_bin, "-cmd", init_sql], check=True)
        except subprocess.CalledProcessError as e:
            # Mirror the CLI's own exit status.
            sys.exit(e.returncode)
        return

    # Fallback: Python REPL
    print(f"duckdb CLI not found. Starting Python REPL for {db_id}...")
    print("Type SQL queries, or 'exit' to quit.\n")

    from datapond.connection import connect

    con = connect(db_id)
    try:
        while True:
            try:
                query = input(f"{db_id}> ")
            except (EOFError, KeyboardInterrupt):
                # Ctrl-D / Ctrl-C at the prompt ends the session cleanly.
                print()
                break

            query = query.strip()
            if not query:
                continue
            if query.lower() in ("exit", "quit", ".exit", ".quit"):
                break

            try:
                result = con.execute(query)
                rows = result.fetchall()
                if rows:
                    columns = [desc[0] for desc in result.description]
                    _print_result_table(columns, rows)
                else:
                    print("OK")
            except Exception as e:
                # A failed statement must not end the session.
                print(f"Error: {e}")
    finally:
        # Release the connection however the loop ends.
        con.close()
+
166
+
167
+ if __name__ == "__main__":
168
+ main()
@@ -0,0 +1,84 @@
1
+ """
2
+ Database connection management for datapond.
3
+
4
+ Provides DuckDB connections to remote or local databases.
5
+ """
6
+
7
+ from pathlib import Path
8
+ from typing import Union
9
+
10
+ import duckdb
11
+
12
+ from datapond.registry import get_database
13
+
14
+
15
def connect(db_id: Union[str, list], local: bool = False):
    """Open a DuckDB connection with one or more datapond databases attached.

    Args:
        db_id: Either one database ID (``str``) or a ``list`` of IDs.
        local: When True, attach the already-downloaded
            ``~/.datapond/{db_id}.duckdb`` files instead of the remote URLs.

    Returns:
        The connection object produced by ``duckdb.connect()`` with the
        requested database(s) attached.

    Raises:
        TypeError: If ``db_id`` is neither a string nor a list.
    """
    if not isinstance(db_id, (str, list)):
        raise TypeError(f"db_id must be a string or list, got {type(db_id).__name__}")

    # A single ID also becomes the default database; a list is only attached.
    opener = _connect_single if isinstance(db_id, str) else _connect_multi
    return opener(db_id, local=local)
31
+
32
+
33
def _connect_single(db_id: str, local: bool = False):
    """Attach one database read-only and make it the connection's default.

    Args:
        db_id: The database ID; also used as the attached database's alias.
        local: When True, attach the file returned by ``_local_path`` instead
            of the registry's ``attach_url``.

    Returns:
        The open DuckDB connection.

    Raises:
        ValueError: Propagated from ``get_database`` for an unknown ID.
        FileNotFoundError: Propagated from ``_local_path`` when ``local`` is
            True and the file has not been downloaded.
    """
    db = get_database(db_id)

    # Resolve the source before opening a connection, so a missing local file
    # or registry key cannot leave an open connection behind.
    source = _local_path(db_id) if local else db["attach_url"]

    # Double any embedded quote so the path/URL stays one SQL string literal
    # and the alias one identifier (e.g. a home directory containing "'").
    source_literal = "'" + str(source).replace("'", "''") + "'"
    alias = '"' + db_id.replace('"', '""') + '"'

    con = duckdb.connect()
    try:
        if not local:
            # Remote files are read over HTTP(S) through the httpfs extension.
            con.install_extension("httpfs")
            con.load_extension("httpfs")
        con.execute(f"ATTACH {source_literal} AS {alias} (READ_ONLY)")
        con.execute(f"USE {alias}")
    except Exception:
        # Do not leak the connection when attaching fails.
        con.close()
        raise
    return con
49
+
50
+
51
def _connect_multi(db_ids: list, local: bool = False):
    """Attach several databases read-only, each under its own ID as alias.

    Unlike ``_connect_single`` no default database is selected, so tables are
    addressed as ``{db_id}.{table}``.

    Args:
        db_ids: Non-empty list of database IDs.
        local: When True, attach the files returned by ``_local_path`` instead
            of the registry's ``attach_url`` values.

    Returns:
        The open DuckDB connection.

    Raises:
        ValueError: If ``db_ids`` is empty, or propagated from
            ``get_database`` for an unknown ID.
        FileNotFoundError: Propagated from ``_local_path`` when ``local`` is
            True and a file has not been downloaded.
    """
    if not db_ids:
        raise ValueError("db_id list must not be empty")

    con = duckdb.connect()
    try:
        installed_httpfs = False
        for db_id in db_ids:
            db = get_database(db_id)

            if local:
                source = _local_path(db_id)
            else:
                if not installed_httpfs:
                    # Load httpfs once, on the first remote attachment.
                    con.install_extension("httpfs")
                    con.load_extension("httpfs")
                    installed_httpfs = True
                source = db["attach_url"]

            # Double any embedded quote so the path/URL stays one SQL string
            # literal and the alias one identifier.
            source_literal = "'" + str(source).replace("'", "''") + "'"
            alias = '"' + db_id.replace('"', '""') + '"'
            con.execute(f"ATTACH {source_literal} AS {alias} (READ_ONLY)")
    except Exception:
        # A failure part-way through must not leak the connection.
        con.close()
        raise
    return con
74
+
75
+
76
+ def _local_path(db_id: str) -> str:
77
+ """Return the expected local path for a downloaded database file."""
78
+ path = Path.home() / ".datapond" / f"{db_id}.duckdb"
79
+ if not path.exists():
80
+ raise FileNotFoundError(
81
+ f"Local database file not found: {path}\n"
82
+ f"Download it first with: datapond.download('{db_id}')"
83
+ )
84
+ return str(path)
@@ -0,0 +1,144 @@
1
+ """
2
+ Describe databases, tables, and columns using the _columns data dictionary.
3
+ """
4
+
5
+ from datapond.connection import connect
6
+
7
+
8
def describe(db_id: str, table: str = None, search: str = None):
    """Print a description of a database's tables and columns.

    Exactly one view is printed, chosen in this order: a column-name search
    when ``search`` is given, the columns of one table when ``table`` is
    given, otherwise the list of all tables.

    Args:
        db_id: The database ID.
        table: If provided, show columns for this specific table.
        search: If provided, search column names across all tables. Takes
            precedence over ``table``.
    """
    con = connect(db_id)
    try:
        if search:
            _search_columns(con, search)
        elif table:
            _describe_table(con, table)
        else:
            _describe_database(con)
    finally:
        # The connection is private to this call; always release it.
        con.close()
24
+
25
+
26
+ def _describe_database(con):
27
+ """Print all tables with row counts."""
28
+ try:
29
+ rows = con.execute(
30
+ "SELECT table_name, row_count, description "
31
+ "FROM _metadata "
32
+ "WHERE table_name NOT IN ('_metadata', '_columns') "
33
+ "ORDER BY table_name"
34
+ ).fetchall()
35
+ except Exception:
36
+ # Fall back to information_schema if _metadata is missing
37
+ rows = con.execute(
38
+ "SELECT table_name, NULL, NULL "
39
+ "FROM information_schema.tables "
40
+ "WHERE table_schema NOT IN ('information_schema', 'pg_catalog') "
41
+ " AND table_name NOT IN ('_metadata', '_columns') "
42
+ "ORDER BY table_name"
43
+ ).fetchall()
44
+
45
+ if not rows:
46
+ print(" No tables found.")
47
+ return
48
+
49
+ # Calculate column widths
50
+ name_w = max(len(r[0]) for r in rows)
51
+ name_w = max(name_w, 5)
52
+
53
+ for table_name, row_count, description in rows:
54
+ count_str = f"{row_count:>12,}" if row_count else " "
55
+ desc_str = f" {description}" if description else ""
56
+ print(f" {table_name:<{name_w}} {count_str} rows{desc_str}")
57
+
58
+
59
+ def _describe_table(con, table):
60
+ """Print all columns in a table with types, null%, examples, join hints."""
61
+ try:
62
+ cols = con.execute(
63
+ "SELECT column_name, data_type, null_pct, example_value, join_hint "
64
+ "FROM _columns WHERE table_name = ? ORDER BY rowid",
65
+ [table],
66
+ ).fetchall()
67
+ except Exception:
68
+ # Fall back to information_schema
69
+ cols = con.execute(
70
+ "SELECT column_name, data_type, NULL, NULL, NULL "
71
+ "FROM information_schema.columns "
72
+ "WHERE table_name = ? ORDER BY ordinal_position",
73
+ [table],
74
+ ).fetchall()
75
+
76
+ if not cols:
77
+ print(f" Table '{table}' not found.")
78
+ return
79
+
80
+ # Get row count
81
+ try:
82
+ meta = con.execute(
83
+ "SELECT row_count FROM _metadata WHERE table_name = ?", [table]
84
+ ).fetchone()
85
+ if meta and meta[0]:
86
+ print(f" {table} ({meta[0]:,} rows)")
87
+ else:
88
+ print(f" {table}")
89
+ except Exception:
90
+ print(f" {table}")
91
+
92
+ print()
93
+
94
+ # Calculate column widths
95
+ name_w = max(max(len(c[0]) for c in cols), 6)
96
+ type_w = max(max(len(c[1]) for c in cols), 4)
97
+
98
+ header = f" {'Column':<{name_w}} {'Type':<{type_w}} {'Nulls':>6} {'Example':<40} Join"
99
+ print(header)
100
+ print(f" {'-' * name_w} {'-' * type_w} {'-' * 6} {'-' * 40} {'-' * 4}")
101
+
102
+ for col_name, dtype, null_pct, example, join_hint in cols:
103
+ null_str = f"{null_pct:5.1f}%" if null_pct is not None else " "
104
+ ex_str = (example[:40] if example else "")
105
+ join_str = join_hint if join_hint else ""
106
+ print(f" {col_name:<{name_w}} {dtype:<{type_w}} {null_str} {ex_str:<40} {join_str}")
107
+
108
+
109
+ def _search_columns(con, pattern):
110
+ """Search column names across all tables."""
111
+ pattern_upper = pattern.upper()
112
+
113
+ try:
114
+ cols = con.execute(
115
+ "SELECT table_name, column_name, data_type, join_hint "
116
+ "FROM _columns "
117
+ "WHERE UPPER(column_name) LIKE '%' || ? || '%' "
118
+ "ORDER BY table_name, column_name",
119
+ [pattern_upper],
120
+ ).fetchall()
121
+ except Exception:
122
+ cols = con.execute(
123
+ "SELECT table_name, column_name, data_type, NULL "
124
+ "FROM information_schema.columns "
125
+ "WHERE table_schema NOT IN ('information_schema', 'pg_catalog') "
126
+ " AND UPPER(column_name) LIKE '%' || ? || '%' "
127
+ "ORDER BY table_name, column_name",
128
+ [pattern_upper],
129
+ ).fetchall()
130
+
131
+ if not cols:
132
+ print(f" No columns matching '{pattern}'.")
133
+ return
134
+
135
+ print(f" {len(cols)} columns matching '{pattern}':")
136
+ print()
137
+
138
+ tbl_w = max(len(c[0]) for c in cols)
139
+ name_w = max(len(c[1]) for c in cols)
140
+ type_w = max(len(c[2]) for c in cols)
141
+
142
+ for table_name, col_name, dtype, join_hint in cols:
143
+ join_str = f" ({join_hint})" if join_hint else ""
144
+ print(f" {table_name:<{tbl_w}} {col_name:<{name_w}} {dtype:<{type_w}}{join_str}")
@@ -0,0 +1,176 @@
1
+ """
2
+ Download management for datapond databases.
3
+
4
+ Supports downloading via huggingface_hub (preferred) or requests (fallback).
5
+ """
6
+
7
+ import os
8
+ import shutil
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+
12
+ import requests
13
+
14
+ from datapond.registry import get_database
15
+
16
+ DATAPOND_DIR = Path.home() / ".datapond"
17
+
18
+
19
def download(db_id: str, path: str = None) -> Path:
    """Download a database file.

    When the registry entry has a ``huggingface`` URL, a download through
    ``huggingface_hub`` is attempted first. If that is unavailable or fails,
    the file is fetched with ``requests`` from the entry's ``download_url``,
    or from ``attach_url`` when no ``download_url`` is set.

    Args:
        db_id: The database ID to download.
        path: Destination path. Defaults to ~/.datapond/{db_id}.duckdb.
            If path is an existing directory, or ends with a path separator
            (e.g. ``./data/``), saves as {path}/{db_id}.duckdb; missing
            directories are created.

    Returns:
        The path to the downloaded file.

    Raises:
        ValueError: If the registry entry offers no URL to download from, or
            (propagated from ``get_database``) the ID is unknown.
    """
    db = get_database(db_id)
    filename = f"{db_id}.duckdb"

    if path is None:
        dest = DATAPOND_DIR / filename
    else:
        dest = Path(path)
        # Path() drops a trailing separator, so inspect the raw argument: a
        # path written as "./data/" names a directory even if it does not
        # exist yet, and must not become a file called "data".
        names_directory = str(path).endswith(("/", os.sep))
        if names_directory or dest.is_dir():
            dest = dest / filename

    dest.parent.mkdir(parents=True, exist_ok=True)

    hf_url = db.get("huggingface")
    if hf_url:
        repo_id = _extract_hf_repo_id(hf_url)
        if _try_hf_download(repo_id, filename, dest):
            print(f"Downloaded {db_id} to {dest}")
            return dest

    # Fallback: download via requests from the attach_url or a direct link
    download_url = db.get("download_url") or db.get("attach_url")
    if not download_url:
        raise ValueError(f"No download URL available for '{db_id}'")

    _download_with_requests(download_url, dest, db_id)
    print(f"Downloaded {db_id} to {dest}")
    return dest
57
+
58
+
59
def update(db_id: str) -> Path:
    """Re-download a database if the remote version is newer.

    The registry entry's ``updated`` value is compared with the modification
    time of ``~/.datapond/{db_id}.duckdb``. The file is downloaded again when
    it is missing, when the local copy is older, or when the registry gives
    no usable ``updated`` value.

    Args:
        db_id: The database ID to update.

    Returns:
        The path to the local file.
    """
    db = get_database(db_id)
    local_path = DATAPOND_DIR / f"{db_id}.duckdb"

    if not local_path.exists():
        print(f"No local copy found. Downloading {db_id}...")
        return download(db_id)

    remote_updated = db.get("updated")
    remote_dt = None
    if remote_updated:
        text = str(remote_updated)
        # datetime.fromisoformat() only accepts a trailing "Z" from Python
        # 3.11 on; rewrite it as an explicit UTC offset for older versions.
        if text.endswith(("Z", "z")):
            text = text[:-1] + "+00:00"
        try:
            remote_dt = datetime.fromisoformat(text)
        except ValueError:
            # Unparseable timestamp: handle it like a missing one and refresh.
            remote_dt = None

    if remote_dt is not None:
        if remote_dt.tzinfo is None:
            # Naive registry timestamps are interpreted as UTC.
            remote_dt = remote_dt.replace(tzinfo=timezone.utc)
        local_mtime = datetime.fromtimestamp(
            local_path.stat().st_mtime, tz=timezone.utc
        )
        if local_mtime >= remote_dt:
            print(f"{db_id} is already up to date.")
            return local_path

    print(f"Updating {db_id}...")
    return download(db_id)
89
+
90
+
91
+ def _extract_hf_repo_id(hf_url: str) -> str:
92
+ """Extract the repo ID from a Hugging Face URL.
93
+
94
+ Example: "https://huggingface.co/datasets/Nason/eoir-database"
95
+ -> "Nason/eoir-database"
96
+ """
97
+ parts = hf_url.rstrip("/").split("/")
98
+ # URL format: https://huggingface.co/datasets/{org}/{repo}
99
+ # We want the last two path segments
100
+ return "/".join(parts[-2:])
101
+
102
+
103
+ def _try_hf_download(repo_id: str, filename: str, dest: Path) -> bool:
104
+ """Try downloading via huggingface_hub. Returns True on success."""
105
+ try:
106
+ from huggingface_hub import hf_hub_download
107
+ except ImportError:
108
+ return False
109
+
110
+ try:
111
+ cached_path = hf_hub_download(
112
+ repo_id=repo_id,
113
+ filename=filename,
114
+ repo_type="dataset",
115
+ )
116
+ dest.parent.mkdir(parents=True, exist_ok=True)
117
+ shutil.copy2(cached_path, dest)
118
+ return True
119
+ except Exception as e:
120
+ print(f"huggingface_hub download failed ({e}), falling back to requests...")
121
+ return False
122
+
123
+
124
def _download_with_requests(url: str, dest: Path, db_id: str):
    """Download ``url`` to ``dest`` with requests, showing progress.

    Progress is drawn with tqdm when it can be imported, otherwise with plain
    printed percentages. The data is streamed into a sibling ``*.part`` file
    that is moved onto ``dest`` only after the transfer completes, so a failed
    or interrupted download never leaves a truncated file at ``dest``.

    Args:
        url: Address of the file to download.
        dest: Final location of the downloaded file.
        db_id: Label used in the progress output.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # Probe for tqdm on its own, so the ImportError handler covers only the
    # import and never an error raised while downloading.
    try:
        import tqdm  # noqa: F401 -- availability check only
    except ImportError:
        write_stream = _download_with_print
    else:
        write_stream = _download_with_tqdm

    partial = dest.with_name(dest.name + ".part")
    try:
        # The context manager releases the streamed connection when done.
        with requests.get(url, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            total = int(resp.headers.get("content-length", 0))
            write_stream(resp, partial, total, db_id)
        os.replace(partial, dest)
    except BaseException:
        # Remove the incomplete file (also on Ctrl-C), then re-raise.
        try:
            partial.unlink()
        except OSError:
            pass
        raise
138
+
139
+
140
def _download_with_tqdm(resp, dest: Path, total: int, db_id: str):
    """Stream ``resp`` into ``dest`` while advancing a tqdm progress bar.

    Args:
        resp: Response object exposing ``iter_content(chunk_size=...)``.
        dest: File to write, opened in binary mode.
        total: Expected size in bytes; 0 gives a bar without a known total.
        db_id: Label shown next to the bar.
    """
    from tqdm import tqdm

    bar_options = {
        "total": total or None,
        "unit": "B",
        "unit_scale": True,
        "desc": db_id,
    }
    with open(dest, "wb") as sink, tqdm(**bar_options) as progress:
        for block in resp.iter_content(chunk_size=8192):
            sink.write(block)
            progress.update(len(block))
154
+
155
+
156
+ def _download_with_print(resp, dest: Path, total: int, db_id: str):
157
+ """Download with simple printed progress."""
158
+ downloaded = 0
159
+ last_pct = -1
160
+
161
+ with open(dest, "wb") as f:
162
+ for chunk in resp.iter_content(chunk_size=8192):
163
+ f.write(chunk)
164
+ downloaded += len(chunk)
165
+ if total > 0:
166
+ pct = int(downloaded * 100 / total)
167
+ if pct != last_pct and pct % 10 == 0:
168
+ print(f" {db_id}: {pct}%")
169
+ last_pct = pct
170
+
171
+ if total > 0:
172
+ size_mb = total / (1024 * 1024)
173
+ print(f" {db_id}: complete ({size_mb:.1f} MB)")
174
+ else:
175
+ size_mb = downloaded / (1024 * 1024)
176
+ print(f" {db_id}: complete ({size_mb:.1f} MB)")
@@ -0,0 +1,81 @@
1
+ """
2
+ Registry client for datapond.
3
+
4
+ Fetches and caches the database registry from GitHub.
5
+ """
6
+
7
+ import json
8
+ import os
9
+ import time
10
+ from pathlib import Path
11
+
12
+ import requests
13
+
14
+ REGISTRY_URL = (
15
+ "https://raw.githubusercontent.com/datapond-db/registry/main/registry.json"
16
+ )
17
+ CACHE_DIR = Path.home() / ".datapond"
18
+ CACHE_FILE = CACHE_DIR / "registry.json"
19
+ CACHE_TTL = 3600 # 1 hour in seconds
20
+
21
+
22
def _cache_is_fresh() -> bool:
    """Return True when the cached registry file exists and is younger than CACHE_TTL.

    Age is measured from the file's modification time.
    """
    if CACHE_FILE.exists():
        now = time.time()
        age_seconds = now - CACHE_FILE.stat().st_mtime
        return age_seconds < CACHE_TTL
    return False
28
+
29
+
30
def _fetch_registry() -> dict:
    """Fetch the registry from GitHub and cache it locally.

    If the request fails or the response body is not valid JSON, the existing
    cache file is returned instead, however old it is.

    Returns:
        The parsed registry.

    Raises:
        ConnectionError: If the registry cannot be fetched or parsed and no
            cache file exists.
    """
    try:
        resp = requests.get(REGISTRY_URL, timeout=15)
        resp.raise_for_status()
        # Parse inside the try: a truncated or non-JSON body raises a
        # ValueError subclass and should use the stale-cache fallback too.
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # If we have a stale cache, use it rather than failing
        if CACHE_FILE.exists():
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        raise ConnectionError(
            f"Failed to fetch registry and no local cache available: {e}"
        ) from e

    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # Write to a temporary sibling and swap it into place, so an interrupted
    # write cannot leave a half-written cache file behind.
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    os.replace(tmp_file, CACHE_FILE)

    return data
51
+
52
+
53
def get_registry() -> dict:
    """Return the parsed registry dict, using the cache if it is fresh.

    A fresh cache file that cannot be read or parsed is ignored and the
    registry is fetched again, so a corrupt file does not break every call
    until it expires.

    Returns:
        The parsed registry.
    """
    if _cache_is_fresh():
        try:
            with open(CACHE_FILE, "r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError):
            # json.JSONDecodeError is a ValueError; fall through and refetch.
            pass
    return _fetch_registry()
59
+
60
+
61
def get_database(db_id: str) -> dict:
    """Return a single database entry from the registry.

    Args:
        db_id: The ID to match against each entry's ``id`` field.

    Returns:
        The first registry entry whose ``id`` equals ``db_id``.

    Raises:
        ValueError: If the database ID is not found; the message lists the
            IDs that are available.
    """
    registry = get_registry()
    databases = registry.get("databases", [])
    for db in databases:
        if db.get("id") == db_id:
            return db

    # Skip entries without an ID and stringify the rest, so building the
    # message cannot raise TypeError and hide the intended ValueError.
    available = [str(db["id"]) for db in databases if db.get("id") is not None]
    raise ValueError(
        f"Database '{db_id}' not found in registry. "
        f"Available databases: {', '.join(available)}"
    )
76
+
77
+
78
def list_databases() -> list:
    """Return the ``id`` of every entry under the registry's ``databases`` key.

    An absent ``databases`` key yields an empty list.
    """
    entries = get_registry().get("databases", [])
    database_ids = []
    for entry in entries:
        database_ids.append(entry["id"])
    return database_ids