quackpipe 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quackpipe/__init__.py +45 -0
- quackpipe/builder.py +58 -0
- quackpipe/cli.py +28 -0
- quackpipe/commands/__init__.py +0 -0
- quackpipe/commands/common.py +43 -0
- quackpipe/commands/generate_sqlmesh_config.py +85 -0
- quackpipe/commands/ui.py +74 -0
- quackpipe/config.py +35 -0
- quackpipe/core.py +123 -0
- quackpipe/etl_utils.py +110 -0
- quackpipe/exceptions.py +15 -0
- quackpipe/secrets.py +100 -0
- quackpipe/sources/__init__.py +3 -0
- quackpipe/sources/azure_blob.py +76 -0
- quackpipe/sources/base.py +43 -0
- quackpipe/sources/ducklake/__init__.py +115 -0
- quackpipe/sources/ducklake/providers.py +108 -0
- quackpipe/sources/postgres.py +68 -0
- quackpipe/sources/s3.py +77 -0
- quackpipe/sources/sqlite.py +42 -0
- quackpipe/test_utils/__init__.py +0 -0
- quackpipe/test_utils/data_fixtures.py +113 -0
- quackpipe/test_utils/fixtures.py +478 -0
- quackpipe/utils.py +59 -0
- quackpipe-0.6.1.dist-info/METADATA +193 -0
- quackpipe-0.6.1.dist-info/RECORD +30 -0
- quackpipe-0.6.1.dist-info/WHEEL +5 -0
- quackpipe-0.6.1.dist-info/entry_points.txt +2 -0
- quackpipe-0.6.1.dist-info/licenses/LICENSE +21 -0
- quackpipe-0.6.1.dist-info/top_level.txt +1 -0
quackpipe/__init__.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""
|
|
2
|
+
quackpipe - A configuration-driven ETL helper for DuckDB.
|
|
3
|
+
|
|
4
|
+
This library provides simple, high-level functions to connect DuckDB
|
|
5
|
+
to various data sources based on a YAML configuration file or a
|
|
6
|
+
programmatic builder.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
# Expose the primary user-facing functions and classes.
|
|
13
|
+
from .builder import QuackpipeBuilder
|
|
14
|
+
from .config import SourceConfig, SourceType
|
|
15
|
+
from .core import session, with_session
|
|
16
|
+
from .exceptions import ConfigError, QuackpipeError, SecretError
|
|
17
|
+
from .secrets import configure_secret_provider
|
|
18
|
+
|
|
19
|
+
# Set up the library's top-level logger
|
|
20
|
+
# Resolve the initial level from QUACKPIPE_LOG_LEVEL (case-insensitive,
# surrounding whitespace ignored). Only genuine numeric level constants such
# as logging.DEBUG are accepted; anything else falls back to WARNING.
# The isinstance check matters: a name like "BASIC_FORMAT" is a *string*
# attribute of the logging module and would make setLevel() raise on import.
_default_level = os.getenv('QUACKPIPE_LOG_LEVEL', 'WARNING').strip().upper()
_resolved_level = getattr(logging, _default_level, None)
if not isinstance(_resolved_level, int):
    _resolved_level = logging.WARNING

_root_logger = logging.getLogger(__name__)
_root_logger.setLevel(_resolved_level)
# A NullHandler keeps the library quiet unless the application configures logging.
_root_logger.addHandler(logging.NullHandler())
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Public API of the package: the names exported by `from quackpipe import *`.
# Every entry corresponds to a name imported at the top of this module.
__all__ = [
    # Core API
    "session",
    "with_session",

    # Builder API
    "QuackpipeBuilder",

    # Configuration Types
    "SourceConfig",
    "SourceType",

    # Secret Management
    "configure_secret_provider",

    # Exceptions
    "QuackpipeError",
    "ConfigError",
    "SecretError",
]
|
quackpipe/builder.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
The Builder API for programmatically constructing a quackpipe session.
|
|
3
|
+
"""
|
|
4
|
+
from typing import Any, Self
|
|
5
|
+
|
|
6
|
+
from .config import SourceConfig, SourceType
|
|
7
|
+
from .core import session as core_session # Avoid circular import
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class QuackpipeBuilder:
    """A fluent builder for creating a quackpipe session without a YAML file.

    Sources are accumulated with :meth:`add_source` (which returns the builder
    so calls can be chained) and then either handed to other utilities via
    :meth:`get_configs` or turned into a live session via :meth:`session`.
    """

    def __init__(self) -> None:
        # Sources in the order they were added.
        self._sources: list[SourceConfig] = []

    def add_source(
        self,
        name: str,
        type: SourceType,
        config: dict[str, Any] | None = None,
        secret_name: str | None = None,
    ) -> Self:
        """
        Adds a data source to the configuration.

        Args:
            name: The name for the data source (e.g., 'pg_main').
            type: The type of the source, using the SourceType enum.
            config: A dictionary of non-secret parameters. It is copied, so
                later changes to the caller's dict do not affect the builder.
            secret_name: The logical name of the secret bundle.

        Returns:
            The builder instance for chaining.
        """
        source = SourceConfig(
            name=name,
            type=type,
            # Shallow copy: do not alias the caller's dictionary.
            config=dict(config) if config else {},
            secret_name=secret_name,
        )
        self._sources.append(source)
        return self

    def get_configs(self) -> list[SourceConfig]:
        """
        Returns the list of SourceConfig objects that have been added to the builder.
        This is useful for passing to high-level utilities like `move_data`.

        A new list is returned on every call, so mutating it does not change
        the builder's own state.
        """
        return list(self._sources)

    def session(self, **kwargs):
        """
        Builds and enters the session context manager. Can accept the same arguments
        as the core session function, like `sources=['source_a']`.

        Returns:
            A context manager yielding a configured DuckDB connection.

        Raises:
            ValueError: If no source has been added to the builder.
        """
        if not self._sources:
            raise ValueError("Cannot build a session with no sources defined.")

        # Pass a snapshot of the built configs and any extra arguments
        # (like `sources`) to the core session manager.
        return core_session(configs=list(self._sources), **kwargs)
|
quackpipe/cli.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cli.py
|
|
3
|
+
|
|
4
|
+
This module provides the main entry point for the quackpipe command-line interface.
|
|
5
|
+
It discovers and registers commands from the 'commands' submodule.
|
|
6
|
+
"""
|
|
7
|
+
import argparse
|
|
8
|
+
|
|
9
|
+
# Import the registration functions from each command module
|
|
10
|
+
from .commands import generate_sqlmesh_config, ui
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def main(argv: list[str] | None = None):
    """Parse command-line arguments and dispatch to the selected command.

    Args:
        argv: Argument list to parse, without the program name. When ``None``
            (the default, and what the console entry point uses), argparse
            reads ``sys.argv[1:]``. Passing a list allows calling the CLI
            programmatically.
    """
    parser = argparse.ArgumentParser(description="quackpipe: A DuckDB ETL Helper CLI.")
    subparsers = parser.add_subparsers(dest="command", required=True, help="Available commands")

    # Register all available commands
    generate_sqlmesh_config.register_command(subparsers)
    ui.register_command(subparsers)

    # Each command's register_command() stores its handler as `func` via
    # set_defaults, so dispatch is a single call.
    args = parser.parse_args(argv)
    args.func(args)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
src/quackpipe/commands/common.py
|
|
3
|
+
|
|
4
|
+
This module contains common utilities shared across CLI command modules.
|
|
5
|
+
"""
|
|
6
|
+
import logging
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def setup_cli_logging(verbose_level: int = 0):
    """
    Point the "quackpipe" logger at stdout with a level derived from verbosity.

    Args:
        verbose_level (int): The verbosity level. 0 for WARNING, 1 for INFO, 2+ for DEBUG.

    Returns:
        The configured "quackpipe" logger.
    """
    # Anything at or above 2 is DEBUG, exactly 1 is INFO, everything else
    # (including 0 and negatives) stays at the quiet default.
    if verbose_level >= 2:
        chosen_level = logging.DEBUG
    else:
        chosen_level = {1: logging.INFO}.get(verbose_level, logging.WARNING)

    cli_logger = logging.getLogger("quackpipe")
    cli_logger.setLevel(chosen_level)

    # Console handler writing timestamped messages to stdout.
    console = logging.StreamHandler(sys.stdout)
    console.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))

    # Drop handlers attached directly to this logger before adding ours, so
    # repeated calls (e.g. in a notebook) do not duplicate every message.
    cli_logger.handlers.clear()
    cli_logger.addHandler(console)

    return cli_logger
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""
|
|
2
|
+
src/quackpipe/commands/generate_sqlmesh_config.py
|
|
3
|
+
|
|
4
|
+
This module contains the implementation for the 'generate-sqlmesh-config' CLI command.
|
|
5
|
+
"""
|
|
6
|
+
from argparse import _SubParsersAction
|
|
7
|
+
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
from ..config import SourceConfig
|
|
11
|
+
from ..core import SOURCE_HANDLER_REGISTRY
|
|
12
|
+
from ..secrets import configure_secret_provider, fetch_raw_secret_bundle
|
|
13
|
+
from ..utils import get_configs
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _generate_raw_sql(configs: list[SourceConfig]) -> str:
    """Render the setup SQL of every configured source and join the results.

    Sources whose type has no entry in SOURCE_HANDLER_REGISTRY are skipped,
    as are handlers that render an empty result. The remaining statements are
    separated by a blank line.
    """
    rendered = []
    for source in configs:
        handler_cls = SOURCE_HANDLER_REGISTRY.get(source.type)
        if handler_cls:
            # Handler context = the source's own settings plus its identity.
            context = dict(source.config)
            context["connection_name"] = source.name
            context["secret_name"] = source.secret_name
            rendered.append(handler_cls(context).render_sql())
    return "\n\n".join(statement for statement in rendered if statement)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _replace_secrets_with_placeholders(sql_string: str, configs: list[SourceConfig]) -> str:
    """Replace resolved secret values in ``sql_string`` with ``${ENV_VAR}`` placeholders.

    For every config that names a secret bundle, the bundle is fetched and each
    value occurring in the SQL is replaced by a placeholder built from its key.

    Args:
        sql_string: SQL text that may contain resolved secret values.
        configs: The source configurations the SQL was generated from.

    Returns:
        The SQL text with secret values replaced by placeholders.
    """
    final_sql = sql_string
    for cfg in configs:
        if not cfg.secret_name:
            continue
        resolved_secrets = fetch_raw_secret_bundle(cfg.secret_name)
        value_to_placeholder = {}
        for env_var_name, value in resolved_secrets.items():
            # Skip unset/empty values: str.replace("", x) would insert the
            # placeholder between every character of the SQL, and a None
            # value would rewrite every literal "None" in the text.
            text = "" if value is None else str(value)
            if not text:
                continue
            placeholder = f"${{{env_var_name}}}"
            value_to_placeholder[f"'{text}'"] = f"'{placeholder}'"
            value_to_placeholder[text] = placeholder
        # Longest values first, so a secret that is a substring of another
        # secret cannot corrupt the longer one before it is replaced.
        for val, placeholder in sorted(value_to_placeholder.items(), key=lambda item: len(item[0]), reverse=True):
            final_sql = final_sql.replace(val, placeholder)
    return final_sql
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _build_sqlmesh_dict(init_sql_block: str, gateway_name: str, state_db: str) -> dict:
    """Assemble the dictionary that is dumped as the SQLMesh config YAML.

    The result declares a single DuckDB gateway named ``gateway_name`` whose
    connection runs ``init_sql_block`` as its init SQL and whose state is kept
    in the DuckDB database ``state_db``; that gateway is also the default.
    """
    connection = {'type': 'duckdb', 'init': init_sql_block}
    state_connection = {'type': 'duckdb', 'database': state_db}
    gateway = {'connection': connection, 'state_connection': state_connection}
    return {'gateways': {gateway_name: gateway}, 'default_gateway': gateway_name}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def handler(args):
    """The main handler function for the generate-sqlmesh-config command.

    Reads the quackpipe configuration, renders the setup SQL, swaps resolved
    secret values for environment-variable placeholders and writes the
    resulting SQLMesh config as YAML to ``args.output``.

    Args:
        args: Parsed CLI arguments providing ``env_file``, ``config``,
            ``gateway_name``, ``state_db`` and ``output``.

    Raises:
        SystemExit: With exit code 1 when the output file cannot be written,
            so that shells and CI pipelines see the failure.
    """
    configure_secret_provider(env_file=args.env_file)
    print(f"Reading quackpipe configuration from: {args.config}")
    quackpipe_configs = get_configs(config_path=args.config)
    raw_sql = _generate_raw_sql(quackpipe_configs)
    final_sql_with_placeholders = _replace_secrets_with_placeholders(raw_sql, quackpipe_configs)
    sqlmesh_config_dict = _build_sqlmesh_dict(final_sql_with_placeholders, args.gateway_name, args.state_db)
    try:
        # Explicit encoding: the result must not depend on the platform locale.
        with open(args.output, 'w', encoding='utf-8') as f:
            yaml.dump(sqlmesh_config_dict, f, sort_keys=False, default_flow_style=False, indent=2)
    except (OSError, yaml.YAMLError) as e:
        print(f"❌ Failed to write output file: {e}")
        # Exit non-zero instead of returning normally after a failure.
        raise SystemExit(1) from e
    print(f"✅ Successfully generated SQLMesh config at: {args.output}")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def register_command(subparsers: _SubParsersAction):
    """Attach the 'generate-sqlmesh-config' sub-command and its options to the CLI.

    The sub-command dispatches to this module's ``handler``.
    """
    command = subparsers.add_parser(
        "generate-sqlmesh-config",
        help="Generate a SQLMesh config file from a quackpipe config."
    )

    # (flags, default, help) for each option, in the order shown by --help.
    option_specs = (
        (("-c", "--config"), "config.yml",
         "Path to the input quackpipe config.yml file. (Default: config.yml)"),
        (("-o", "--output"), "sqlmesh_config.yml",
         "Path for the output SQLMesh config file. (Default: sqlmesh_config.yml)"),
        (("--gateway-name",), "quackpipe_gateway",
         "The name for the gateway in the SQLMesh config. (Default: quackpipe_gateway)"),
        (("--state-db",), ".sqlmesh/state.db",
         "The path for the SQLMesh state database. (Default: .sqlmesh/state.db)"),
        (("--env-file",), ".env",
         "Path to the environment file to load secrets from. (Default: .env)"),
    )
    for flags, default, help_text in option_specs:
        command.add_argument(*flags, default=default, help=help_text)

    command.set_defaults(func=handler)
|
quackpipe/commands/ui.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
src/quackpipe/commands/ui.py
|
|
3
|
+
|
|
4
|
+
This module contains the implementation for the 'ui' CLI command.
|
|
5
|
+
"""
|
|
6
|
+
from argparse import _SubParsersAction
|
|
7
|
+
|
|
8
|
+
from .. import ConfigError
|
|
9
|
+
from ..core import session
|
|
10
|
+
from .common import setup_cli_logging
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def handler(args):
    """The main handler function for the ui command.

    Opens a quackpipe session for the requested sources, starts the DuckDB UI
    server on ``args.port`` and keeps it alive until the user presses Enter,
    hits Ctrl+C, or standard input reaches end-of-file; then stops the server.

    Args:
        args: Parsed CLI arguments providing ``verbose``, ``sources``,
            ``config``, ``env_file`` and ``port``.
    """
    log = setup_cli_logging(args.verbose)

    sources_to_load = args.sources if args.sources else "all configured sources"
    log.info(f"Attempting to start UI session for: {sources_to_load}")
    log.debug(f"Using config file: {args.config} and env file: {args.env_file}")

    try:
        with session(config_path=args.config, env_file=args.env_file, sources=args.sources) as con:
            log.info("Session created.")

            log.info(f"Setting UI port to {args.port}...")
            con.execute(f"SET ui_local_port = {args.port};")

            log.info("Starting DuckDB UI server...")
            con.execute("CALL start_ui_server();")

            # Logged at WARNING so the URL is visible at the default verbosity.
            log.warning(f"✅ DuckDB UI is running at: http://localhost:{args.port}")
            log.info("All sources from your config are attached and ready to query.")

            try:
                # Wait for user input to keep the server alive.
                input("Press Enter or Ctrl+C to exit and shut down the UI server...")
            except (KeyboardInterrupt, EOFError):
                # Ctrl+C, or stdin closed / Ctrl+D: a normal way to leave.
                # EOFError must not fall through to the outer handler, which
                # would report a startup failure and skip the graceful stop.
                print()  # Move to the next line after the ^C character

            log.info("Stopping DuckDB UI server...")
            con.execute("CALL stop_ui_server();")

    except Exception as e:
        log_msg = f"❌ Failed to start UI session: {e}"
        if isinstance(e, ConfigError):
            # Configuration problems are user errors: no traceback needed.
            log.warning(log_msg)
        else:
            log.error(log_msg, exc_info=True)
    finally:
        log.info("Shutting down.")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def register_command(subparsers: _SubParsersAction):
    """Attach the 'ui' sub-command and its arguments to the CLI.

    The sub-command dispatches to this module's ``handler``.
    """
    ui_command = subparsers.add_parser(
        "ui",
        help="Launch an interactive DuckDB UI with pre-configured sources."
    )

    # (names, add_argument keyword options), in the order shown by --help.
    argument_specs = [
        (("-c", "--config"), {
            "default": "config.yml",
            "help": "Path to the input quackpipe config.yml file. (Default: config.yml)",
        }),
        (("--env-file",), {
            "default": ".env",
            "help": "Path to the environment file to load secrets from. (Default: .env)",
        }),
        (("-p", "--port"), {
            "type": int,
            "default": 4213,
            "help": "Port to run the DuckDB UI on. (Default: 4213)",
        }),
        (("-v", "--verbose"), {
            "action": "count",
            "default": 0,
            "help": "Increase output verbosity. Use -v for INFO and -vv for DEBUG.",
        }),
        (("sources",), {
            "nargs": '*',
            "help": "Optional: A space-separated list of specific sources to load. If omitted, all sources are loaded.",
        }),
    ]
    for names, options in argument_specs:
        ui_command.add_argument(*names, **options)

    ui_command.set_defaults(func=handler)
|
quackpipe/config.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Defines the typed configuration objects for quackpipe.
|
|
3
|
+
"""
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from enum import Enum
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass(frozen=True)
class Plugin:
    """A structured definition for a DuckDB plugin that may require special installation.

    Instances are immutable (``frozen=True``) and therefore hashable, which
    lets them be collected into a set alongside plain plugin-name strings.
    """
    # Name of the DuckDB extension to install and load.
    name: str
    # Optional repository to install the extension from; None means no
    # repository argument is supplied at install time.
    repository: str | None = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class SourceType(Enum):
    """Enumeration of supported source types.

    Each member's value is the lowercase string identifier of the source type.
    """
    POSTGRES = "postgres"
    S3 = "s3"
    AZURE = "azure"
    DUCKLAKE = "ducklake"
    SQLITE = "sqlite"
    # NOTE(review): no handler is registered for PARQUET or CSV in
    # core.SOURCE_HANDLER_REGISTRY; confirm whether these are handled elsewhere.
    PARQUET = "parquet"
    CSV = "csv"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class SourceConfig:
    """
    A structured configuration object for a single data source.
    """
    # Logical name of the source; used to select sources and as the
    # "connection_name" passed to the source handler.
    name: str
    # Kind of source; selects the handler class that sets it up.
    type: SourceType
    # Non-secret, handler-specific parameters (a fresh dict per instance).
    config: dict[str, Any] = field(default_factory=dict)
    # Logical name of the secret bundle for this source, if any.
    secret_name: str | None = None
|
quackpipe/core.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""
|
|
2
|
+
The core logic of quackpipe.
|
|
3
|
+
"""
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import Generator
|
|
6
|
+
from contextlib import contextmanager
|
|
7
|
+
from functools import wraps
|
|
8
|
+
|
|
9
|
+
import duckdb
|
|
10
|
+
|
|
11
|
+
from quackpipe.config import Plugin, SourceConfig, SourceType
|
|
12
|
+
from quackpipe.secrets import configure_secret_provider
|
|
13
|
+
|
|
14
|
+
# Import all handlers
|
|
15
|
+
from quackpipe.sources import azure_blob, ducklake, postgres, s3, sqlite
|
|
16
|
+
from quackpipe.utils import get_configs
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# The registry stores the handler CLASSES, not instances.
|
|
21
|
+
# Maps each supported SourceType to the handler CLASS that sets it up.
# Handlers are instantiated per source with that source's context dict.
# Source types without an entry here are skipped by the code that looks them up.
SOURCE_HANDLER_REGISTRY = {
    SourceType.POSTGRES: postgres.PostgresHandler,
    SourceType.S3: s3.S3Handler,
    SourceType.AZURE: azure_blob.AzureBlobHandler,
    SourceType.DUCKLAKE: ducklake.DuckLakeHandler,
    SourceType.SQLITE: sqlite.SQLiteHandler,
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _prepare_connection(con: duckdb.DuckDBPyConnection, configs: list[SourceConfig]):
    """Configures a DuckDB connection from a list of SourceConfig objects.

    Steps: instantiate a handler per source, install and load every extension
    the handlers require (each exactly once), then execute each handler's
    rendered setup SQL.

    Args:
        con: The DuckDB connection to configure.
        configs: The sources to set up. Nothing happens when empty.

    Raises:
        duckdb.ParserException, duckdb.IOException: Re-raised after logging
            when a handler's setup SQL fails.
    """
    if not configs:
        return

    # 1. Instantiate all handlers first
    instantiated_handlers = []
    for cfg in configs:
        HandlerClass = SOURCE_HANDLER_REGISTRY.get(cfg.type)
        if not HandlerClass:
            logger.warning("Warning: No handler class found for source type '%s'. Skipping.", cfg.type.value)
            continue

        full_context = {
            **cfg.config,
            "connection_name": cfg.name,
            "secret_name": cfg.secret_name,
        }
        instantiated_handlers.append(HandlerClass(full_context))

    # 2. Gather all required plugins, de-duplicated in first-seen order.
    #    dict.fromkeys (unlike a set) keeps the install order deterministic.
    required_plugins = dict.fromkeys(
        plugin
        for handler in instantiated_handlers
        for plugin in handler.required_plugins
    )

    # 3. Install and load all extensions
    for plugin_def in required_plugins:
        if isinstance(plugin_def, Plugin):
            # Structured Plugin: pass the repository only when one is set.
            plugin_name = plugin_def.name
            if plugin_def.repository is not None:
                con.install_extension(plugin_name, repository=plugin_def.repository)
            else:
                con.install_extension(plugin_name)
        else:
            # It's a simple string (the name of the plugin)
            plugin_name = plugin_def
            con.install_extension(plugin_name)

        # Loading the extension only requires the name
        con.load_extension(plugin_name)

    # 4. Render and execute the setup SQL for each handler
    for handler in instantiated_handlers:
        setup_sql = handler.render_sql()
        if not setup_sql:
            # Nothing to run for this handler.
            continue
        # NOTE(review): the rendered SQL may embed resolved secret values, so
        # these log calls can expose credentials — confirm and consider redacting.
        logger.debug(setup_sql)
        try:
            con.execute(setup_sql)
        except (duckdb.ParserException, duckdb.IOException):
            logger.exception(setup_sql)
            raise
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
@contextmanager
def session(
    config_path: str | None = None,
    configs: list[SourceConfig] | None = None,
    sources: list[str] | None = None,
    env_file: str | None = None
) -> Generator[duckdb.DuckDBPyConnection, None, None]:
    """
    A context manager providing a pre-configured DuckDB connection.

    Args:
        config_path: Path to the YAML configuration file.
        configs: A direct list of SourceConfig objects.
        sources: Optional names of the sources to set up. When omitted or
            empty, every configured source is set up. Names that match no
            configured source are skipped and reported with a warning.
        env_file: Path to an env file passed to the secret provider.

    Yields:
        An in-memory DuckDB connection with the selected sources set up.
        The connection is closed when the context exits.
    """
    configure_secret_provider(env_file=env_file)

    all_configs = get_configs(config_path, configs)

    active_configs = all_configs
    if sources:
        active_configs = [c for c in all_configs if c.name in sources]

        # Surface typos: previously an unknown name was dropped silently and
        # the session simply came up without that source.
        if not isinstance(sources, str):
            known_names = {c.name for c in all_configs}
            missing = [str(name) for name in sources if name not in known_names]
            if missing:
                logger.warning(
                    "Requested sources not found in configuration and skipped: %s",
                    ", ".join(missing),
                )

    con = duckdb.connect(database=':memory:')
    try:
        _prepare_connection(con, active_configs)
        yield con
    finally:
        con.close()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def with_session(**session_kwargs):
    """
    Decorator factory that runs the wrapped function inside a quackpipe session.

    The keyword arguments are forwarded to ``session`` on every call. The
    session's DuckDB connection is passed to the function as its first
    positional argument, ahead of the caller's own arguments, and the
    function's return value is returned unchanged.
    """

    def decorator(func):
        def _run_in_session(*args, **kwargs):
            # A fresh session per call; it is torn down before we return.
            with session(**session_kwargs) as connection:
                outcome = func(connection, *args, **kwargs)
            return outcome

        # Carry over the wrapped function's name, docstring, etc.
        return wraps(func)(_run_in_session)

    return decorator
|
quackpipe/etl_utils.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""
|
|
2
|
+
High-level utility functions for common ETL operations.
|
|
3
|
+
"""
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
import duckdb
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
from .config import SourceConfig, SourceType
|
|
10
|
+
|
|
11
|
+
# Import the session context manager from core and config loader from utils
|
|
12
|
+
from .core import session
|
|
13
|
+
from .utils import get_configs
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def to_df(con: duckdb.DuckDBPyConnection, query: str) -> pd.DataFrame:
    """Run ``query`` on ``con`` and return the full result as a pandas DataFrame."""
    result = con.execute(query)
    return result.fetchdf()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def create_table_from_df(con: duckdb.DuckDBPyConnection, df: pd.DataFrame, table_name: str):
    """Creates a new table in DuckDB from a pandas DataFrame, replacing if it exists.

    Args:
        con: The DuckDB connection to create the table on.
        df: The DataFrame whose contents populate the table.
        table_name: Name of the table to create. It is interpolated into the
            SQL text verbatim, without quoting or escaping.
    """
    # The SQL refers to the relation `df` by name and the DataFrame is never
    # passed to DuckDB explicitly — presumably DuckDB resolves the name to the
    # local variable `df`, so the parameter must keep this name. TODO confirm.
    con.execute(f"CREATE OR REPLACE TABLE {table_name} AS SELECT * FROM df")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _table_write_statements(
        target: str,
        source_query: str,
        mode: str,
        *,
        drop_before_create: bool = False,
) -> list[str]:
    """Return the SQL statements that write ``source_query`` into table ``target``.

    'replace' yields CREATE OR REPLACE TABLE (or DROP TABLE IF EXISTS followed
    by CREATE TABLE when ``drop_before_create`` is set); 'append' yields
    INSERT INTO. Any other mode raises ValueError.
    """
    if mode == 'replace':
        if drop_before_create:
            return [
                f"DROP TABLE IF EXISTS {target};",
                f"CREATE TABLE {target} AS ({source_query});",
            ]
        return [f"CREATE OR REPLACE TABLE {target} AS ({source_query});"]
    if mode == 'append':
        return [f"INSERT INTO {target} ({source_query});"]
    raise ValueError(f"Invalid mode '{mode}'. Use 'replace' or 'append'.")


def move_data(
        source_query: str,
        destination_name: str,
        table_name: str,
        config_path: str | None = None,
        configs: list[SourceConfig] | None = None,
        env_file: str | None = None,
        mode: str = 'replace',
        format: str = 'parquet'
):
    """
    A self-contained utility to move data from a source query to a destination.
    This function creates and manages its own quackpipe session.

    The destination is validated and the SQL is built *before* the session is
    opened, so configuration mistakes fail fast without first installing
    extensions or attaching sources.

    Args:
        source_query: The SELECT query to execute for the source data.
        destination_name: The logical name of the destination source from the config.
        table_name: The name of the table or file to create at the destination.
        config_path: Path to the YAML configuration file.
        configs: A direct list of SourceConfig objects.
        env_file: Path to an env file to use.
        mode: Write mode. 'replace' or 'append'. Not consulted for S3
            destinations, which always write a file via COPY.
        format: The file format for file-based destinations (e.g., 'parquet', 'csv').

    Raises:
        ValueError: If the destination is not in the configuration, or if
            ``mode`` is invalid for a table destination.
        PermissionError: If a Postgres/SQLite destination is read-only
            (the default when 'read_only' is not set in its config).
    """
    # Load all configurations using the shared helper function.
    all_configs = get_configs(config_path, configs)

    # Find the destination config to determine its type.
    dest_config = next((c for c in all_configs if c.name == destination_name), None)
    if dest_config is None:
        raise ValueError(f"Destination '{destination_name}' not found in the provided configuration.")

    if dest_config.type == SourceType.S3:
        base_path = dest_config.config.get('path', f"s3://{destination_name}/")
        if not base_path.endswith('/'):
            base_path += '/'
        full_path = f"{base_path}{table_name}.{format}"
        statements = [f"COPY ({source_query}) TO '{full_path}' (FORMAT {format.upper()});"]
        success_message, success_target = "Data successfully copied to %s", full_path

    elif dest_config.type == SourceType.DUCKLAKE:
        full_table_name = f"{destination_name}.{table_name}"
        statements = _table_write_statements(full_table_name, source_query, mode)
        success_message, success_target = "Data successfully moved to table %s", full_table_name

    elif dest_config.type in (SourceType.POSTGRES, SourceType.SQLITE):
        # Attached databases are treated as read-only unless the config says otherwise.
        if dest_config.config.get('read_only', True):
            raise PermissionError(
                f"Cannot write to destination '{destination_name}' because it is configured as read-only. "
                "To enable writing, set 'read_only: false' in your configuration for this source."
            )
        full_table_name = f"{destination_name}.{table_name}"
        # These destinations are replaced with an explicit DROP + CREATE pair.
        statements = _table_write_statements(
            full_table_name, source_query, mode, drop_before_create=True
        )
        success_message, success_target = "Data successfully moved to table %s", full_table_name

    else:
        # Any other destination type: write to a table in the session's own
        # in-memory database (unqualified table name).
        statements = _table_write_statements(table_name, source_query, mode)
        success_message, success_target = "Data successfully moved to in-memory table '%s'", table_name

    # This utility creates its own session to perform the work.
    with session(configs=all_configs, env_file=env_file) as con:
        for statement in statements:
            con.execute(statement)
        logger.info(success_message, success_target)
|
quackpipe/exceptions.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Exception classes for quackpipe.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
class QuackpipeError(Exception):
    """Base exception for quackpipe; catch this to handle any library error."""


class ConfigError(QuackpipeError):
    """Raised when there's an error with configuration."""


class SecretError(QuackpipeError):
    """Raised when there's an error with secret management."""
|