phlo-sling 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.4
2
+ Name: phlo-sling
3
+ Version: 0.1.0
4
+ Summary: Sling replication ingestion provider for Phlo
5
+ Author-email: Phlo Team <team@phlo.dev>
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/plain
9
+ Requires-Dist: phlo>=0.1.0
10
+ Requires-Dist: sling>=1.0.0
11
+ Requires-Dist: pyarrow>=21.0.0
12
+ Provides-Extra: dev
13
+ Requires-Dist: pytest>=7.0; extra == "dev"
14
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
15
+
16
+ Sling-based database replication ingestion provider for Phlo.
@@ -0,0 +1,111 @@
1
+ # phlo-sling
2
+
3
+ Sling-based database replication ingestion provider for Phlo.
4
+
5
+ ## Overview
6
+
7
+ `phlo-sling` wraps [Sling](https://slingdata.io/) as a complementary ingestion engine alongside `phlo-dlt`. Sling is a data movement CLI (DB→DB, File→DB, DB→File) with 30+ connectors, optimized for high-speed database replication.
8
+
9
+ Where DLT excels at API-based ingestion with schema evolution and normalisation, Sling excels at raw-speed database replication with wildcard stream selection (`my_schema.*`), incremental modes, and direct DB-to-DB transfers.
10
+
11
+ ## Installation
12
+
13
+ ```bash
14
+ pip install phlo-sling
15
+ ```
16
+
17
+ ## Usage
18
+
19
+ ### Decorator-Based Replication
20
+
21
+ ```python
22
+ import phlo
23
+ from phlo_sling import phlo_sling_replication
24
+
25
+
26
+ @phlo_sling_replication(
27
+ stream_name="public.users",
28
+ table_name="users",
29
+ source_conn="PHLO_POSTGRES",
30
+ target_conn="WAREHOUSE",
31
+ group="crm",
32
+ mode="incremental",
33
+ primary_key="id",
34
+ update_key="updated_at",
35
+ cron="0 */2 * * *",
36
+ owner="data-team",
37
+ )
38
+ def replicate_users(context):
39
+ """Replicate users table from Postgres into raw.users on WAREHOUSE."""
40
+ return None
41
+ ```
42
+
43
+ ### Python-First File Discovery
44
+
45
+ Use `phlo_sling_assets` when you want Python logic to discover folders/files
46
+ first and register one Sling-backed asset per result.
47
+
48
+ ```python
49
+ from pathlib import Path
50
+
51
+ from phlo_sling import SlingReplication, phlo_sling_assets
52
+
53
+
54
+ @phlo_sling_assets(group="finance")
55
+ def discover_workbooks():
56
+ root = Path("/mnt/finance")
57
+
58
+ for workbook in root.rglob("*.xlsx"):
59
+ table_name = workbook.stem.replace("-", "_").lower()
60
+ yield SlingReplication(
61
+ stream_name=f"file://{workbook}",
62
+ table_name=table_name,
63
+ source_conn="LOCAL",
64
+ target_conn="WAREHOUSE",
65
+ object=f"raw.{table_name}",
66
+ mode="full-refresh",
67
+ source_options={"sheet": "Sheet1!A:F"},
68
+ description=f"Ingest workbook {workbook.name}",
69
+ metadata={"workbook_path": str(workbook)},
70
+ tags={"format": "xlsx"},
71
+ )
72
+ ```
73
+
74
+ Use the original `phlo_sling_replication` decorator when you want one stable
75
+ asset whose function may return runtime Sling overrides such as a dynamic
76
+ `src_stream` or `where` clause.
77
+
78
+ ### CLI Commands
79
+
80
+ ```bash
81
+ # Run replication from YAML
82
+ phlo sling run --replication replications/pg_to_lake.yaml
83
+
84
+ # Run ad-hoc replication
85
+ phlo sling run --source PHLO_POSTGRES --stream public.users --target PHLO_S3 --object raw/users.parquet
86
+
87
+ # Override the inferred destination object when needed
88
+ phlo sling run --source PHLO_POSTGRES --stream public.users --target WAREHOUSE --object raw.users
89
+
90
+ # List connections
91
+ phlo sling conns
92
+
93
+ # Discover available streams
94
+ phlo sling discover PHLO_POSTGRES
95
+ phlo sling discover PHLO_POSTGRES --schema public --format json
96
+ ```
97
+
98
+ ## Configuration
99
+
100
+ The following environment variables can be used to configure Sling:
101
+
102
+ - `SLING_DEFAULT_NAMESPACE` - Default namespace for generated replication table names (default: "raw")
103
+ - `SLING_DEFAULT_MODE` - Default replication mode (default: "incremental")
104
+ - `SLING_AUTO_CONNECTIONS` - Auto-generate Sling connections from Phlo capability metadata (default: true)
105
+ - `PHLO_OBJECT_STORE` - Select the active `object_store` capability when more than one is installed
106
+
107
+ Notes:
108
+
109
+ - Decorator-backed replications need a real Sling destination. When `target_conn` is set and `object` is omitted, `phlo-sling` targets `<namespace>.<table_name>` automatically.
110
+ - If you set `SLING_AUTO_CONNECTIONS=false`, `phlo-sling` stops injecting `PHLO_POSTGRES` / `PHLO_S3` connection definitions into the environment.
111
+ - `PHLO_S3` now resolves from the active `object_store` capability instead of importing `phlo-minio` / `phlo-rustfs` directly. If both are installed, set `PHLO_OBJECT_STORE=minio` or `PHLO_OBJECT_STORE=rustfs`.
@@ -0,0 +1,44 @@
1
+ [build-system]
2
+ requires = ["setuptools>=45", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "phlo-sling"
7
+ version = "0.1.0"
8
+ description = "Sling replication ingestion provider for Phlo"
9
+ readme = {text = "Sling-based database replication ingestion provider for Phlo.", content-type = "text/plain"}
10
+ requires-python = ">=3.11"
11
+ authors = [
12
+ {name = "Phlo Team", email = "team@phlo.dev"},
13
+ ]
14
+ license = {text = "MIT"}
15
+ dependencies = [
16
+ "phlo>=0.1.0",
17
+ "sling>=1.0.0",
18
+ "pyarrow>=21.0.0",
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ dev = [
23
+ "pytest>=7.0",
24
+ "ruff>=0.1.0",
25
+ ]
26
+
27
+ [project.entry-points."phlo.plugins.assets"]
28
+ sling = "phlo_sling.plugin:SlingAssetProvider"
29
+
30
+ [project.entry-points."phlo.plugins.ingestion_providers"]
31
+ sling = "phlo_sling.plugin:SlingIngestionProvider"
32
+
33
+ [project.entry-points."phlo.plugins.cli"]
34
+ sling = "phlo_sling.cli_plugin:SlingCliPlugin"
35
+
36
+ [tool.setuptools]
37
+ package-dir = {"" = "src"}
38
+
39
+ [tool.setuptools.packages.find]
40
+ where = ["src"]
41
+
42
+ [tool.ruff]
43
+ line-length = 100
44
+ target-version = "py311"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,28 @@
1
+ from collections.abc import Callable
2
+ from typing import Any
3
+
4
+ from phlo_sling.registry import SlingReplication
5
+
6
+
7
def phlo_sling_assets(*args: Any, **kwargs: Any) -> Callable[..., Any]:
    """Forward all arguments to ``phlo_sling.decorator.phlo_sling_assets``.

    The decorator module is imported inside the call, not at package import
    time, so importing ``phlo_sling`` alone does not load it.
    """
    from phlo_sling.decorator import phlo_sling_assets as implementation

    forwarded = implementation(*args, **kwargs)
    return forwarded
12
+
13
+
14
def phlo_sling_replication(*args: Any, **kwargs: Any) -> Callable[..., Any]:
    """Forward all arguments to ``phlo_sling.decorator.phlo_sling_replication``.

    The decorator module is imported inside the call, not at package import
    time, so importing ``phlo_sling`` alone does not load it.
    """
    from phlo_sling.decorator import phlo_sling_replication as implementation

    forwarded = implementation(*args, **kwargs)
    return forwarded
19
+
20
+
21
def get_sling_assets() -> list[Any]:
    """Return whatever ``phlo_sling.decorator.get_sling_assets`` returns.

    The decorator module is imported inside the call, not at package import
    time, so importing ``phlo_sling`` alone does not load it.
    """
    from phlo_sling.decorator import get_sling_assets as implementation

    registered = implementation()
    return registered
26
+
27
+
28
+ __all__ = ["SlingReplication", "get_sling_assets", "phlo_sling_assets", "phlo_sling_replication"]
@@ -0,0 +1,187 @@
1
+ """CLI commands for Sling replication management."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+
8
+ import click
9
+
10
+ from phlo.logging import get_logger
11
+ from phlo_sling.connections import apply_sling_connection_env
12
+ from phlo_sling.settings import get_settings
13
+
14
+ logger = get_logger(__name__)
15
+
16
+
17
@click.group("sling")
def sling_group() -> None:
    """Sling replication commands."""
    # Intentionally empty: this function only anchors the ``sling`` click
    # group. The subcommands in this module attach themselves to it through
    # ``@sling_group.command(...)``. The docstring above is the group's help
    # text, so treat it as user-facing output.
20
+
21
+
22
@sling_group.command("run")
@click.option("--replication", "-r", type=click.Path(exists=True), help="Sling replication YAML.")
@click.option("--source", "-s", help="Source connection name.")
@click.option("--target", "-t", help="Target connection name.")
@click.option("--stream", help="Source stream (e.g., 'public.users').")
@click.option("--object", "target_object", help="Target object/table name.")
@click.option(
    "--mode",
    default=None,
    help="Replication mode. Defaults to SLING_DEFAULT_MODE when omitted.",
)
def run_command(
    replication: str | None,
    source: str | None,
    target: str | None,
    stream: str | None,
    target_object: str | None,
    mode: str | None,
) -> None:
    """Run a Sling replication.

    Either provide --replication YAML or --source/--stream/--target for ad-hoc runs.
    """
    # NOTE: the docstring above is rendered by click as the command help text.
    from sling import Replication, Sling

    # Export the auto-generated connections before Sling looks them up.
    apply_sling_connection_env()
    effective_mode = mode or get_settings().sling_default_mode

    # YAML-driven run: the replication file carries its own configuration.
    if replication:
        click.echo(f"Running replication from {replication}")
        Replication(file_path=replication).run()
        return

    # Ad-hoc run: needs at least a source connection and a stream.
    if not (source and stream):
        raise click.UsageError("Provide --replication YAML or --source/--stream.")
    if not target:
        raise click.UsageError("Provide --target for ad-hoc runs.")

    destination = _resolve_target_object(stream=stream, target_object=target_object)
    click.echo(f"Replicating {stream} from {source}")
    job = Sling(
        src_conn=source,
        src_stream=stream,
        tgt_conn=target,
        tgt_object=destination,
        mode=effective_mode,
    )
    job.run()
69
+
70
+
71
@sling_group.command("conns")
@click.option("--auto/--no-auto", default=True, help="Include auto-discovered connections.")
def conns_command(auto: bool) -> None:
    """List available Sling connections.

    Shows auto-discovered connections from Phlo capability metadata and any
    connections from explicit env.yaml files.
    """
    # NOTE: the docstring above is rendered by click as the command help text.
    if auto:
        from phlo_sling.connections import resolve_phlo_connections

        discovered = resolve_phlo_connections()
        if not discovered:
            click.echo("No auto-discovered connections found.")
        else:
            click.echo("Auto-discovered connections:")
            for conn_name, conn_config in discovered.items():
                kind = conn_config.get("type", "unknown")
                # Database-style configs expose "host"; object stores expose "endpoint".
                location = conn_config.get("host") or conn_config.get("endpoint", "")
                click.echo(f" {conn_name}: {kind} ({location})")

    click.echo("\nSling native connections:")
    try:
        listing = _run_sling_cli_command(["conns", "list"])
        click.echo(listing.stdout, nl=False)
    except Exception as exc:
        # Best effort: a failing Sling CLI must not abort the whole listing.
        click.echo(f" Could not list native connections: {exc}")
98
+
99
+
100
@sling_group.command("discover")
@click.argument("connection")
@click.option("--schema", help="Filter by schema name.")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["table", "json"]),
    default="table",
    show_default=True,
    help="Output format.",
)
def discover_command(connection: str, schema: str | None, output_format: str) -> None:
    """Discover available streams from a Sling connection.

    Lists tables/views available in the source connection for use as
    stream_name in @phlo_sling_replication decorators.
    """
    # NOTE: the docstring above is rendered by click as the command help text.
    #
    # Raises click.ClickException when the Sling CLI cannot be run, exits
    # non-zero, or its output cannot be rendered.
    apply_sling_connection_env()

    # In JSON mode the progress banner goes to stderr so that stdout holds
    # nothing but the JSON document and can be piped into other tools.
    # Table mode keeps the banner on stdout as before.
    click.echo(f"Discovering streams from {connection}...", err=output_format == "json")
    try:
        command = ["conns", "discover", connection]
        if schema:
            command.extend(["--pattern", f"{schema}.*"])

        result = _run_sling_cli_command(command)
        if output_format == "json":
            click.echo(json.dumps(_parse_discovery_output(result.stdout), indent=2))
            return

        click.echo(result.stdout, nl=False)
    except subprocess.CalledProcessError as exc:
        # The helper captures the CLI's output, so the real reason for the
        # failure only exists on the exception object. Surface it instead of
        # the bare "returned non-zero exit status" message.
        detail = (exc.stderr or exc.stdout or "").strip()
        message = f"Discovery failed: {exc}"
        if detail:
            message = f"{message}\n{detail}"
        raise click.ClickException(message) from exc
    except Exception as exc:
        raise click.ClickException(f"Discovery failed: {exc}") from exc
133
+
134
+
135
+ def _resolve_target_object(stream: str, target_object: str | None) -> str:
136
+ """Resolve the destination object for an ad-hoc Sling run."""
137
+ if target_object:
138
+ return target_object
139
+ if "*" in stream:
140
+ raise click.UsageError("Provide --object when --stream uses a wildcard.")
141
+ return stream
142
+
143
+
144
def _get_sling_binary() -> str:
    """Return the path of the Sling executable to invoke.

    A non-empty ``sling_binary_path`` from the package settings takes
    precedence; otherwise the ``SLING_BIN`` path exported by ``sling.bin``
    is used.
    """
    configured_path = get_settings().sling_binary_path
    if not configured_path:
        # Imported lazily so the sling package is only touched when needed.
        from sling.bin import SLING_BIN

        return SLING_BIN
    return configured_path
153
+
154
+
155
def _run_sling_cli_command(args: list[str]) -> subprocess.CompletedProcess[str]:
    """Run the Sling binary with ``args`` and return the completed process.

    stdout and stderr are captured as text. Because ``check=True`` is used,
    a non-zero exit status raises ``subprocess.CalledProcessError``.
    """
    argv = [_get_sling_binary()]
    argv.extend(args)
    return subprocess.run(argv, capture_output=True, text=True, check=True)
163
+
164
+
165
+ def _parse_discovery_output(output: str) -> list[dict[str, str]]:
166
+ """Parse Sling's ASCII discovery table into JSON-serializable rows."""
167
+ lines = [line.rstrip() for line in output.splitlines() if line.strip()]
168
+ table_lines = [line for line in lines if "|" in line]
169
+ if len(table_lines) < 2:
170
+ return []
171
+
172
+ headers = [_normalize_column_name(part) for part in table_lines[0].split("|")]
173
+ rows: list[dict[str, str]] = []
174
+ for line in table_lines[1:]:
175
+ values = [part.strip() for part in line.split("|")]
176
+ if len(values) != len(headers):
177
+ continue
178
+ if all(set(value) <= {"-"} for value in values):
179
+ continue
180
+ rows.append(dict(zip(headers, values, strict=True)))
181
+
182
+ return rows
183
+
184
+
185
+ def _normalize_column_name(value: str) -> str:
186
+ """Normalize discovery table headers for JSON output."""
187
+ return value.strip().lower().replace(" ", "_")
@@ -0,0 +1,25 @@
1
+ """CLI plugin for Sling commands."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import click
6
+
7
+ from phlo.plugins.base import CliCommandPlugin, PluginMetadata
8
+ from phlo_sling.cli_commands import sling_group
9
+
10
+
11
class SlingCliPlugin(CliCommandPlugin):
    """Phlo CLI plugin that contributes the ``sling`` command group."""

    @property
    def metadata(self) -> PluginMetadata:
        """Describe this plugin: its name, version and description."""
        plugin_info = PluginMetadata(
            name="sling",
            version="0.1.0",
            description="Sling replication CLI commands for Phlo",
        )
        return plugin_info

    def get_cli_commands(self) -> list[click.Command]:
        """Return the click commands this plugin adds (the ``sling`` group)."""
        commands: list[click.Command] = [sling_group]
        return commands
@@ -0,0 +1,186 @@
1
+ """Auto-generate Sling connections from Phlo capability metadata."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from collections.abc import MutableMapping
7
+ from typing import Any
8
+
9
+ from phlo.capabilities import list_capabilities, resolve_capability
10
+ from phlo.infrastructure.config import load_project_config
11
+ from phlo.logging import get_logger
12
+ from phlo_sling.settings import get_settings
13
+
14
+ logger = get_logger(__name__)
15
+
16
+
17
def resolve_phlo_connections() -> dict[str, dict[str, Any]]:
    """Collect Sling connection definitions from the Phlo-side resolvers.

    The Postgres, Iceberg and S3 resolvers run in that order and their results
    are merged into one mapping; a later resolver overwrites an earlier entry
    that uses the same connection name.

    Returns:
        Mapping of connection name to Sling connection config. Empty when the
        ``sling_auto_connections`` setting is falsy.
    """
    if get_settings().sling_auto_connections:
        merged: dict[str, dict[str, Any]] = {}
        for resolver in (
            _resolve_postgres_connection,
            _resolve_iceberg_connection,
            _resolve_s3_connection,
        ):
            merged.update(resolver())
        return merged

    logger.debug("sling_auto_connections_disabled")
    return {}
37
+
38
+
39
def _project_env_value(name: str) -> str | None:
    """Look up ``name`` in the ``env`` section of the project configuration.

    Returns:
        The configured value when it is a non-empty string. ``None`` when the
        project config cannot be loaded, has no dict-valued ``env`` section,
        or holds a missing, empty or non-string value under ``name``.
    """
    try:
        loaded_config = load_project_config()
    except Exception as exc:
        # A missing or broken project config is not fatal for this lookup.
        logger.debug("project_env_lookup_failed", name=name, error=str(exc))
        return None

    env_section = loaded_config.get("env", {})
    if isinstance(env_section, dict):
        candidate = env_section.get(name)
        if isinstance(candidate, str) and candidate:
            return candidate
    return None
53
+
54
+
55
def _ensure_capabilities_discovered(*kinds: str) -> None:
    """Run capability discovery unless one of ``kinds`` is already listed.

    Kinds are checked in order; as soon as ``list_capabilities`` returns a
    truthy result for one of them, discovery is skipped.
    """
    for kind in kinds:
        if list_capabilities(kind):
            return

    # Imported lazily: only needed when nothing has been registered yet.
    from phlo.capabilities.discovery import discover_capabilities

    discover_capabilities()
63
+
64
+
65
def _get_iceberg_settings():
    """Return the phlo-iceberg settings object.

    ``phlo_iceberg`` is imported inside the function so this module can be
    imported when that optional package is not installed; the import error
    then surfaces here, at call time.
    """
    from phlo_iceberg.settings import get_settings as load_iceberg_settings

    iceberg_settings = load_iceberg_settings()
    return iceberg_settings
70
+
71
+
72
def _resolve_postgres_connection() -> dict[str, dict[str, Any]]:
    """Resolve the ``PHLO_POSTGRES`` Sling connection from phlo-postgres settings.

    Returns:
        ``{"PHLO_POSTGRES": {...}}`` built from the phlo-postgres settings, or
        an empty dict when ``phlo_postgres`` cannot be imported or reading its
        settings raises. The reason is logged at debug level.
    """
    try:
        # Lazy import: phlo-postgres is an optional companion package.
        from phlo_postgres.settings import get_settings as get_pg_settings

        pg = get_pg_settings()
        return {
            "PHLO_POSTGRES": {
                "type": "postgres",
                "host": pg.postgres_host,
                "port": pg.postgres_port,
                "database": pg.postgres_db,
                "user": pg.postgres_user,
                "password": pg.postgres_password,
                # Fall back to "public" when the settings object has no
                # postgres_schema attribute.
                "schema": getattr(pg, "postgres_schema", "public"),
            }
        }
    except Exception as exc:
        # ImportError is a subclass of Exception, so listing both was redundant.
        # This stays a best-effort lookup: any failure skips the connection.
        logger.debug("postgres_connection_skipped", error=str(exc))
        return {}
92
+
93
+
94
def _resolve_iceberg_connection() -> dict[str, dict[str, Any]]:
    """Resolve the ``PHLO_ICEBERG`` Sling connection from phlo-iceberg settings.

    The catalog config for the default ref is translated into Sling's
    connection keys for a REST catalog.

    Returns:
        ``{"PHLO_ICEBERG": {...}}``, or an empty dict when phlo-iceberg cannot
        be imported, reading its settings raises, or the catalog config lacks
        one of the expected keys. The reason is logged at debug level.
    """
    try:
        settings = _get_iceberg_settings()
        ref = settings.iceberg_default_ref
        config = settings.get_pyiceberg_catalog_config(ref)
        return {
            "PHLO_ICEBERG": {
                "type": "iceberg",
                "catalog_type": "rest",
                "rest_uri": config["uri"],
                "rest_warehouse": config["warehouse"],
                "s3_endpoint": config["s3.endpoint"],
                "s3_access_key_id": config["s3.access-key-id"],
                "s3_secret_access_key": config["s3.secret-access-key"],
                "s3_region": config["s3.region"],
                "schema": settings.iceberg_default_namespace,
            }
        }
    except Exception as exc:
        # ImportError is a subclass of Exception, so listing both was redundant.
        # This stays a best-effort lookup: any failure skips the connection.
        logger.debug("iceberg_connection_skipped", error=str(exc))
        return {}
116
+
117
+
118
def _resolve_s3_connection() -> dict[str, dict[str, Any]]:
    """Resolve the ``PHLO_S3`` Sling connection from the object-store capability.

    The capability name is taken from the ``PHLO_OBJECT_STORE`` environment
    variable, falling back to the same key in the project config's ``env``
    section. A provider that exposes ``to_sling_connection()`` supplies the
    config itself; otherwise a fixed set of keys is copied from the resolved
    capability's metadata.

    Returns:
        ``{"PHLO_S3": config}``, or an empty dict when no capability resolves
        or the resulting config is empty.
    """
    capability_kind = "object_store"
    _ensure_capabilities_discovered(capability_kind)

    wanted_name = os.environ.get("PHLO_OBJECT_STORE") or _project_env_value("PHLO_OBJECT_STORE")
    resolved = resolve_capability(capability_kind, wanted_name)
    if resolved is None:
        logger.debug(
            "object_store_connection_skipped",
            requested_name=wanted_name,
            available=list_capabilities(capability_kind),
        )
        return {}

    store = resolved.provider
    if not hasattr(store, "to_sling_connection"):
        # No provider hook: copy only the keys in this set from the metadata.
        passthrough_keys = {"type", "endpoint", "access_key_id", "secret_access_key", "region"}
        sling_config = {}
        for key, value in resolved.metadata.items():
            if key in passthrough_keys:
                sling_config[key] = value
    else:
        sling_config = store.to_sling_connection()

    if sling_config:
        return {"PHLO_S3": sling_config}

    logger.debug(
        "object_store_connection_missing_config",
        capability_name=resolved.name,
    )
    return {}
150
+
151
+
152
def export_sling_env(connections: dict[str, dict[str, Any]]) -> dict[str, str]:
    """Serialize connection configs into environment-variable form.

    Each connection becomes one variable whose name is the connection name
    and whose value is the config encoded as a JSON string.

    Args:
        connections: Mapping of connection name to connection config.

    Returns:
        Mapping of environment variable name to JSON string value.
    """
    import json

    return {conn_name: json.dumps(conn_config) for conn_name, conn_config in connections.items()}
169
+
170
+
171
def apply_sling_connection_env(environ: MutableMapping[str, str] | None = None) -> dict[str, str]:
    """Add the auto-generated Sling connection variables to an environment.

    A variable is only written when its name is not already present, so
    values that were set explicitly are never overwritten.

    Args:
        environ: Mapping to update in place. ``os.environ`` is used when this
            is ``None``.

    Returns:
        Every generated variable, including those that were not written
        because the name was already set in the target mapping.
    """
    generated = export_sling_env(resolve_phlo_connections())
    destination = environ if environ is not None else os.environ
    for var_name in generated:
        destination.setdefault(var_name, generated[var_name])
    return generated