dbt-vectorize 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,203 @@
1
+ Metadata-Version: 2.4
2
+ Name: dbt-vectorize
3
+ Version: 0.1.4
4
+ Summary: dbt + Rust vectorization runner for pgvector
5
+ Author-email: Maria Dubyaga <kraftaa@gmail.com>
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Homepage, https://github.com/kraftaa/dbt-vector
8
+ Project-URL: Repository, https://github.com/kraftaa/dbt-vector
9
+ Classifier: Environment :: Console
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Rust
13
+ Classifier: Topic :: Database
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Requires-Python: >=3.9
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: PyYAML>=6.0
18
+
19
+ # dbt-vectors (prototype scaffold)
20
+
21
+ > Make vector indexes a first-class materialization in dbt. This repo is an MVP scaffold to prove the concept.
22
+
23
+ ## Why
24
+ - dbt today only materializes SQL artifacts (table, view, incremental, ephemeral).
25
+ - Vector pipelines require SQL + embeddings + upsert to a vector DB; teams currently stitch that together with ad-hoc external scripts.
26
+ - A custom `vector_index` materialization can run inside `dbt build`, generating embeddings, handling incremental logic, and writing to pgvector/Pinecone/Qdrant.
27
+
28
+ ## What’s here
29
+ - **dbt package skeleton** with a `vector_index` materialization and dispatchable macros (pgvector working).
30
+ - **Rust embedder** (`rust/embedding_engine`) that can generate embeddings via OpenAI, Amazon Bedrock, or a local ONNX model (no Python needed).
31
+ - **`./bin/vectorize` runner**: orchestrates `dbt run` for the model and then calls the Rust embedder to write embeddings into Postgres/pgvector.
32
+ - **Examples** to show how a model is defined and run.
33
+
34
+ ## Prerequisites
35
+
36
+ `dbt-vectorize` does not vendor dbt. It uses whatever dbt binary you point it to (`DBT=...`) or finds on PATH.
37
+
38
+ Verify your existing dbt + adapter:
39
+ ```bash
40
+ dbt --version
41
+ ```
42
+ You should see a plugin like `postgres` under "Plugins".
43
+
44
+ If you do not have dbt and the Postgres adapter installed:
45
+ ```bash
46
+ python -m pip install "dbt-core~=1.9" "dbt-postgres~=1.9"
47
+ ```
48
+
49
+ You also need pgvector available in Postgres:
50
+ - install the extension package on the **Postgres server** (`vector.control` must exist on that server)
51
+ - enable it in each **database** you want to use
52
+
53
+ ```sql
54
+ CREATE EXTENSION IF NOT EXISTS vector;
55
+ ```
56
+
57
+ (`pgvector` is the project name; the SQL extension name is `vector`.)
58
+
59
+ ## Repo layout
60
+ - `dbt_project.yml` – declares this as a dbt package and exposes macros.
61
+ - `macros/materializations/vector_index.sql` – Jinja materialization scaffold (pgvector first, adapters dispatchable).
62
+ - `macros/adapters/vector_index_pgvector.sql` – pgvector adapter macro that creates/loads the target table.
63
+ - `bin/vectorize` – orchestration command that runs dbt and then Rust embedding.
64
+ - `rust/embedding_engine` – Rust crate and `pg_embedder` binary used for embedding generation/upsert.
65
+
66
+ ## Next steps (MVP path)
67
+ 1. Harden Rust embedding provider support (OpenAI/Bedrock/local ONNX) with better diagnostics and retries. ⏳
68
+ 2. Expand adapter macros beyond pgvector (Pinecone/Qdrant). ⏳
69
+ 3. Add end-to-end integration tests for dbt + pgvector + `pg_embedder`. ⏳
70
+ 4. Publish package docs and a reproducible quickstart. ⏳
71
+
72
+ ## Example model (current)
73
+ ```sql
74
+ {{ config(
75
+ materialized='vector_index',
76
+ vector_db='pgvector',
77
+ index_name='knowledge_base',
78
+ embedding_model='text-embedding-3-small',
79
+ dimensions=(env_var('EMBED_DIMS', '1536') | int),
80
+ metadata_columns=['source', 'created_at', 'doc_id']
81
+ ) }}
82
+
83
+ select
84
+ doc_id,
85
+ chunk_text as text,
86
+ source,
87
+ created_at
88
+ from {{ ref('staging_documents') }}
89
+ where is_active = true
90
+ ```
91
+
92
+ Running `./bin/vectorize --select vector_knowledge_base` should:
93
+ - fetch incremental rows
94
+ - generate embeddings via Rust engine
95
+ - upsert to pgvector (or Pinecone/Qdrant via adapters)
96
+ - emit metrics (processed, failed, latency) and run freshness tests
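Under the hood, the packaged CLI (`dbt_vectorize/cli.py`) assembles the `dbt run` invocation before handing off to the Rust embedder. A simplified sketch (the real code also honors `PROFILE_DIR`/`DBT_PROFILES_DIR` and a `SELECT_MODEL` env var):

```python
import os

def build_dbt_cmd(select_model: str = "vector_knowledge_base",
                  project_dir: str = ".") -> list:
    """Simplified mirror of how the runner builds the dbt command."""
    dbt = os.environ.get("DBT", "dbt")  # use an explicit dbt binary if set
    return [
        dbt, "run", "--no-partial-parse",
        "--profiles-dir", project_dir,
        "--project-dir", project_dir,
        "--select", select_model,
    ]
```

If the dbt step exits non-zero, the runner stops and never invokes the embedder.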
97
+
98
+ ## Run locally (preferred: existing local Postgres)
99
+
100
+ 1) Ensure Postgres is running, reachable (`PGHOST/PGPORT/PGUSER/PGDATABASE`), and has `vector` enabled:
101
+ ```sql
102
+ CREATE EXTENSION IF NOT EXISTS vector;
103
+ ```
104
+
105
+ 2) Choose a provider and matching dimensions:
106
+ ```bash
107
+ # Local ONNX (MiniLM, 384 dims)
108
+ EMBED_PROVIDER=local
109
+ EMBED_MODEL=sentence-transformers/all-MiniLM-L6-v2
110
+ EMBED_LOCAL_MODEL_PATH=$PWD/ml_model # contains model.onnx + tokenizer.json
111
+ EMBED_DIMS=384
112
+
113
+ # OpenAI
114
+ EMBED_PROVIDER=openai
115
+ EMBED_MODEL=text-embedding-3-small
116
+ EMBED_DIMS=1536 # or a smaller dim if you request it from OpenAI
117
+
118
+ # Bedrock Titan v2 (defaults)
119
+ EMBED_PROVIDER=bedrock
120
+ EMBED_MODEL=amazon.titan-embed-text-v2:0
121
+ EMBED_DIMS=1024 # or 512/256 if you override
122
+ ```
123
+
124
+ 3) Run vectorization (dbt model + embedding upsert):
125
+ ```bash
126
+ PGHOST=localhost PGPORT=5432 PGUSER=postgres PGDATABASE=postgres \
127
+ EMBED_PROVIDER=... EMBED_MODEL=... EMBED_DIMS=... \
128
+ ./bin/vectorize --select vector_knowledge_base
129
+ ```
130
+
131
+ Shortcut with env file:
132
+ ```bash
133
+ cp .env.vectorize.example .env.vectorize
134
+ ./bin/vectorize --select vector_knowledge_base
135
+ ```
136
+ `bin/vectorize` auto-loads `.env.vectorize` if present. Use `VECTORIZE_ENV_FILE=/path/to/file` to load a different env file.
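The env file is plain `KEY=VALUE` lines. A minimal illustrative loader (the exact parsing rules of `bin/vectorize` are not shown in this package, so details like quoting or `export` prefixes are assumptions here); note that already-exported variables win over file values:

```python
import os

def load_env_file(path: str) -> dict:
    """Minimal KEY=VALUE env-file loader (illustrative sketch only)."""
    loaded = {}
    try:
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # skip blanks, comments, and lines without an '='
                if not line or line.startswith("#") or "=" not in line:
                    continue
                key, _, value = line.partition("=")
                loaded[key.strip()] = value.strip()
    except FileNotFoundError:
        pass  # "auto-loads ... if present": a missing file is not an error
    for key, value in loaded.items():
        os.environ.setdefault(key, value)  # explicit env still wins
    return loaded
```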
137
+
138
+ Expected CLI output (example):
139
+ ```text
140
+ [vectorize] running dbt model vector_knowledge_base (provider=local, model=sentence-transformers/all-MiniLM-L6-v2)
141
+ ...
142
+ Done. PASS=1 WARN=0 ERROR=0 SKIP=0 TOTAL=1
143
+ [vectorize] generating embeddings via Rust into public.knowledge_base
144
+ embedded 20 rows into public.knowledge_base
145
+ [vectorize] done.
146
+ ```
147
+
148
+ Quick verification in Postgres:
149
+ ```sql
150
+ SELECT count(*) AS rows FROM public.knowledge_base;
151
+
152
+ SELECT
153
+ doc_id,
154
+ (embedding::float4[])[1:8] AS first_8_dims,
155
+ source,
156
+ created_at
157
+ FROM public.knowledge_base
158
+ LIMIT 5;
159
+ ```
160
+
161
+ ## Optional Docker Postgres
162
+
163
+ Use this only if you want a disposable local pgvector instance:
164
+ ```bash
165
+ docker-compose up -d postgres
166
+ ```
167
+ If Docker/Colima is not running, this will fail with a daemon connection error.
168
+
169
+ ## Build pip package (`dbt-vectorize`)
170
+
171
+ Build from the repo root (factorlens-style; bundles the Rust binary into the wheel):
172
+ ```bash
173
+ ./scripts/build_wheel_with_binary.sh
174
+ ```
175
+
176
+ Artifacts will be written to `dist/`.
177
+ Install locally:
178
+ ```bash
179
+ python -m pip install dist/dbt_vectorize-*.whl
180
+ ```
181
+
182
+ CLI entrypoint after install:
183
+ ```bash
184
+ dbt-vectorize --select vector_knowledge_base
185
+ ```
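After install, the CLI resolves the embedding backend in a fixed order: an explicit env override, then the binary bundled in the wheel, then a cargo fallback for source checkouts. A simplified sketch of that resolution (condensed from `dbt_vectorize/cli.py`):

```python
from __future__ import annotations

import os
import shutil
from pathlib import Path

def resolve_embedder(pkg_dir: Path) -> list | None:
    """Simplified backend resolution order from dbt_vectorize/cli.py."""
    explicit = os.environ.get("DBT_VECTORIZE_PG_EMBEDDER")
    if explicit:
        return [explicit]                      # 1. explicit override
    bundled = pkg_dir / "bin" / "pg_embedder"
    if bundled.exists() and os.access(bundled, os.X_OK):
        return [str(bundled)]                  # 2. binary bundled in the wheel
    cargo = shutil.which("cargo")
    if cargo:                                  # 3. build/run from a repo checkout
        return [cargo, "run", "--quiet", "--bin", "pg_embedder",
                "--release", "--"]
    return None                                # real CLI raises FileNotFoundError
```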
186
+
187
+ CI release wheel build (macOS arm64 + Linux x86_64):
188
+ - workflow file: `.github/workflows/release.yml`
189
+ - trigger manually from Actions or push a `v*` tag
190
+ - outputs platform-specific wheels under workflow artifacts / GitHub release assets
191
+
192
+ ### Supported embedding dimensions (set `EMBED_DIMS` to match)
193
+ - OpenAI `text-embedding-3-small`: 1536 (can request smaller via API parameter)
194
+ - OpenAI `text-embedding-3-large`: 3072 (can request smaller)
195
+ - Bedrock Titan embed text v2: 1024 (or 512/256)
196
+ - Bedrock Titan embed text v1: 1024 (or 512/256)
197
+ - Bedrock Cohere Embed v4: 1536 (or 1024/512/256)
198
+ - Local MiniLM (all-MiniLM-L6-v2 ONNX): 384
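A mismatched `EMBED_DIMS` surfaces only at insert time as a pgvector dimension error, so it can pay to validate up front. A small sketch of such a check — the helper names are hypothetical and the table above is the source of truth (the OpenAI entries list only the default size, since smaller sizes are requested via an API parameter):

```python
# Hypothetical helper; mirrors the "Supported embedding dimensions" table.
SUPPORTED_DIMS = {
    "text-embedding-3-small": {1536},
    "text-embedding-3-large": {3072},
    "amazon.titan-embed-text-v2:0": {1024, 512, 256},
    "sentence-transformers/all-MiniLM-L6-v2": {384},
}

def check_dims(model: str, embed_dims: int) -> bool:
    """True when EMBED_DIMS is a known-valid size for the model."""
    return embed_dims in SUPPORTED_DIMS.get(model, set())
```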
199
+
200
+ ## Notes
201
+ - The Rust embedder is Python-free.
202
+ - Keep your Postgres `vector` column dimension aligned with `EMBED_DIMS`.
203
+ - IVFFLAT indexes warn on very small datasets; that’s expected. Rebuild after you have more rows.
@@ -0,0 +1,3 @@
1
+ __all__ = ["__version__"]
2
+
3
+ __version__ = "0.1.4"
@@ -0,0 +1,2 @@
1
+ """Package data holder for bundled native binaries."""
2
+
@@ -0,0 +1,165 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import shutil
5
+ import subprocess
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ import yaml
10
+
11
+ def _resolve_cwd() -> Path:
12
+ explicit = os.environ.get("DBT_VECTORIZE_CWD")
13
+ if explicit:
14
+ return Path(explicit).expanduser().resolve()
15
+ return Path.cwd()
16
+
17
+
18
+ def _find_repo_root(start: Path) -> Path | None:
19
+ env_root = os.environ.get("DBT_VECTORIZE_REPO")
20
+ if env_root:
21
+ p = Path(env_root).expanduser().resolve()
22
+ if (p / "dbt_project.yml").exists() and (p / "rust" / "embedding_engine" / "Cargo.toml").exists():
23
+ return p
24
+
25
+ cur = start.resolve()
26
+ for p in [cur, *cur.parents]:
27
+ if (p / "dbt_project.yml").exists() and (p / "rust" / "embedding_engine" / "Cargo.toml").exists():
28
+ return p
29
+ return None
30
+
31
+
32
+ def _packaged_embedder() -> str | None:
33
+ pkg_dir = Path(__file__).resolve().parent
34
+ candidates = [
35
+ pkg_dir / "bin" / "pg_embedder",
36
+ pkg_dir / "bin" / "pg_embedder.exe",
37
+ ]
38
+ for c in candidates:
39
+ if c.exists() and os.access(c, os.X_OK):
40
+ return str(c)
41
+ return None
42
+
43
+
44
+ def _find_pg_embedder_cmd(cwd: Path) -> tuple[list[str], Path | None]:
45
+ explicit = os.environ.get("DBT_VECTORIZE_PG_EMBEDDER")
46
+ if explicit:
47
+ return [explicit], cwd
48
+
49
+ packaged = _packaged_embedder()
50
+ if packaged:
51
+ return [packaged], cwd
52
+
53
+ cargo = shutil.which("cargo")
54
+ repo = _find_repo_root(cwd)
55
+ if cargo and repo:
56
+ return [cargo, "run", "--quiet", "--bin", "pg_embedder", "--release", "--"], repo / "rust" / "embedding_engine"
57
+
58
+ raise FileNotFoundError(
59
+ "Could not find runnable pg_embedder backend. "
60
+ "Set DBT_VECTORIZE_PG_EMBEDDER, install wheel with bundled binary, "
61
+ "or run inside a cloned dbt-vector repo with Rust/cargo available."
62
+ )
63
+
64
+
65
+ def _build_dbt_cmd(cwd: Path, argv: list[str]) -> tuple[list[str], dict[str, str], str, str]:
66
+ dbt = os.environ.get("DBT", "dbt")
67
+ profile_dir = os.environ.get("PROFILE_DIR") or os.environ.get("DBT_PROFILES_DIR") or str(cwd)
68
+ project_dir = os.environ.get("PROJECT_DIR") or str(cwd)
69
+ select_model = os.environ.get("SELECT_MODEL", "vector_knowledge_base")
70
+ embed_provider = os.environ.get("EMBED_PROVIDER", "local")
71
+ embed_model = os.environ.get("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
72
+
73
+ run_args = list(argv) if argv else ["--select", select_model]
74
+ cmd = [
75
+ dbt,
76
+ "run",
77
+ "--no-partial-parse",
78
+ "--profiles-dir",
79
+ profile_dir,
80
+ "--project-dir",
81
+ project_dir,
82
+ *run_args,
83
+ ]
84
+ env = os.environ.copy()
85
+ env["EMBED_PROVIDER"] = embed_provider
86
+ env["EMBED_MODEL"] = embed_model
87
+ return cmd, env, embed_provider, embed_model
88
+
89
+
90
+ def _build_embed_env(cwd: Path) -> dict[str, str]:
91
+ env = os.environ.copy()
92
+ profile_dir = env.get("PROFILE_DIR") or env.get("DBT_PROFILES_DIR") or str(cwd)
93
+ profile_name = env.get("PROFILE", "default")
94
+ target_name = env.get("TARGET")
95
+ profile_file = env.get("PROFILE_FILE") or str(Path(profile_dir) / "profiles.yml")
96
+
97
+ # Fallback to dbt profile values when PG* are not explicitly provided.
98
+ if (
99
+ not env.get("PGHOST")
100
+ or not env.get("PGPORT")
101
+ or not env.get("PGUSER")
102
+ or not env.get("PGDATABASE")
103
+ ):
104
+ p = Path(profile_file)
105
+ if p.exists():
106
+ with p.open("r", encoding="utf-8") as f:
107
+ data = yaml.safe_load(f) or {}
108
+ profile = data.get(profile_name, {}) or {}
109
+ outputs = profile.get("outputs", {}) or {}
110
+ target = target_name or profile.get("target")
111
+ if not target and outputs:
112
+ target = next(iter(outputs.keys()))
113
+ cfg = outputs.get(target, {}) if target else {}
114
+ if cfg.get("type") == "postgres":
115
+ if not env.get("PGHOST") and cfg.get("host") is not None:
116
+ env["PGHOST"] = str(cfg["host"])
117
+ if not env.get("PGPORT") and cfg.get("port") is not None:
118
+ env["PGPORT"] = str(cfg["port"])
119
+ if not env.get("PGUSER") and cfg.get("user") is not None:
120
+ env["PGUSER"] = str(cfg["user"])
121
+ if not env.get("PGPASSWORD") and cfg.get("password") is not None:
122
+ env["PGPASSWORD"] = str(cfg["password"])
123
+ if not env.get("PGDATABASE") and cfg.get("dbname") is not None:
124
+ env["PGDATABASE"] = str(cfg["dbname"])
125
+ if not env.get("SCHEMA") and cfg.get("schema") is not None:
126
+ env["SCHEMA"] = str(cfg["schema"])
127
+
128
+ env.setdefault("PGHOST", "localhost")
129
+ env.setdefault("PGPORT", "5432")
130
+ env.setdefault("PGUSER", "postgres")
131
+ env.setdefault("PGPASSWORD", "")
132
+ env.setdefault("PGDATABASE", "postgres")
133
+ env.setdefault("SCHEMA", "public")
134
+ env.setdefault("INDEX_NAME", "knowledge_base")
135
+ env.setdefault("EMBED_DIMS", "1536")
136
+ env.setdefault("EMBED_PROVIDER", "local")
137
+ env.setdefault("EMBED_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
138
+ return env
139
+
140
+
141
+ def main() -> int:
142
+ argv = sys.argv[1:]
143
+ cwd = _resolve_cwd()
144
+
145
+ dbt_cmd, dbt_env, provider, model = _build_dbt_cmd(cwd, argv)
146
+ print(f"[vectorize] running dbt model selection (provider={provider}, model={model})")
147
+ dbt_proc = subprocess.run(dbt_cmd, cwd=str(cwd), env=dbt_env)
148
+ if dbt_proc.returncode != 0:
149
+ return dbt_proc.returncode
150
+
151
+ embed_cmd, embed_cwd = _find_pg_embedder_cmd(cwd)
152
+ embed_env = _build_embed_env(cwd)
153
+ schema = embed_env.get("SCHEMA", "public")
154
+ index_name = embed_env.get("INDEX_NAME", "knowledge_base")
155
+ print(f"[vectorize] generating embeddings via Rust into {schema}.{index_name}")
156
+ embed_proc = subprocess.run(embed_cmd, cwd=str(embed_cwd or cwd), env=embed_env)
157
+ if embed_proc.returncode != 0:
158
+ return embed_proc.returncode
159
+
160
+ print("[vectorize] done.")
161
+ return 0
162
+
163
+
164
+ if __name__ == "__main__":
165
+ raise SystemExit(main())
@@ -0,0 +1,13 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ dbt_vectorize/__init__.py
5
+ dbt_vectorize/cli.py
6
+ dbt_vectorize.egg-info/PKG-INFO
7
+ dbt_vectorize.egg-info/SOURCES.txt
8
+ dbt_vectorize.egg-info/dependency_links.txt
9
+ dbt_vectorize.egg-info/entry_points.txt
10
+ dbt_vectorize.egg-info/requires.txt
11
+ dbt_vectorize.egg-info/top_level.txt
12
+ dbt_vectorize/bin/__init__.py
13
+ dbt_vectorize/bin/pg_embedder
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dbt-vectorize = dbt_vectorize.cli:main
@@ -0,0 +1 @@
1
+ PyYAML>=6.0
@@ -0,0 +1 @@
1
+ dbt_vectorize
@@ -0,0 +1,39 @@
1
+ [build-system]
2
+ requires = ["setuptools>=70.1"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "dbt-vectorize"
7
+ version = "0.1.4"
8
+ description = "dbt + Rust vectorization runner for pgvector"
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ requires-python = ">=3.9"
12
+ authors = [
13
+ { name = "Maria Dubyaga", email = "kraftaa@gmail.com" }
14
+ ]
15
+ dependencies = [
16
+ "PyYAML>=6.0",
17
+ ]
18
+ classifiers = [
19
+ "Environment :: Console",
20
+ "Intended Audience :: Developers",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Rust",
23
+ "Topic :: Database",
24
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
25
+ ]
26
+
27
+ [project.scripts]
28
+ dbt-vectorize = "dbt_vectorize.cli:main"
29
+
30
+ [project.urls]
31
+ Homepage = "https://github.com/kraftaa/dbt-vector"
32
+ Repository = "https://github.com/kraftaa/dbt-vector"
33
+
34
+ [tool.setuptools]
35
+ packages = ["dbt_vectorize", "dbt_vectorize.bin"]
36
+ include-package-data = true
37
+
38
+ [tool.setuptools.package-data]
39
+ dbt_vectorize = ["bin/*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,27 @@
1
+ from setuptools import setup
2
+
3
+ try:
4
+ from setuptools.command.bdist_wheel import bdist_wheel as _bdist_wheel
5
+ except Exception:
6
+ try:
7
+ from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
8
+ except Exception:
9
+ _bdist_wheel = None
10
+
11
+
12
+ if _bdist_wheel is not None:
13
+ class bdist_wheel(_bdist_wheel):
14
+ # Force platform wheels because we bundle a native Rust binary.
15
+ def finalize_options(self):
16
+ super().finalize_options()
17
+ self.root_is_pure = False
18
+
19
+ # The package works with any Python 3 version; only the platform matters.
20
+ def get_tag(self):
21
+ _py, _abi, plat = super().get_tag()
22
+ return "py3", "none", plat
23
+
24
+
25
+ setup(cmdclass={"bdist_wheel": bdist_wheel})
26
+ else:
27
+ setup()