vedana_etl-0.1.0.dev3-py3-none-any.whl

vedana_etl/__init__.py ADDED
File without changes
vedana_etl/app.py ADDED
@@ -0,0 +1,10 @@
+ from datapipe.compute import Catalog
+ from datapipe_app import DatapipeAPI
+
+ from vedana_etl.config import ds
+ from vedana_etl.pipeline import default_custom_steps, get_pipeline
+
+ # base app - no extra tables / steps
+ pipeline = get_pipeline(custom_steps=default_custom_steps)
+
+ app = DatapipeAPI(ds, Catalog({}), pipeline)
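The module builds the stock pipeline and exposes it through DatapipeAPI with an empty extra catalog. A deployment that adds its own steps would presumably assemble the app the same way; a minimal sketch, where my_steps is a hypothetical list of extra datapipe steps, not part of this package:

    # Hypothetical variant of app.py that appends project-specific steps.
    from datapipe.compute import Catalog
    from datapipe_app import DatapipeAPI

    from vedana_etl.config import ds
    from vedana_etl.pipeline import default_custom_steps, get_pipeline

    my_steps = [...]  # extra BatchTransform / BatchGenerate steps (placeholder)
    pipeline = get_pipeline(custom_steps=default_custom_steps + my_steps)
    app = DatapipeAPI(ds, Catalog({}), pipeline)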
vedana_etl/catalog.py ADDED
@@ -0,0 +1,266 @@
+ from datapipe.compute import Table
+ from datapipe.store.database import TableStoreDB
+ from datapipe.store.neo4j import Neo4JStore
+ from pgvector.sqlalchemy import Vector
+ from sqlalchemy import Boolean, Column, Float, String
+ from vedana_core.settings import settings as core_settings
+
+ import vedana_etl.schemas as schemas
+ from vedana_etl.config import DBCONN_DATAPIPE, MEMGRAPH_CONN_ARGS
+
+ dm_links = Table(
+     name="dm_links",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="dm_links",
+         data_sql_schema=[
+             Column("anchor1", String, primary_key=True),
+             Column("anchor2", String, primary_key=True),
+             Column("sentence", String, primary_key=True),
+             Column("description", String),
+             Column("query", String),
+             Column("anchor1_link_column_name", String),
+             Column("anchor2_link_column_name", String),
+             Column("has_direction", Boolean, default=False),
+         ],
+     ),
+ )
+
+ dm_anchor_attributes = Table(
+     name="dm_anchor_attributes",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="dm_anchor_attributes",
+         data_sql_schema=[
+             Column("anchor", String, primary_key=True),
+             Column("attribute_name", String, primary_key=True),
+             Column("description", String),
+             Column("data_example", String),
+             Column("embeddable", Boolean),
+             Column("query", String),
+             Column("dtype", String),
+             Column("embed_threshold", Float),
+         ],
+     ),
+ )
+
+ dm_link_attributes = Table(
+     name="dm_link_attributes",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="dm_link_attributes",
+         data_sql_schema=[
+             Column("link", String, primary_key=True),
+             Column("attribute_name", String, primary_key=True),
+             Column("description", String),
+             Column("data_example", String),
+             Column("embeddable", Boolean),
+             Column("query", String),
+             Column("dtype", String),
+             Column("embed_threshold", Float),
+         ],
+     ),
+ )
+
+ dm_anchors = Table(
+     name="dm_anchors",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="dm_anchors",
+         data_sql_schema=[
+             Column("noun", String, primary_key=True),
+             Column("description", String),
+             Column("id_example", String),
+             Column("query", String),
+         ],
+     ),
+ )
+
+ dm_queries = Table(
+     name="dm_queries",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="dm_queries",
+         data_sql_schema=[
+             Column("query_name", String, primary_key=True),
+             Column("query_example", String),
+         ],
+     ),
+ )
+
+ dm_prompts = Table(
+     name="dm_prompts",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="dm_prompts",
+         data_sql_schema=[
+             Column("name", String, primary_key=True),
+             Column("text", String),
+         ],
+     ),
+ )
+
+ dm_conversation_lifecycle = Table(
+     name="dm_conversation_lifecycle",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="dm_conversation_lifecycle",
+         data_sql_schema=[
+             Column("event", String, primary_key=True),
+             Column("text", String),
+         ],
+     ),
+ )
+
+ grist_nodes = Table(
+     name="grist_nodes",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="grist_nodes",
+         data_sql_schema=schemas.GENERIC_NODE_DATA_SCHEMA,
+     ),
+ )
+
+ grist_edges = Table(
+     name="grist_edges",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="grist_edges",
+         data_sql_schema=schemas.GENERIC_EDGE_DATA_SCHEMA,
+     ),
+ )
+
+ # --- Tables used as input for memgraph ---
+
+ nodes = Table(
+     name="nodes",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="nodes",
+         data_sql_schema=schemas.GENERIC_NODE_DATA_SCHEMA,
+     ),
+ )
+
+ edges = Table(
+     name="edges",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="edges",
+         data_sql_schema=schemas.GENERIC_EDGE_DATA_SCHEMA,
+     ),
+ )
+
+ # --- Memgraph-related tables ---
+
+ memgraph_anchor_indexes = Table(
+     name="memgraph_anchor_indexes",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="memgraph_anchor_indexes",
+         data_sql_schema=[
+             Column("anchor", String, primary_key=True),
+         ],
+     ),
+ )
+
+ memgraph_link_indexes = Table(
+     name="memgraph_link_indexes",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="memgraph_link_indexes",
+         data_sql_schema=[
+             Column("link", String, primary_key=True),
+         ],
+     ),
+ )
+
+ memgraph_anchor_vector_indexes = Table(
+     name="memgraph_anchor_vector_indexes",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="memgraph_anchor_vector_indexes",
+         data_sql_schema=[
+             Column("anchor", String, primary_key=True),
+             Column("attribute_name", String, primary_key=True),
+         ],
+     ),
+ )
+
+ memgraph_link_vector_indexes = Table(
+     name="memgraph_link_vector_indexes",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="memgraph_link_vector_indexes",
+         data_sql_schema=[
+             Column("link", String, primary_key=True),
+             Column("attribute_name", String, primary_key=True),
+         ],
+     ),
+ )
+
+ memgraph_nodes = Table(
+     name="memgraph_nodes",
+     store=Neo4JStore(
+         connection_kwargs=MEMGRAPH_CONN_ARGS,
+         data_sql_schema=schemas.GENERIC_NODE_DATA_SCHEMA,
+     ),
+ )
+
+ memgraph_edges = Table(
+     name="memgraph_edges",
+     store=Neo4JStore(
+         connection_kwargs=MEMGRAPH_CONN_ARGS,
+         data_sql_schema=schemas.GENERIC_EDGE_DATA_SCHEMA,
+     ),
+ )
+
+ # --- VTS (pgvector) ---
+ # The embedding column size is fixed for indexing and is defined through settings; the definition is then frozen in migrations.
+
+ rag_anchor_embeddings = Table(
+     name="rag_anchor_embeddings",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="rag_anchor_embeddings",
+         data_sql_schema=[
+             Column("node_id", String, primary_key=True),
+             Column("attribute_name", String, primary_key=True),
+             Column("label", String, nullable=False),
+             Column("attribute_value", String),
+             Column("embedding", Vector(dim=core_settings.embeddings_dim), nullable=False),
+         ],
+     ),
+ )
+
+ rag_edge_embeddings = Table(
+     name="rag_edge_embeddings",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="rag_edge_embeddings",
+         data_sql_schema=[
+             Column("from_node_id", String, primary_key=True),
+             Column("to_node_id", String, primary_key=True),
+             Column("edge_label", String, primary_key=True),
+             Column("attribute_name", String, primary_key=True),
+             Column("attribute_value", String),
+             Column("embedding", Vector(dim=core_settings.embeddings_dim), nullable=False),
+         ],
+     ),
+ )
+
+ # --- Eval pipeline ---
+
+ eval_gds = Table(
+     name="eval_gds",
+     store=TableStoreDB(
+         dbconn=DBCONN_DATAPIPE,
+         name="eval_gds",
+         data_sql_schema=[
+             Column("gds_question", String, primary_key=True),
+             Column("gds_answer", String),
+             Column("question_scenario", String),
+             Column("question_comment", String),
+             Column("question_context", String),
+         ],
+     ),
+ )
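Every Postgres-backed table above follows the same pattern: a datapipe Table whose store is a TableStoreDB bound to DBCONN_DATAPIPE, with the SQLAlchemy columns doubling as schema and primary key. A hedged sketch of what a downstream extension could look like (the my_documents table and its columns are invented for illustration, not shipped with vedana-etl):

    from datapipe.compute import Table
    from datapipe.store.database import TableStoreDB
    from sqlalchemy import Column, String

    from vedana_etl.config import DBCONN_DATAPIPE

    # Hypothetical extra table following the catalog's Table/TableStoreDB pattern.
    my_documents = Table(
        name="my_documents",
        store=TableStoreDB(
            dbconn=DBCONN_DATAPIPE,
            name="my_documents",
            data_sql_schema=[
                Column("doc_id", String, primary_key=True),
                Column("text", String),
            ],
        ),
    )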
vedana_etl/config.py ADDED
@@ -0,0 +1,22 @@
+ import json
+ from functools import partial
+
+ from datapipe.compute import Catalog
+ from datapipe.datatable import DataStore
+ from datapipe.store.database import DBConn
+ from vedana_core.settings import settings as core_settings
+
+ from vedana_etl.settings import settings
+
+ MEMGRAPH_CONN_ARGS = {
+     "uri": core_settings.memgraph_uri,
+     "auth": (core_settings.memgraph_user, core_settings.memgraph_pwd),
+ }
+
+ DBCONN_DATAPIPE = DBConn(
+     connstr=settings.db_conn_uri,
+     create_engine_kwargs=dict(json_serializer=partial(json.dumps, ensure_ascii=False)),
+ )
+
+
+ ds = DataStore(DBCONN_DATAPIPE)
+ catalog = Catalog({})
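DBCONN_DATAPIPE is built from settings.db_conn_uri, and json_serializer=partial(json.dumps, ensure_ascii=False) keeps non-ASCII attribute values human-readable in JSONB columns. Since Settings uses an empty env_prefix (see vedana_etl/settings.py below), the URI would normally arrive via a DB_CONN_URI environment variable or a .env entry; a sketch, with placeholder host and credentials:

    # Exported or placed in .env before importing the package (placeholder values):
    # DB_CONN_URI=postgresql://vedana:secret@localhost:5432/vedana_etl
    from vedana_etl.config import ds, catalog  # importing config binds the engine to that URI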
vedana_etl/pipeline.py ADDED
@@ -0,0 +1,142 @@
+ from datapipe.compute import Pipeline
+ from datapipe.step.batch_generate import BatchGenerate
+ from datapipe.step.batch_transform import BatchTransform
+
+ import vedana_etl.steps as steps
+ from vedana_etl.catalog import (
+     dm_anchor_attributes,
+     dm_anchors,
+     dm_conversation_lifecycle,
+     dm_link_attributes,
+     dm_links,
+     dm_prompts,
+     dm_queries,
+     edges,
+     eval_gds,
+     grist_edges,
+     grist_nodes,
+     memgraph_anchor_indexes,
+     memgraph_edges,
+     memgraph_link_indexes,
+     memgraph_nodes,
+     nodes,
+     rag_anchor_embeddings,
+     rag_edge_embeddings,
+ )
+
+ data_model_steps = [
+     BatchGenerate(
+         func=steps.get_data_model,  # Generator with main graph data
+         outputs=[
+             dm_anchors,
+             dm_anchor_attributes,
+             dm_link_attributes,
+             dm_links,
+             dm_queries,
+             dm_prompts,
+             dm_conversation_lifecycle,
+         ],
+         labels=[("flow", "regular"), ("flow", "on-demand"), ("stage", "extract"), ("stage", "data-model")],
+     ),
+ ]
+
+ grist_steps = [
+     BatchGenerate(
+         func=steps.get_grist_data,
+         outputs=[grist_nodes, grist_edges],
+         labels=[("flow", "on-demand"), ("stage", "extract"), ("stage", "grist")],
+     ),
+ ]
+
+ # ---
+ # This part is customisable: it can be replaced to wire in other pipeline branches.
+
+ default_custom_steps = [
+     BatchTransform(
+         func=steps.prepare_nodes,
+         inputs=[grist_nodes],
+         outputs=[nodes],
+         labels=[("flow", "on-demand"), ("stage", "transform"), ("stage", "grist")],
+         transform_keys=["node_id"],
+     ),
+     BatchTransform(
+         func=steps.prepare_edges,
+         inputs=[grist_edges],
+         outputs=[edges],
+         labels=[("flow", "on-demand"), ("stage", "transform"), ("stage", "grist")],
+         transform_keys=["from_node_id", "to_node_id", "edge_label"],
+     ),
+ ]
+
+ # --- Loading data to Memgraph and Vector Store ---
+
+ memgraph_steps = [
+     BatchTransform(
+         func=steps.ensure_memgraph_node_indexes,
+         inputs=[dm_anchor_attributes],
+         outputs=[memgraph_anchor_indexes],
+         labels=[("flow", "regular"), ("flow", "on-demand"), ("stage", "load")],
+         transform_keys=["attribute_name"],
+     ),
+     BatchTransform(
+         func=steps.ensure_memgraph_edge_indexes,
+         inputs=[dm_link_attributes],
+         outputs=[memgraph_link_indexes],
+         labels=[("flow", "regular"), ("flow", "on-demand"), ("stage", "load")],
+         transform_keys=["attribute_name"],
+     ),
+     BatchTransform(
+         func=steps.pass_df_to_memgraph,
+         inputs=[nodes],
+         outputs=[memgraph_nodes],
+         labels=[("flow", "regular"), ("flow", "on-demand"), ("stage", "load")],
+         transform_keys=["node_id", "node_type"],
+     ),
+     BatchTransform(
+         func=steps.pass_df_to_memgraph,
+         inputs=[edges],
+         outputs=[memgraph_edges],
+         labels=[("flow", "regular"), ("flow", "on-demand"), ("stage", "load")],
+         transform_keys=["from_node_id", "to_node_id", "edge_label"],
+     ),
+     BatchTransform(
+         func=steps.generate_embeddings,
+         inputs=[nodes, dm_anchor_attributes],
+         outputs=[rag_anchor_embeddings],
+         labels=[("flow", "regular"), ("flow", "on-demand"), ("stage", "load")],
+         transform_keys=["node_id", "node_type"],
+     ),
+     BatchTransform(
+         func=steps.generate_embeddings,
+         inputs=[edges, dm_link_attributes],
+         outputs=[rag_edge_embeddings],
+         labels=[("flow", "regular"), ("flow", "on-demand"), ("stage", "load")],
+         transform_keys=["from_node_id", "to_node_id", "edge_label"],
+     ),
+ ]
+
+ eval_steps = [
+     BatchGenerate(
+         func=steps.get_eval_gds_from_grist,
+         outputs=[eval_gds],
+         labels=[("pipeline", "eval"), ("flow", "eval"), ("stage", "extract")],
+     ),
+ ]
+
+
+ def get_data_model_pipeline() -> Pipeline:
+     return Pipeline(data_model_steps)
+
+
+ def get_pipeline(custom_steps: list) -> Pipeline:
+     pipeline = Pipeline(
+         [
+             *data_model_steps,
+             *grist_steps,
+             *custom_steps,
+             *memgraph_steps,
+             *eval_steps,
+         ]
+     )
+
+     return pipeline
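get_pipeline splices the caller's steps between the Grist extract and the Memgraph/vector load, so custom_steps is the intended extension point. A minimal sketch of a replacement for default_custom_steps, assuming a hypothetical my_prepare_nodes function that preserves the generic node schema and the node_id key:

    import pandas as pd
    from datapipe.step.batch_transform import BatchTransform

    from vedana_etl.catalog import grist_nodes, nodes
    from vedana_etl.pipeline import get_pipeline

    def my_prepare_nodes(df: pd.DataFrame) -> pd.DataFrame:
        # Project-specific cleanup; must return GENERIC_NODE_DATA_SCHEMA columns.
        return df

    custom_steps = [
        BatchTransform(
            func=my_prepare_nodes,
            inputs=[grist_nodes],
            outputs=[nodes],
            transform_keys=["node_id"],
        ),
    ]
    pipeline = get_pipeline(custom_steps=custom_steps)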
vedana_etl/py.typed ADDED
File without changes
vedana_etl/schemas.py ADDED
@@ -0,0 +1,31 @@
+ from sqlalchemy import Column, String
+ from sqlalchemy.dialects.postgresql import JSONB
+
+ GENERIC_NODE_DATA_SCHEMA: list[Column] = [
+     Column("node_id", String, primary_key=True),
+     Column("node_type", String, primary_key=True),
+     Column("attributes", JSONB),
+ ]
+
+ GENERIC_EDGE_DATA_SCHEMA: list[Column] = [
+     Column("from_node_id", String, primary_key=True),
+     Column("to_node_id", String, primary_key=True),
+     Column("from_node_type", String, primary_key=True),
+     Column("to_node_type", String, primary_key=True),
+     Column("edge_label", String, primary_key=True),
+     Column("attributes", JSONB),
+ ]
+
+ # ---
+ # Evaluation pipeline schemas
+
+ DM_VERSIONING_TABLE_SCHEMA: list[Column] = [
+     Column("dm_id", String, primary_key=True),
+     Column("dm_description", String),
+ ]
+
+ EVAL_GDS_SCHEMA: list[Column] = [
+     Column("gds_question", String, primary_key=True),
+     Column("gds_answer", String),
+     Column("question_context", String),
+ ]
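The generic schemas make graph rows self-describing: identity and type live in the keyed String columns, while everything else travels in the JSONB attributes blob. Example rows (all values invented for illustration):

    # One node row and one edge row matching the generic schemas:
    node_row = {"node_id": "p-1", "node_type": "person", "attributes": {"name": "Ada"}}
    edge_row = {
        "from_node_id": "p-1",
        "to_node_id": "c-1",
        "from_node_type": "person",
        "to_node_type": "company",
        "edge_label": "works_at",
        "attributes": {"since": "2021"},
    }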
vedana_etl/settings.py ADDED
@@ -0,0 +1,23 @@
+ from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+ class Settings(BaseSettings):
+     model_config = SettingsConfigDict(
+         env_prefix="",
+         env_file=".env",
+         env_file_encoding="utf-8",
+         extra="ignore",
+     )
+
+     # Datapipe connection URI
+     # db_conn_uri: str = "sqlite+pysqlite3:///db.sqlite"
+     db_conn_uri: str
+
+     # Tests pipeline (vedana-eval) settings.
+     grist_test_set_doc_id: str = ""
+     gds_table_name: str = "Gds"  # Table names in the test set doc
+     tests_table_name: str = "Tests"
+     test_environment: str = ""
+
+
+ settings = Settings()  # type: ignore
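With an empty env_prefix, each field maps to a same-named environment variable (matched case-insensitively by pydantic-settings), e.g. DB_CONN_URI or GRIST_TEST_SET_DOC_ID; .env values are merged in and unknown keys are ignored. A quick sanity check, assuming the variables are set:

    from vedana_etl.settings import settings

    # The import itself raises a validation error if DB_CONN_URI is missing,
    # since db_conn_uri has no default and settings is instantiated at module load.
    print(settings.db_conn_uri)
    print(settings.gds_table_name)  # "Gds" unless overridden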