PyPI - vedana-etl - Versions diffs - 0.6.0.dev1__tar.gz → 0.6.2__tar.gz - Mend

vedana-etl 0.6.0.dev1__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vedana-etl
-Version: 0.6.0.dev1
+Version: 0.6.2
 Summary: Pipeline template for Vedana
 Author-email: Andrey Tatarinov <a@tatarinov.co>, Timur Sheydaev <tsheyd@epoch8.co>
 Requires-Python: >=3.12

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/catalog.py RENAMED Viewed

@@ -224,8 +224,8 @@ rag_anchor_embeddings = Table(
         name="rag_anchor_embeddings",
         data_sql_schema=[
             Column("node_id", String, primary_key=True),
+            Column("node_type", String, primary_key=True),
             Column("attribute_name", String, primary_key=True),
-            Column("label", String, nullable=False),
             Column("attribute_value", String),
             Column("embedding", Vector(dim=core_settings.embeddings_dim), nullable=False),
         ],

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/steps.py RENAMED Viewed

@@ -604,7 +604,8 @@ def generate_embeddings(
                 tasks.append((pos, attr_name, text_val))
     if not tasks:
-        return df
+        output_columns = pkeys + ["attribute_name", "attribute_value", "embedding"]
+        return pd.DataFrame(columns=output_columns)
     provider = LLMProvider()

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/.env.example RENAMED Viewed

@@ -4,7 +4,7 @@ GRIST_API_KEY=""
 GRIST_DATA_MODEL_DOC_ID="krvDzKM6mbFRokq3m8NFq9"
 GRIST_DATA_DOC_ID="u2hBF97eDygXmM1sXK9rA4"
-# Memgraph (локальный деплоймент)
+# Memgraph (local deployment)
 MEMGRAPH_URI="bolt://memgraph:7687"
 MEMGRAPH_USER="neo4j"
 MEMGRAPH_PWD="modular-current-bonjour-senior-neptune-8618"

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/docker-compose.yml RENAMED Viewed

@@ -2,7 +2,7 @@ services:
   grist:
     image: gristlabs/grist:latest
     environment:
-      # (пример из grist-local-testing; подходит только для CI/локала)
+      # (example from grist-local-testing; suitable only for CI/local)
       GRIST_API_KEY: "e30d2f274a538c05fecd14510887f8a3b7eab718"
       GRIST_DATA_MODEL_DOC_ID: "aAco4qS9Dvf8"
       GRIST_DATA_DOC_ID: "kB1iVADLPGU5"
@@ -14,7 +14,7 @@ services:
     volumes:
       # Where to store persistent data, such as documents.
       - ${PERSIST_DIR:-./infra/persist}/grist:/persist
-      # CSV-фикстуры, читаем их скриптом
+      # CSV fixtures, read by script
       - ./fixtures/grist:/seed:ro
     healthcheck:
       test: [ "CMD", "bash", "-lc", "wget -qO- http://localhost:8484/api/status || exit 1" ]

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/docker-compose.ci.yml RENAMED Viewed

@@ -20,7 +20,7 @@ services:
   grist:
     image: gristlabs/grist:latest
     environment:
-      # (пример из grist-local-testing; подходит только для CI/локала)
+      # (example from grist-local-testing; for CI/local dev)
       GRIST_API_KEY: "e30d2f274a538c05fecd14510887f8a3b7eab718"
       GRIST_DATA_MODEL_DOC_ID: "aAco4qS9Dvf8"
       GRIST_DATA_DOC_ID: "kB1iVADLPGU5"
@@ -32,7 +32,7 @@ services:
     volumes:
       # Where to store persistent data, such as documents.
       - ${PERSIST_DIR:-./persist}/grist:/persist
-      # CSV-фикстуры, читаем их скриптом
+      # read CSV-fixtures
       - ../fixtures/grist:/seed:ro
     healthcheck:
       test: ["CMD", "bash", "-lc", "wget -qO- http://localhost:8484/api/status || exit 1"]

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/conftest.py RENAMED Viewed

@@ -14,9 +14,9 @@ load_dotenv()
 # -------- live Grist fixtures (NO mocks) --------
 @pytest.fixture(scope="session")
 def dm_dfs():
-    """Data Model из живой Grist: Anchors, Attributes, Links."""
+    """Data Model from live Grist: Anchors, Attributes, Links."""
     anchors_df, a_attrs_df, l_attrs_df, links_df, q_df, p_df, cl_df = next(steps.get_data_model())
-    # sanity: типы как в коде
+    # sanity: types as in code
     assert a_attrs_df["embeddable"].dtype == bool and l_attrs_df["embeddable"].dtype == bool
     assert "embed_threshold" in a_attrs_df.columns and "embed_threshold" in l_attrs_df.columns
     return anchors_df, a_attrs_df, l_attrs_df, links_df, q_df, p_df, cl_df
@@ -24,7 +24,7 @@ def dm_dfs():
 @pytest.fixture(scope="session")
 def raw_graph_dfs():
-    """Сырые nodes/edges из живой Grist."""
+    """Raw nodes/edges from live Grist."""
     nodes, edges = next(steps.get_grist_data())
     return nodes, edges
@@ -48,12 +48,12 @@ def live_memgraph_available():
     return _ping_memgraph()
-# детерминированный провайдер эмбеддингов, чтобы тесты были воспроизводимы
+# deterministic embeddings provider for reproducible tests
 @pytest.fixture
 def dummy_llm(monkeypatch):
     class DummyProv:
         def create_embeddings_sync(self, texts):
-            # фикс-вектор длины 8 (или сколько у тебя EMBEDDINGS_DIM — можно и динамически достать)
+            # fixed vector of length 8 (or however many EMBEDDINGS_DIM you have - can be fetched dynamically)
             return [[1.0] + [0.0] * (getattr(core_settings, "embeddings_dim", 8) - 1) for _ in texts]
     orig = steps.LLMProvider

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attribute_filtering.py RENAMED Viewed

@@ -1,18 +1,18 @@
 """
-Интеграционный тест: anchor_attribute_filtering
+Integration test: anchor_attribute_filtering
-Что проверяем:
-  - В граф попадают ТОЛЬКО атрибуты, описанные в дата-модели.
-  - Специально проверяем, что "document_random_attr" (присутствует в тестовых данных)
-    полностью удаляется у типа "document".
+What we check:
+  - ONLY attributes described in Data Model are included in the graph.
+  - Specifically verify that "document_random_attr" (present in test data)
+    is completely removed from "document" type nodes.
-Шаги:
-  1) Загружаем Data Model (Anchors/Attributes/Links) из живой Grist.
-  2) Загружаем сырые данные из живой Grist.
-  3) Проверяем:
-     - атрибуты (кроме служебного DataModel) — подмножество Data Model атрибутов,
-       допускаем только *_embedding (могут появиться позднее).
-     - для типа "document" ключ "document_random_attr" отсутствует.
+Steps:
+  1) Load Data Model (Anchors/Attributes/Links) from live Grist.
+  2) Load raw data from live Grist.
+  3) Verify:
+     - attributes (except service DataModel) are a subset of Data Model attributes,
+       only allowing *_embedding (may appear later).
+     - for "document" type, key "document_random_attr" is absent.
 """
 from typing import Dict, Set
@@ -26,25 +26,25 @@ load_dotenv()
 def test_anchor_attribute_filtering_removes_unknown() -> None:
     """
-    Основная проверка фильтрации атрибутов узлов.
+    Main verification of node attribute filtering.
     """
-    # 1) Проверяем Data Model
+    # 1) Verify Data Model
     anchors_df, a_attrs_df, _l_attrs_df, _links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
     assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
-    # 2) Проверяем данные из Grist
+    # 2) Verify data from Grist
     nodes_df, _ = next(steps.get_grist_data())
     assert not nodes_df.empty, "No nodes fetched from Grist."
-    # 3) Допустимые атрибуты по Data Model
+    # 3) Allowed attributes from Data Model
     allowed_attrs: Set[str] = set(a_attrs_df["attribute_name"].astype(str))
-    # 3.1) У каждого узла (кроме DataModel) ключи атрибутов ⊆ Data Model атрибутов (плюс *_embedding)
+    # 3.1) For each node (except DataModel) attribute keys should be subset of Data Model attributes (plus *_embedding)
     for _, row in nodes_df[nodes_df["node_type"] != "DataModel"].iterrows():
         attr_dict: Dict[str, object] = row["attributes"] or {}
         keys = set(map(str, attr_dict.keys()))
-        # разрешаем сгенерированные позже эмбеддинги
+        # allow generated embeddings that may appear later
         embedding_keys = {k for k in keys if k.endswith("_embedding")}
         unknown = keys - allowed_attrs - embedding_keys
         assert not unknown, f"""
@@ -52,7 +52,7 @@ def test_anchor_attribute_filtering_removes_unknown() -> None:
             Data Model: {sorted(unknown)}
             """
-    # 3.2) Специальный кейс: document_random_attr должен исчезнуть у document-узлов
+    # 3.2) Special case: document_random_attr should be removed from document nodes
     docs = nodes_df[nodes_df["node_type"] == "document"]
     assert not docs.empty, "Expected at least one 'document' node in test data."
     still_has_random = [
@@ -62,7 +62,7 @@ def test_anchor_attribute_filtering_removes_unknown() -> None:
         not still_has_random
     ), f"Unexpected attribute 'document_random_attr' is still present in document nodes: {still_has_random}"
-    # Убедимся, что у document осталось хотя бы одно валидное поле из DM, чтобы тест не проходил
-    # «пустым» набором атрибутов.
+    # Ensure document nodes still have at least one valid field from DM, so the test doesn't pass
+    # with an "empty" attribute set.
     any_valid_left = any(bool(set((row["attributes"] or {}).keys()) & allowed_attrs) for _, row in docs.iterrows())
     assert any_valid_left, "After filtering, document nodes should still have at least one attribute from Data Model."

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attributes_formula_type_column.py RENAMED Viewed

@@ -1,13 +1,13 @@
 """
-Интеграционный тест: anchor_attributes_formula_type_column
+Integration test: anchor_attributes_formula_type_column
-Цель:
-  - Колонки Grist с типом данных "Formula" в сырых данных (get_grist_data)
-    попадают как вычисленные значения (строки/числа и т.п., а не выражения).
-  - Если такая колонка описана в Data Model, она должна сохраняться
+Goal:
+  - Grist columns with "Formula" data type appear in raw data (get_grist_data)
+    as computed values (strings/numbers etc., not expressions).
+  - If such a column is described in Data Model, it should be preserved.
 Test data:
-  - Формульный атрибут: `document_filepath` (для узлов типа "document").
+  - Formula attribute: `document_filepath` (for nodes of type "document").
 """
 from typing import Any, Dict, Optional
@@ -21,30 +21,30 @@ load_dotenv()
 def test_anchor_attributes_formula_type_column() -> None:
     """
-    Проверяем поведение формульной колонки `document_filepath`:
-    - в сырых nodes присутствует как результат вычисления (не пустое значение);
-    - описана в Data Model и не отфильтровывается
+    Verify the behavior of formula column `document_filepath`:
+    - in raw nodes it appears as a computed value (non-empty);
+    - described in Data Model and not filtered out
     """
-    # 1) Живой Data Model
+    # 1) Live Data Model
     anchors_df, a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
     assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
     dm_attr_names = set(a_attrs_df["attribute_name"].astype(str))
-    # В этом кейсе ожидаем, что формульный атрибут описан в Data Model.
+    # In this case we expect the formula attribute to be described in Data Model.
     assert (
         "document_filepath" in dm_attr_names
     ), "Test precondition failed: 'document_filepath' must be present in Data Model."
-    # 2) Данные из живой Grist
+    # 2) Data from live Grist
     nodes_df, _ = next(steps.get_grist_data())
     assert not nodes_df.empty, "No nodes fetched from Grist."
     documents = nodes_df[nodes_df["node_type"] == "document"]
     assert not documents.empty, "Expected at least one 'document' node in raw data."
-    # 3) Найдём хотя бы одно непустое значение document_filepath в raw
+    # 3) Find at least one non-empty document_filepath value in raw
     raw_value: Optional[Any] = None
     raw_node_id: Optional[str] = None
     for _, row in documents.iterrows():
@@ -59,11 +59,11 @@ def test_anchor_attributes_formula_type_column() -> None:
         in at least one 'document' node (raw data).
         """
-    # 4) После фильтрации по Data Model атрибут должен сохраниться
+    # 4) After filtering by Data Model the attribute should be preserved
     docs_f = nodes_df[nodes_df["node_type"] == "document"]
     assert not docs_f.empty, "Filtered graph should still contain 'document' nodes."
-    # Проверим, что у того же узла (если он остался) поле всё ещё есть и непустое
+    # Verify that the same node (if it remained) still has the field and it's non-empty
     if raw_node_id is not None and (docs_f["node_id"] == raw_node_id).any():
         row = docs_f.loc[docs_f["node_id"] == raw_node_id].iloc[0]
         attrs_f: Dict[str, Any] = row["attributes"] or {}
@@ -71,7 +71,7 @@ def test_anchor_attributes_formula_type_column() -> None:
             "document_filepath" in attrs_f and str(attrs_f["document_filepath"]).strip()
         ), "Expected 'document_filepath' to be preserved by filtering logic because it is present in Data Model."
     else:
-        # Иначе просто убедимся, что у какого-то document-узла поле присутствует
+        # Otherwise just verify that some document node has the field
         found_any = False
         for _, row in docs_f.iterrows():
             attrs_f = row["attributes"] or {}

{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attributes_reference_type_column.py RENAMED Viewed

@@ -1,13 +1,13 @@
 """
-Интеграционный тест: anchor_attributes_reference_type_column (SQL Reference)
+Integration test: anchor_attributes_reference_type_column (SQL Reference)
-Фокус теста:
-  - При чтении через Grist SQL провайдер Reference-колонка приходит как <ref_id> + gristHelper_<col>,
-    а в итоговом attributes должно быть **строковое значение** (как в UI), без gristHelper_* ключей.
-  - Если Reference-колонка описана в Data Model, она должна сохраниться при фильтрации.
+Test:
+  - When reading via Grist SQL provider, a Reference column comes as <ref_id> + gristHelper_<col>,
+    but the final attributes should contain **string value** (as in UI), without gristHelper_* keys.
+  - If a Reference column is described in Data Model, it should be preserved during filtering.
-Тестовые данные:
- - reference-поле: document_reference_attr (есть в Data Model)
+Test data:
+ - reference field: document_reference_attr (exists in Data Model)
 """
 from typing import Any, Dict, Optional
@@ -21,14 +21,14 @@ load_dotenv()
 def test_anchor_attributes_reference_type_column() -> None:
     """
-    1) В сыром nodes (get_grist_data) у хотя бы одного document-узла
-       `document_reference_attr` присутствует как непустая строка.
-       В attributes не должно быть gristHelper_* ключей.
-    2) После фильтрации:
-       - `document_reference_attr` остаётся (т.к. она есть в Data Model).
+    1) In raw nodes (get_grist_data) at least one document node has
+       `document_reference_attr` as a non-empty string.
+       There should be no gristHelper_* keys in attributes.
+    2) After filtering:
+       - `document_reference_attr` remains (because it exists in Data Model).
     """
-    # --- 1) Живой Data Model
+    # --- 1) Live Data Model
     anchors_df, a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
     assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
@@ -37,14 +37,14 @@ def test_anchor_attributes_reference_type_column() -> None:
         "document_reference_attr" in dm_attr_names
     ), "Precondition: 'document_reference_attr' must be present in Data Model."
-    # --- 2) Данные из живой Grist
+    # --- 2) Data from live Grist
     nodes_df, _ = next(steps.get_grist_data())
     assert not nodes_df.empty, "No nodes fetched from Grist."
     documents = nodes_df[nodes_df["node_type"] == "document"]
     assert not documents.empty, "Expected at least one 'document' node in raw data."
-    # --- 2.1) Найти document со строковым document_reference_attr
+    # --- 2.1) Find document with string document_reference_attr
     ref_node_attrs: Optional[Dict[str, Any]] = None
     for _, row in documents.iterrows():
         attrs: Dict[str, Any] = row["attributes"] or {}
@@ -58,13 +58,13 @@ def test_anchor_attributes_reference_type_column() -> None:
         in at least one 'document' node (raw data).
         """
-    # --- 2.2) Проверить, что gristHelper_* ключи не протекли в attributes
+    # --- 2.2) Verify that gristHelper_* keys did not leak into attributes
     assert not any(k.startswith("gristHelper_") for k in ref_node_attrs.keys()), """
         gristHelper_* keys leaked into attributes; the SQL provider should have
         used them to reconstruct the final string value and NOT keep helper keys.
         """
-    # --- 3) Проверка фильтрации по Data Model
+    # --- 3) Verify filtering by Data Model
     docs_f = nodes_df[nodes_df["node_type"] == "document"]
     assert not docs_f.empty, "Filtered graph should still contain 'document' nodes."

vedana_etl-0.6.2/tests/integ/test_anchor_link_columns.py ADDED Viewed

@@ -0,0 +1,78 @@
+"""
+Integration test: anchor_link_columns
+Description:
+Reference / Reference List columns in Anchor_* tables should generate edges between
+nodes according to Data Model.
+Data:
+- link document <-> document_chunk is defined by a column in Anchor_document:
+  link_document_has_document_chunk
+- document 'document:1' in test data has references to chunks:
+  document_chunk:01, document_chunk:02, document_chunk:03, document_chunk:05
+Checks:
+1) Find link document -> document_chunk in DM and get its sentence.
+2) Get edges from pipeline (steps.get_grist_data()).
+3) Verify that edges_df contains edges between 'document:1' and the listed chunks
+   with the required edge_label (from DM). Direction doesn't matter.
+"""
+from typing import Set, Tuple
+import pandas as pd
+from dotenv import load_dotenv
+from vedana_etl import steps
+load_dotenv()
+def _unordered(a: str, b: str) -> Tuple[str, str]:
+    return (a, b) if a <= b else (b, a)
+def test_anchor_link_columns() -> None:
+    # 1) Get sentence from Data Model for document <-> document_chunk
+    _anchors_df, _a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
+    assert not links_df.empty, "Data Model Links is empty."
+    dm = links_df.copy()
+    a1 = dm["anchor1"].astype(str).str.lower().str.strip()
+    a2 = dm["anchor2"].astype(str).str.lower().str.strip()
+    row = dm[(a1 == "document") & (a2 == "document_chunk")]
+    assert not row.empty, "No link document -> document_chunk in Data Model."
+    sentence = str(row.iloc[0]["sentence"]).strip()
+    assert sentence, "Empty sentence for document <-> document_chunk in Data Model."
+    # 2) Get edges from pipeline
+    nodes_df, edges_df = next(steps.get_grist_data())
+    assert isinstance(edges_df, pd.DataFrame) and not edges_df.empty, "edges_df is empty."
+    # Filter required edges: document <-> document_chunk, required label
+    ft = edges_df["from_node_type"].astype(str).str.lower().str.strip()
+    tt = edges_df["to_node_type"].astype(str).str.lower().str.strip()
+    lbl = edges_df["edge_label"].astype(str).str.lower().str.strip()
+    target_edges = edges_df[
+        (((ft == "document") & (tt == "document_chunk")) | ((ft == "document_chunk") & (tt == "document")))
+        & (lbl == sentence.lower())
+    ].copy()
+    assert not target_edges.empty, f"No edges document <-> document_chunk with label '{sentence}' found."
+    # Build set of actual undirected pairs
+    actual_pairs: Set[Tuple[str, str]] = set(
+        _unordered(str(r["from_node_id"]).strip(), str(r["to_node_id"]).strip()) for _, r in target_edges.iterrows()
+    )
+    # 3) Expected pairs for 'document:1' from test data
+    expected_pairs: Set[Tuple[str, str]] = {
+        _unordered("document:1", "document_chunk:01"),
+        _unordered("document:1", "document_chunk:02"),
+        _unordered("document:1", "document_chunk:03"),
+        _unordered("document:1", "document_chunk:05"),
+    }
+    missing = sorted(p for p in expected_pairs if p not in actual_pairs)
+    assert not missing, f"Not all links from Anchor_document reference column are in the graph. Missing pairs: {missing}"

vedana_etl-0.6.2/tests/integ/test_duplicate_anchor_id_references.py ADDED Viewed

@@ -0,0 +1,81 @@
+"""
+Integration test: duplicate_anchor_id_references
+Description:
+When duplicates of the same logical node exist (e.g., two rows in
+Anchor_document with id "document:1"), only one node should be included
+in the graph (dedup by node_id), and any Reference/Reference List pointing
+to "duplicates" should be recognized as references to the node that remained in the graph.
+Data:
+- Anchor_document has duplicates "document:1".
+- In Anchor_document_chunk, nodes "document_chunk:02/03/05" reference different
+  "document:1" records (at DP-ID level), but in the graph links should be
+  with the single node node_id == "document:1".
+Checks:
+1) In nodes there exists EXACTLY one node with node_id == "document:1".
+2) Between "document:1" and each of {"document_chunk:02","document_chunk:03","document_chunk:05"}
+there exists at least one edge (direction doesn't matter).
+"""
+from typing import Set
+import pandas as pd
+from dotenv import load_dotenv
+from vedana_etl import steps
+load_dotenv()
+def _has_edge_between(
+    edges_df: pd.DataFrame,
+    a_id: str,
+    a_type: str,
+    b_id: str,
+    b_type: str,
+) -> bool:
+    """
+    Check for edge existence between (a_id, a_type) and (b_id, b_type) in any direction.
+    """
+    if edges_df.empty:
+        return False
+    from_id = edges_df["from_node_id"].astype(str)
+    to_id = edges_df["to_node_id"].astype(str)
+    from_t = edges_df["from_node_type"].astype(str).str.lower()
+    to_t = edges_df["to_node_type"].astype(str).str.lower()
+    mask_ab = (from_id == a_id) & (from_t == a_type.lower()) & (to_id == b_id) & (to_t == b_type.lower())
+    mask_ba = (from_id == b_id) & (from_t == b_type.lower()) & (to_id == a_id) & (to_t == a_type.lower())
+    return bool(edges_df[mask_ab | mask_ba].shape[0])
+def test_duplicate_anchor_id_references() -> None:
+    """
+    Test that when node IDs are duplicated, any Reference to duplicates should be parsed
+    as Reference to the node that ends up in the graph.
+    """
+    # 1) Load raw graph tables
+    nodes_df, edges_df = next(steps.get_grist_data())
+    assert not nodes_df.empty, "No nodes received from Grist."
+    assert isinstance(edges_df, pd.DataFrame), "edges_df should be a DataFrame."
+    # 2) Dedup by "document:1": only one node with this node_id should remain in the graph
+    doc1_rows = nodes_df[nodes_df["node_id"].astype(str) == "document:1"]
+    assert not doc1_rows.empty, "No 'document:1' node in the graph. Check test data."
+    assert (
+        doc1_rows.shape[0] == 1
+    ), f"Expected exactly one 'document: 1' node after deduplication, got {doc1_rows.shape[0]}."
+    assert doc1_rows.iloc[0]["node_type"] == "document", "Node 'document:1' should be of type 'document'."
+    # 3) Inter-node links: document:1 <-> document_chunk:{02,03,05}
+    required_chunks: Set[str] = {"document_chunk:02", "document_chunk:03", "document_chunk:05"}
+    missing = [
+        ch for ch in required_chunks if not _has_edge_between(edges_df, "document:1", "document", ch, "document_chunk")
+    ]
+    assert not missing, "Expected links between 'document:1' and specified chunks, but not found: " + ", ".join(
+        sorted(missing)
+    )

vedana_etl-0.6.2/tests/integ/test_duplicate_anchor_ids.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""
+Integration test: duplicate_anchor_ids
+Description:
+  When node IDs are duplicated, the pipeline should not break.
+  The first described node for a given node_id should be included in the graph.
+Test data:
+  Anchor_document has two objects with the same node_id "document:1".
+  After drop_duplicates by "node_id", the first record is expected to remain
+  (in the test Grist snapshot this is the document with document_name = "doc_a").
+Checks:
+  1) The final nodes_df contains exactly one node with node_id == "document:1".
+  2) Its attributes correspond to the first record (document_name == "doc_a").
+"""
+from typing import Dict
+from dotenv import load_dotenv
+from vedana_etl import steps
+load_dotenv()
+def test_duplicate_anchor_ids_keep_first() -> None:
+    """
+    Verify that when duplicates by node_id exist, the first record is kept.
+    """
+    # 1) Load graph tables from live Grist via standard pipeline step.
+    nodes_df, _ = next(steps.get_grist_data())
+    assert not nodes_df.empty, "No nodes fetched from Grist; check test data and GRIST_* env."
+    # 2) Filter by the problematic identifier from test data.
+    masked = nodes_df[nodes_df["node_id"] == "document:1"]
+    assert not masked.empty, "Expected at least one node with node_id == 'document:1' in test data."
+    # 3) Duplicates should be collapsed: exactly one row remains.
+    assert len(masked) == 1, f"Duplicate node_id 'document: 1' wasn't deduplicated. Found {len(masked)} rows."
+    # 4) Verify that the first record was kept (expected name 'doc_a').
+    attrs: Dict[str, object] = masked.iloc[0]["attributes"] or {}
+    got_name = attrs.get("document_name")
+    assert got_name == "doc_a", f"""
+        Deduplication didn't preserve the first record for 'document:1'.
+        Expected document_name == 'doc_a', got {got_name!r}.
+        """

vedana-etl 0.6.0.dev1__tar.gz → 0.6.2__tar.gz

vedana-etl 0.6.0.dev1__tar.gz → 0.6.2__tar.gz