vedana-etl 0.6.0.dev1__tar.gz → 0.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/PKG-INFO +1 -1
  2. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/catalog.py +1 -1
  3. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/steps.py +2 -1
  4. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/.env.example +1 -1
  5. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/docker-compose.yml +2 -2
  6. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/docker-compose.ci.yml +2 -2
  7. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/conftest.py +5 -5
  8. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attribute_filtering.py +21 -21
  9. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attributes_formula_type_column.py +16 -16
  10. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attributes_reference_type_column.py +17 -17
  11. vedana_etl-0.6.2/tests/integ/test_anchor_link_columns.py +78 -0
  12. vedana_etl-0.6.2/tests/integ/test_duplicate_anchor_id_references.py +81 -0
  13. vedana_etl-0.6.2/tests/integ/test_duplicate_anchor_ids.py +49 -0
  14. vedana_etl-0.6.2/tests/integ/test_duplicate_edges.py +92 -0
  15. vedana_etl-0.6.2/tests/integ/test_edge_attribute_dtype.py +91 -0
  16. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_edge_attribute_filtering.py +29 -29
  17. vedana_etl-0.6.2/tests/integ/test_edge_bidirectional.py +104 -0
  18. vedana_etl-0.6.2/tests/integ/test_edge_node_types.py +86 -0
  19. vedana_etl-0.6.2/tests/integ/test_table_filtering.py +64 -0
  20. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/unit/conftest.py +1 -1
  21. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/unit/test_steps_pure.py +6 -8
  22. vedana_etl-0.6.0.dev1/tests/integ/test_anchor_link_columns.py +0 -78
  23. vedana_etl-0.6.0.dev1/tests/integ/test_duplicate_anchor_id_references.py +0 -81
  24. vedana_etl-0.6.0.dev1/tests/integ/test_duplicate_anchor_ids.py +0 -49
  25. vedana_etl-0.6.0.dev1/tests/integ/test_duplicate_edges.py +0 -92
  26. vedana_etl-0.6.0.dev1/tests/integ/test_edge_attribute_dtype.py +0 -91
  27. vedana_etl-0.6.0.dev1/tests/integ/test_edge_bidirectional.py +0 -104
  28. vedana_etl-0.6.0.dev1/tests/integ/test_edge_node_types.py +0 -86
  29. vedana_etl-0.6.0.dev1/tests/integ/test_table_filtering.py +0 -64
  30. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/.gitignore +0 -0
  31. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/CHANGELOG.md +0 -0
  32. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/README.md +0 -0
  33. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/pyproject.toml +0 -0
  34. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/__init__.py +0 -0
  35. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/app.py +0 -0
  36. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/config.py +0 -0
  37. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/pipeline.py +0 -0
  38. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/py.typed +0 -0
  39. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/schemas.py +0 -0
  40. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/settings.py +0 -0
  41. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/store.py +0 -0
  42. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/__init__.py +0 -0
  43. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/fixtures/grist/Data Model.grist +0 -0
  44. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/fixtures/grist/Data.grist +0 -0
  45. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/persist/grist/docs/qAxQ1gcBKcW7kGYq8ayUp7.grist +0 -0
  46. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/persist/grist/docs/wEEmPY3UiwMDVXv6dr4cFs.grist +0 -0
  47. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/persist/grist/grist-sessions.db +0 -0
  48. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/persist/grist/home.sqlite3 +0 -0
  49. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/__init__.py +0 -0
  50. {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/unit/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: vedana-etl
3
- Version: 0.6.0.dev1
3
+ Version: 0.6.2
4
4
  Summary: Pipeline template for Vedana
5
5
  Author-email: Andrey Tatarinov <a@tatarinov.co>, Timur Sheydaev <tsheyd@epoch8.co>
6
6
  Requires-Python: >=3.12
@@ -224,8 +224,8 @@ rag_anchor_embeddings = Table(
224
224
  name="rag_anchor_embeddings",
225
225
  data_sql_schema=[
226
226
  Column("node_id", String, primary_key=True),
227
+ Column("node_type", String, primary_key=True),
227
228
  Column("attribute_name", String, primary_key=True),
228
- Column("label", String, nullable=False),
229
229
  Column("attribute_value", String),
230
230
  Column("embedding", Vector(dim=core_settings.embeddings_dim), nullable=False),
231
231
  ],
@@ -604,7 +604,8 @@ def generate_embeddings(
604
604
  tasks.append((pos, attr_name, text_val))
605
605
 
606
606
  if not tasks:
607
- return df
607
+ output_columns = pkeys + ["attribute_name", "attribute_value", "embedding"]
608
+ return pd.DataFrame(columns=output_columns)
608
609
 
609
610
  provider = LLMProvider()
610
611
 
@@ -4,7 +4,7 @@ GRIST_API_KEY=""
4
4
  GRIST_DATA_MODEL_DOC_ID="krvDzKM6mbFRokq3m8NFq9"
5
5
  GRIST_DATA_DOC_ID="u2hBF97eDygXmM1sXK9rA4"
6
6
 
7
- # Memgraph (локальный деплоймент)
7
+ # Memgraph (local deployment)
8
8
  MEMGRAPH_URI="bolt://memgraph:7687"
9
9
  MEMGRAPH_USER="neo4j"
10
10
  MEMGRAPH_PWD="modular-current-bonjour-senior-neptune-8618"
@@ -2,7 +2,7 @@ services:
2
2
  grist:
3
3
  image: gristlabs/grist:latest
4
4
  environment:
5
- # (пример из grist-local-testing; подходит только для CI/локала)
5
+ # (example from grist-local-testing; suitable only for CI/local)
6
6
  GRIST_API_KEY: "e30d2f274a538c05fecd14510887f8a3b7eab718"
7
7
  GRIST_DATA_MODEL_DOC_ID: "aAco4qS9Dvf8"
8
8
  GRIST_DATA_DOC_ID: "kB1iVADLPGU5"
@@ -14,7 +14,7 @@ services:
14
14
  volumes:
15
15
  # Where to store persistent data, such as documents.
16
16
  - ${PERSIST_DIR:-./infra/persist}/grist:/persist
17
- # CSV-фикстуры, читаем их скриптом
17
+ # CSV fixtures, read by script
18
18
  - ./fixtures/grist:/seed:ro
19
19
  healthcheck:
20
20
  test: [ "CMD", "bash", "-lc", "wget -qO- http://localhost:8484/api/status || exit 1" ]
@@ -20,7 +20,7 @@ services:
20
20
  grist:
21
21
  image: gristlabs/grist:latest
22
22
  environment:
23
- # (пример из grist-local-testing; подходит только для CI/локала)
23
+ # (example from grist-local-testing; for CI/local dev)
24
24
  GRIST_API_KEY: "e30d2f274a538c05fecd14510887f8a3b7eab718"
25
25
  GRIST_DATA_MODEL_DOC_ID: "aAco4qS9Dvf8"
26
26
  GRIST_DATA_DOC_ID: "kB1iVADLPGU5"
@@ -32,7 +32,7 @@ services:
32
32
  volumes:
33
33
  # Where to store persistent data, such as documents.
34
34
  - ${PERSIST_DIR:-./persist}/grist:/persist
35
- # CSV-фикстуры, читаем их скриптом
35
+ # read CSV-fixtures
36
36
  - ../fixtures/grist:/seed:ro
37
37
  healthcheck:
38
38
  test: ["CMD", "bash", "-lc", "wget -qO- http://localhost:8484/api/status || exit 1"]
@@ -14,9 +14,9 @@ load_dotenv()
14
14
  # -------- live Grist fixtures (NO mocks) --------
15
15
  @pytest.fixture(scope="session")
16
16
  def dm_dfs():
17
- """Data Model из живой Grist: Anchors, Attributes, Links."""
17
+ """Data Model from live Grist: Anchors, Attributes, Links."""
18
18
  anchors_df, a_attrs_df, l_attrs_df, links_df, q_df, p_df, cl_df = next(steps.get_data_model())
19
- # sanity: типы как в коде
19
+ # sanity: types as in code
20
20
  assert a_attrs_df["embeddable"].dtype == bool and l_attrs_df["embeddable"].dtype == bool
21
21
  assert "embed_threshold" in a_attrs_df.columns and "embed_threshold" in l_attrs_df.columns
22
22
  return anchors_df, a_attrs_df, l_attrs_df, links_df, q_df, p_df, cl_df
@@ -24,7 +24,7 @@ def dm_dfs():
24
24
 
25
25
  @pytest.fixture(scope="session")
26
26
  def raw_graph_dfs():
27
- """Сырые nodes/edges из живой Grist."""
27
+ """Raw nodes/edges from live Grist."""
28
28
  nodes, edges = next(steps.get_grist_data())
29
29
  return nodes, edges
30
30
 
@@ -48,12 +48,12 @@ def live_memgraph_available():
48
48
  return _ping_memgraph()
49
49
 
50
50
 
51
- # детерминированный провайдер эмбеддингов, чтобы тесты были воспроизводимы
51
+ # deterministic embeddings provider for reproducible tests
52
52
  @pytest.fixture
53
53
  def dummy_llm(monkeypatch):
54
54
  class DummyProv:
55
55
  def create_embeddings_sync(self, texts):
56
- # фикс-вектор длины 8 (или сколько у тебя EMBEDDINGS_DIM можно и динамически достать)
56
+ # fixed vector of length 8 (or however many EMBEDDINGS_DIM you have - can be fetched dynamically)
57
57
  return [[1.0] + [0.0] * (getattr(core_settings, "embeddings_dim", 8) - 1) for _ in texts]
58
58
 
59
59
  orig = steps.LLMProvider
@@ -1,18 +1,18 @@
1
1
  """
2
- Интеграционный тест: anchor_attribute_filtering
2
+ Integration test: anchor_attribute_filtering
3
3
 
4
- Что проверяем:
5
- - В граф попадают ТОЛЬКО атрибуты, описанные в дата-модели.
6
- - Специально проверяем, что "document_random_attr" (присутствует в тестовых данных)
7
- полностью удаляется у типа "document".
4
+ What we check:
5
+ - ONLY attributes described in Data Model are included in the graph.
6
+ - Specifically verify that "document_random_attr" (present in test data)
7
+ is completely removed from "document" type nodes.
8
8
 
9
- Шаги:
10
- 1) Загружаем Data Model (Anchors/Attributes/Links) из живой Grist.
11
- 2) Загружаем сырые данные из живой Grist.
12
- 3) Проверяем:
13
- - атрибуты (кроме служебного DataModel) подмножество Data Model атрибутов,
14
- допускаем только *_embedding (могут появиться позднее).
15
- - для типа "document" ключ "document_random_attr" отсутствует.
9
+ Steps:
10
+ 1) Load Data Model (Anchors/Attributes/Links) from live Grist.
11
+ 2) Load raw data from live Grist.
12
+ 3) Verify:
13
+ - attributes (except service DataModel) are a subset of Data Model attributes,
14
+ only allowing *_embedding (may appear later).
15
+ - for "document" type, key "document_random_attr" is absent.
16
16
  """
17
17
 
18
18
  from typing import Dict, Set
@@ -26,25 +26,25 @@ load_dotenv()
26
26
 
27
27
  def test_anchor_attribute_filtering_removes_unknown() -> None:
28
28
  """
29
- Основная проверка фильтрации атрибутов узлов.
29
+ Main verification of node attribute filtering.
30
30
  """
31
31
 
32
- # 1) Проверяем Data Model
32
+ # 1) Verify Data Model
33
33
  anchors_df, a_attrs_df, _l_attrs_df, _links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
34
34
  assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
35
35
 
36
- # 2) Проверяем данные из Grist
36
+ # 2) Verify data from Grist
37
37
  nodes_df, _ = next(steps.get_grist_data())
38
38
  assert not nodes_df.empty, "No nodes fetched from Grist."
39
39
 
40
- # 3) Допустимые атрибуты по Data Model
40
+ # 3) Allowed attributes from Data Model
41
41
  allowed_attrs: Set[str] = set(a_attrs_df["attribute_name"].astype(str))
42
42
 
43
- # 3.1) У каждого узла (кроме DataModel) ключи атрибутов Data Model атрибутов (плюс *_embedding)
43
+ # 3.1) For each node (except DataModel) attribute keys should be subset of Data Model attributes (plus *_embedding)
44
44
  for _, row in nodes_df[nodes_df["node_type"] != "DataModel"].iterrows():
45
45
  attr_dict: Dict[str, object] = row["attributes"] or {}
46
46
  keys = set(map(str, attr_dict.keys()))
47
- # разрешаем сгенерированные позже эмбеддинги
47
+ # allow generated embeddings that may appear later
48
48
  embedding_keys = {k for k in keys if k.endswith("_embedding")}
49
49
  unknown = keys - allowed_attrs - embedding_keys
50
50
  assert not unknown, f"""
@@ -52,7 +52,7 @@ def test_anchor_attribute_filtering_removes_unknown() -> None:
52
52
  Data Model: {sorted(unknown)}
53
53
  """
54
54
 
55
- # 3.2) Специальный кейс: document_random_attr должен исчезнуть у document-узлов
55
+ # 3.2) Special case: document_random_attr should be removed from document nodes
56
56
  docs = nodes_df[nodes_df["node_type"] == "document"]
57
57
  assert not docs.empty, "Expected at least one 'document' node in test data."
58
58
  still_has_random = [
@@ -62,7 +62,7 @@ def test_anchor_attribute_filtering_removes_unknown() -> None:
62
62
  not still_has_random
63
63
  ), f"Unexpected attribute 'document_random_attr' is still present in document nodes: {still_has_random}"
64
64
 
65
- # Убедимся, что у document осталось хотя бы одно валидное поле из DM, чтобы тест не проходил
66
- # «пустым» набором атрибутов.
65
+ # Ensure document nodes still have at least one valid field from DM, so the test doesn't pass
66
+ # with an "empty" attribute set.
67
67
  any_valid_left = any(bool(set((row["attributes"] or {}).keys()) & allowed_attrs) for _, row in docs.iterrows())
68
68
  assert any_valid_left, "After filtering, document nodes should still have at least one attribute from Data Model."
@@ -1,13 +1,13 @@
1
1
  """
2
- Интеграционный тест: anchor_attributes_formula_type_column
2
+ Integration test: anchor_attributes_formula_type_column
3
3
 
4
- Цель:
5
- - Колонки Grist с типом данных "Formula" в сырых данных (get_grist_data)
6
- попадают как вычисленные значения (строки/числа и т.п., а не выражения).
7
- - Если такая колонка описана в Data Model, она должна сохраняться
4
+ Goal:
5
+ - Grist columns with "Formula" data type appear in raw data (get_grist_data)
6
+ as computed values (strings/numbers etc., not expressions).
7
+ - If such a column is described in Data Model, it should be preserved.
8
8
 
9
9
  Test data:
10
- - Формульный атрибут: `document_filepath` (для узлов типа "document").
10
+ - Formula attribute: `document_filepath` (for nodes of type "document").
11
11
  """
12
12
 
13
13
  from typing import Any, Dict, Optional
@@ -21,30 +21,30 @@ load_dotenv()
21
21
 
22
22
  def test_anchor_attributes_formula_type_column() -> None:
23
23
  """
24
- Проверяем поведение формульной колонки `document_filepath`:
25
- - в сырых nodes присутствует как результат вычисления (не пустое значение);
26
- - описана в Data Model и не отфильтровывается
24
+ Verify the behavior of formula column `document_filepath`:
25
+ - in raw nodes it appears as a computed value (non-empty);
26
+ - described in Data Model and not filtered out
27
27
  """
28
28
 
29
- # 1) Живой Data Model
29
+ # 1) Live Data Model
30
30
  anchors_df, a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
31
31
  assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
32
32
 
33
33
  dm_attr_names = set(a_attrs_df["attribute_name"].astype(str))
34
34
 
35
- # В этом кейсе ожидаем, что формульный атрибут описан в Data Model.
35
+ # In this case we expect the formula attribute to be described in Data Model.
36
36
  assert (
37
37
  "document_filepath" in dm_attr_names
38
38
  ), "Test precondition failed: 'document_filepath' must be present in Data Model."
39
39
 
40
- # 2) Данные из живой Grist
40
+ # 2) Data from live Grist
41
41
  nodes_df, _ = next(steps.get_grist_data())
42
42
  assert not nodes_df.empty, "No nodes fetched from Grist."
43
43
 
44
44
  documents = nodes_df[nodes_df["node_type"] == "document"]
45
45
  assert not documents.empty, "Expected at least one 'document' node in raw data."
46
46
 
47
- # 3) Найдём хотя бы одно непустое значение document_filepath в raw
47
+ # 3) Find at least one non-empty document_filepath value in raw
48
48
  raw_value: Optional[Any] = None
49
49
  raw_node_id: Optional[str] = None
50
50
  for _, row in documents.iterrows():
@@ -59,11 +59,11 @@ def test_anchor_attributes_formula_type_column() -> None:
59
59
  in at least one 'document' node (raw data).
60
60
  """
61
61
 
62
- # 4) После фильтрации по Data Model атрибут должен сохраниться
62
+ # 4) After filtering by Data Model the attribute should be preserved
63
63
  docs_f = nodes_df[nodes_df["node_type"] == "document"]
64
64
  assert not docs_f.empty, "Filtered graph should still contain 'document' nodes."
65
65
 
66
- # Проверим, что у того же узла (если он остался) поле всё ещё есть и непустое
66
+ # Verify that the same node (if it remained) still has the field and it's non-empty
67
67
  if raw_node_id is not None and (docs_f["node_id"] == raw_node_id).any():
68
68
  row = docs_f.loc[docs_f["node_id"] == raw_node_id].iloc[0]
69
69
  attrs_f: Dict[str, Any] = row["attributes"] or {}
@@ -71,7 +71,7 @@ def test_anchor_attributes_formula_type_column() -> None:
71
71
  "document_filepath" in attrs_f and str(attrs_f["document_filepath"]).strip()
72
72
  ), "Expected 'document_filepath' to be preserved by filtering logic because it is present in Data Model."
73
73
  else:
74
- # Иначе просто убедимся, что у какого-то document-узла поле присутствует
74
+ # Otherwise just verify that some document node has the field
75
75
  found_any = False
76
76
  for _, row in docs_f.iterrows():
77
77
  attrs_f = row["attributes"] or {}
@@ -1,13 +1,13 @@
1
1
  """
2
- Интеграционный тест: anchor_attributes_reference_type_column (SQL Reference)
2
+ Integration test: anchor_attributes_reference_type_column (SQL Reference)
3
3
 
4
- Фокус теста:
5
- - При чтении через Grist SQL провайдер Reference-колонка приходит как <ref_id> + gristHelper_<col>,
6
- а в итоговом attributes должно быть **строковое значение** (как в UI), без gristHelper_* ключей.
7
- - Если Reference-колонка описана в Data Model, она должна сохраниться при фильтрации.
4
+ Test:
5
+ - When reading via Grist SQL provider, a Reference column comes as <ref_id> + gristHelper_<col>,
6
+ but the final attributes should contain **string value** (as in UI), without gristHelper_* keys.
7
+ - If a Reference column is described in Data Model, it should be preserved during filtering.
8
8
 
9
- Тестовые данные:
10
- - reference-поле: document_reference_attr (есть в Data Model)
9
+ Test data:
10
+ - reference field: document_reference_attr (exists in Data Model)
11
11
  """
12
12
 
13
13
  from typing import Any, Dict, Optional
@@ -21,14 +21,14 @@ load_dotenv()
21
21
 
22
22
  def test_anchor_attributes_reference_type_column() -> None:
23
23
  """
24
- 1) В сыром nodes (get_grist_data) у хотя бы одного document-узла
25
- `document_reference_attr` присутствует как непустая строка.
26
- В attributes не должно быть gristHelper_* ключей.
27
- 2) После фильтрации:
28
- - `document_reference_attr` остаётся (т.к. она есть в Data Model).
24
+ 1) In raw nodes (get_grist_data) at least one document node has
25
+ `document_reference_attr` as a non-empty string.
26
+ There should be no gristHelper_* keys in attributes.
27
+ 2) After filtering:
28
+ - `document_reference_attr` remains (because it exists in Data Model).
29
29
  """
30
30
 
31
- # --- 1) Живой Data Model
31
+ # --- 1) Live Data Model
32
32
  anchors_df, a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
33
33
  assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
34
34
 
@@ -37,14 +37,14 @@ def test_anchor_attributes_reference_type_column() -> None:
37
37
  "document_reference_attr" in dm_attr_names
38
38
  ), "Precondition: 'document_reference_attr' must be present in Data Model."
39
39
 
40
- # --- 2) Данные из живой Grist
40
+ # --- 2) Data from live Grist
41
41
  nodes_df, _ = next(steps.get_grist_data())
42
42
  assert not nodes_df.empty, "No nodes fetched from Grist."
43
43
 
44
44
  documents = nodes_df[nodes_df["node_type"] == "document"]
45
45
  assert not documents.empty, "Expected at least one 'document' node in raw data."
46
46
 
47
- # --- 2.1) Найти document со строковым document_reference_attr
47
+ # --- 2.1) Find document with string document_reference_attr
48
48
  ref_node_attrs: Optional[Dict[str, Any]] = None
49
49
  for _, row in documents.iterrows():
50
50
  attrs: Dict[str, Any] = row["attributes"] or {}
@@ -58,13 +58,13 @@ def test_anchor_attributes_reference_type_column() -> None:
58
58
  in at least one 'document' node (raw data).
59
59
  """
60
60
 
61
- # --- 2.2) Проверить, что gristHelper_* ключи не протекли в attributes
61
+ # --- 2.2) Verify that gristHelper_* keys did not leak into attributes
62
62
  assert not any(k.startswith("gristHelper_") for k in ref_node_attrs.keys()), """
63
63
  gristHelper_* keys leaked into attributes; the SQL provider should have
64
64
  used them to reconstruct the final string value and NOT keep helper keys.
65
65
  """
66
66
 
67
- # --- 3) Проверка фильтрации по Data Model
67
+ # --- 3) Verify filtering by Data Model
68
68
  docs_f = nodes_df[nodes_df["node_type"] == "document"]
69
69
  assert not docs_f.empty, "Filtered graph should still contain 'document' nodes."
70
70
 
@@ -0,0 +1,78 @@
1
+ """
2
+ Integration test: anchor_link_columns
3
+
4
+ Description:
5
+ Reference / Reference List columns in Anchor_* tables should generate edges between
6
+ nodes according to Data Model.
7
+
8
+ Data:
9
+ - link document <-> document_chunk is defined by a column in Anchor_document:
10
+ link_document_has_document_chunk
11
+ - document 'document:1' in test data has references to chunks:
12
+ document_chunk:01, document_chunk:02, document_chunk:03, document_chunk:05
13
+
14
+ Checks:
15
+ 1) Find link document -> document_chunk in DM and get its sentence.
16
+ 2) Get edges from pipeline (steps.get_grist_data()).
17
+ 3) Verify that edges_df contains edges between 'document:1' and the listed chunks
18
+ with the required edge_label (from DM). Direction doesn't matter.
19
+ """
20
+
21
+ from typing import Set, Tuple
22
+
23
+ import pandas as pd
24
+ from dotenv import load_dotenv
25
+
26
+ from vedana_etl import steps
27
+
28
+ load_dotenv()
29
+
30
+
31
+ def _unordered(a: str, b: str) -> Tuple[str, str]:
32
+ return (a, b) if a <= b else (b, a)
33
+
34
+
35
+ def test_anchor_link_columns() -> None:
36
+ # 1) Get sentence from Data Model for document <-> document_chunk
37
+ _anchors_df, _a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
38
+ assert not links_df.empty, "Data Model Links is empty."
39
+
40
+ dm = links_df.copy()
41
+ a1 = dm["anchor1"].astype(str).str.lower().str.strip()
42
+ a2 = dm["anchor2"].astype(str).str.lower().str.strip()
43
+ row = dm[(a1 == "document") & (a2 == "document_chunk")]
44
+ assert not row.empty, "No link document -> document_chunk in Data Model."
45
+ sentence = str(row.iloc[0]["sentence"]).strip()
46
+ assert sentence, "Empty sentence for document <-> document_chunk in Data Model."
47
+
48
+ # 2) Get edges from pipeline
49
+ nodes_df, edges_df = next(steps.get_grist_data())
50
+ assert isinstance(edges_df, pd.DataFrame) and not edges_df.empty, "edges_df is empty."
51
+
52
+ # Filter required edges: document <-> document_chunk, required label
53
+ ft = edges_df["from_node_type"].astype(str).str.lower().str.strip()
54
+ tt = edges_df["to_node_type"].astype(str).str.lower().str.strip()
55
+ lbl = edges_df["edge_label"].astype(str).str.lower().str.strip()
56
+
57
+ target_edges = edges_df[
58
+ (((ft == "document") & (tt == "document_chunk")) | ((ft == "document_chunk") & (tt == "document")))
59
+ & (lbl == sentence.lower())
60
+ ].copy()
61
+
62
+ assert not target_edges.empty, f"No edges document <-> document_chunk with label '{sentence}' found."
63
+
64
+ # Build set of actual undirected pairs
65
+ actual_pairs: Set[Tuple[str, str]] = set(
66
+ _unordered(str(r["from_node_id"]).strip(), str(r["to_node_id"]).strip()) for _, r in target_edges.iterrows()
67
+ )
68
+
69
+ # 3) Expected pairs for 'document:1' from test data
70
+ expected_pairs: Set[Tuple[str, str]] = {
71
+ _unordered("document:1", "document_chunk:01"),
72
+ _unordered("document:1", "document_chunk:02"),
73
+ _unordered("document:1", "document_chunk:03"),
74
+ _unordered("document:1", "document_chunk:05"),
75
+ }
76
+
77
+ missing = sorted(p for p in expected_pairs if p not in actual_pairs)
78
+ assert not missing, f"Not all links from Anchor_document reference column are in the graph. Missing pairs: {missing}"
@@ -0,0 +1,81 @@
1
+ """
2
+ Integration test: duplicate_anchor_id_references
3
+
4
+ Description:
5
+ When duplicates of the same logical node exist (e.g., two rows in
6
+ Anchor_document with id "document:1"), only one node should be included
7
+ in the graph (dedup by node_id), and any Reference/Reference List pointing
8
+ to "duplicates" should be recognized as references to the node that remained in the graph.
9
+
10
+ Data:
11
+ - Anchor_document has duplicates "document:1".
12
+ - In Anchor_document_chunk, nodes "document_chunk:02/03/05" reference different
13
+ "document:1" records (at DP-ID level), but in the graph links should be
14
+ with the single node node_id == "document:1".
15
+
16
+ Checks:
17
+ 1) In nodes there exists EXACTLY one node with node_id == "document:1".
18
+ 2) Between "document:1" and each of {"document_chunk:02","document_chunk:03","document_chunk:05"}
19
+ there exists at least one edge (direction doesn't matter).
20
+ """
21
+
22
+ from typing import Set
23
+
24
+ import pandas as pd
25
+ from dotenv import load_dotenv
26
+
27
+ from vedana_etl import steps
28
+
29
+ load_dotenv()
30
+
31
+
32
+ def _has_edge_between(
33
+ edges_df: pd.DataFrame,
34
+ a_id: str,
35
+ a_type: str,
36
+ b_id: str,
37
+ b_type: str,
38
+ ) -> bool:
39
+ """
40
+ Check for edge existence between (a_id, a_type) and (b_id, b_type) in any direction.
41
+ """
42
+
43
+ if edges_df.empty:
44
+ return False
45
+ from_id = edges_df["from_node_id"].astype(str)
46
+ to_id = edges_df["to_node_id"].astype(str)
47
+ from_t = edges_df["from_node_type"].astype(str).str.lower()
48
+ to_t = edges_df["to_node_type"].astype(str).str.lower()
49
+
50
+ mask_ab = (from_id == a_id) & (from_t == a_type.lower()) & (to_id == b_id) & (to_t == b_type.lower())
51
+ mask_ba = (from_id == b_id) & (from_t == b_type.lower()) & (to_id == a_id) & (to_t == a_type.lower())
52
+ return bool(edges_df[mask_ab | mask_ba].shape[0])
53
+
54
+
55
+ def test_duplicate_anchor_id_references() -> None:
56
+ """
57
+ Test that when node IDs are duplicated, any Reference to duplicates should be parsed
58
+ as Reference to the node that ends up in the graph.
59
+ """
60
+
61
+ # 1) Load raw graph tables
62
+ nodes_df, edges_df = next(steps.get_grist_data())
63
+ assert not nodes_df.empty, "No nodes received from Grist."
64
+ assert isinstance(edges_df, pd.DataFrame), "edges_df should be a DataFrame."
65
+
66
+ # 2) Dedup by "document:1": only one node with this node_id should remain in the graph
67
+ doc1_rows = nodes_df[nodes_df["node_id"].astype(str) == "document:1"]
68
+ assert not doc1_rows.empty, "No 'document:1' node in the graph. Check test data."
69
+ assert (
70
+ doc1_rows.shape[0] == 1
71
+ ), f"Expected exactly one 'document: 1' node after deduplication, got {doc1_rows.shape[0]}."
72
+ assert doc1_rows.iloc[0]["node_type"] == "document", "Node 'document:1' should be of type 'document'."
73
+
74
+ # 3) Inter-node links: document:1 <-> document_chunk:{02,03,05}
75
+ required_chunks: Set[str] = {"document_chunk:02", "document_chunk:03", "document_chunk:05"}
76
+ missing = [
77
+ ch for ch in required_chunks if not _has_edge_between(edges_df, "document:1", "document", ch, "document_chunk")
78
+ ]
79
+ assert not missing, "Expected links between 'document:1' and specified chunks, but not found: " + ", ".join(
80
+ sorted(missing)
81
+ )
@@ -0,0 +1,49 @@
1
+ """
2
+ Integration test: duplicate_anchor_ids
3
+
4
+ Description:
5
+ When node IDs are duplicated, the pipeline should not break.
6
+ The first described node for a given node_id should be included in the graph.
7
+
8
+ Test data:
9
+ Anchor_document has two objects with the same node_id "document:1".
10
+ After drop_duplicates by "node_id", the first record is expected to remain
11
+ (in the test Grist snapshot this is the document with document_name = "doc_a").
12
+
13
+ Checks:
14
+ 1) The final nodes_df contains exactly one node with node_id == "document:1".
15
+ 2) Its attributes correspond to the first record (document_name == "doc_a").
16
+ """
17
+
18
+ from typing import Dict
19
+
20
+ from dotenv import load_dotenv
21
+
22
+ from vedana_etl import steps
23
+
24
+ load_dotenv()
25
+
26
+
27
+ def test_duplicate_anchor_ids_keep_first() -> None:
28
+ """
29
+ Verify that when duplicates by node_id exist, the first record is kept.
30
+ """
31
+
32
+ # 1) Load graph tables from live Grist via standard pipeline step.
33
+ nodes_df, _ = next(steps.get_grist_data())
34
+ assert not nodes_df.empty, "No nodes fetched from Grist; check test data and GRIST_* env."
35
+
36
+ # 2) Filter by the problematic identifier from test data.
37
+ masked = nodes_df[nodes_df["node_id"] == "document:1"]
38
+ assert not masked.empty, "Expected at least one node with node_id == 'document:1' in test data."
39
+
40
+ # 3) Duplicates should be collapsed: exactly one row remains.
41
+ assert len(masked) == 1, f"Duplicate node_id 'document: 1' wasn't deduplicated. Found {len(masked)} rows."
42
+
43
+ # 4) Verify that the first record was kept (expected name 'doc_a').
44
+ attrs: Dict[str, object] = masked.iloc[0]["attributes"] or {}
45
+ got_name = attrs.get("document_name")
46
+ assert got_name == "doc_a", f"""
47
+ Deduplication didn't preserve the first record for 'document:1'.
48
+ Expected document_name == 'doc_a', got {got_name!r}.
49
+ """