vedana-etl 0.6.0.dev1__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/PKG-INFO +1 -1
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/catalog.py +1 -1
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/steps.py +2 -1
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/.env.example +1 -1
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/docker-compose.yml +2 -2
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/docker-compose.ci.yml +2 -2
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/conftest.py +5 -5
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attribute_filtering.py +21 -21
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attributes_formula_type_column.py +16 -16
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attributes_reference_type_column.py +17 -17
- vedana_etl-0.6.2/tests/integ/test_anchor_link_columns.py +78 -0
- vedana_etl-0.6.2/tests/integ/test_duplicate_anchor_id_references.py +81 -0
- vedana_etl-0.6.2/tests/integ/test_duplicate_anchor_ids.py +49 -0
- vedana_etl-0.6.2/tests/integ/test_duplicate_edges.py +92 -0
- vedana_etl-0.6.2/tests/integ/test_edge_attribute_dtype.py +91 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_edge_attribute_filtering.py +29 -29
- vedana_etl-0.6.2/tests/integ/test_edge_bidirectional.py +104 -0
- vedana_etl-0.6.2/tests/integ/test_edge_node_types.py +86 -0
- vedana_etl-0.6.2/tests/integ/test_table_filtering.py +64 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/unit/conftest.py +1 -1
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/unit/test_steps_pure.py +6 -8
- vedana_etl-0.6.0.dev1/tests/integ/test_anchor_link_columns.py +0 -78
- vedana_etl-0.6.0.dev1/tests/integ/test_duplicate_anchor_id_references.py +0 -81
- vedana_etl-0.6.0.dev1/tests/integ/test_duplicate_anchor_ids.py +0 -49
- vedana_etl-0.6.0.dev1/tests/integ/test_duplicate_edges.py +0 -92
- vedana_etl-0.6.0.dev1/tests/integ/test_edge_attribute_dtype.py +0 -91
- vedana_etl-0.6.0.dev1/tests/integ/test_edge_bidirectional.py +0 -104
- vedana_etl-0.6.0.dev1/tests/integ/test_edge_node_types.py +0 -86
- vedana_etl-0.6.0.dev1/tests/integ/test_table_filtering.py +0 -64
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/.gitignore +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/CHANGELOG.md +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/README.md +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/pyproject.toml +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/__init__.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/app.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/config.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/pipeline.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/py.typed +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/schemas.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/settings.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/src/vedana_etl/store.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/__init__.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/fixtures/grist/Data Model.grist +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/fixtures/grist/Data.grist +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/persist/grist/docs/qAxQ1gcBKcW7kGYq8ayUp7.grist +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/persist/grist/docs/wEEmPY3UiwMDVXv6dr4cFs.grist +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/persist/grist/grist-sessions.db +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/infra/persist/grist/home.sqlite3 +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/__init__.py +0 -0
- {vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/unit/__init__.py +0 -0
|
@@ -224,8 +224,8 @@ rag_anchor_embeddings = Table(
|
|
|
224
224
|
name="rag_anchor_embeddings",
|
|
225
225
|
data_sql_schema=[
|
|
226
226
|
Column("node_id", String, primary_key=True),
|
|
227
|
+
Column("node_type", String, primary_key=True),
|
|
227
228
|
Column("attribute_name", String, primary_key=True),
|
|
228
|
-
Column("label", String, nullable=False),
|
|
229
229
|
Column("attribute_value", String),
|
|
230
230
|
Column("embedding", Vector(dim=core_settings.embeddings_dim), nullable=False),
|
|
231
231
|
],
|
|
@@ -604,7 +604,8 @@ def generate_embeddings(
|
|
|
604
604
|
tasks.append((pos, attr_name, text_val))
|
|
605
605
|
|
|
606
606
|
if not tasks:
|
|
607
|
-
|
|
607
|
+
output_columns = pkeys + ["attribute_name", "attribute_value", "embedding"]
|
|
608
|
+
return pd.DataFrame(columns=output_columns)
|
|
608
609
|
|
|
609
610
|
provider = LLMProvider()
|
|
610
611
|
|
|
@@ -4,7 +4,7 @@ GRIST_API_KEY=""
|
|
|
4
4
|
GRIST_DATA_MODEL_DOC_ID="krvDzKM6mbFRokq3m8NFq9"
|
|
5
5
|
GRIST_DATA_DOC_ID="u2hBF97eDygXmM1sXK9rA4"
|
|
6
6
|
|
|
7
|
-
# Memgraph (
|
|
7
|
+
# Memgraph (local deployment)
|
|
8
8
|
MEMGRAPH_URI="bolt://memgraph:7687"
|
|
9
9
|
MEMGRAPH_USER="neo4j"
|
|
10
10
|
MEMGRAPH_PWD="modular-current-bonjour-senior-neptune-8618"
|
|
@@ -2,7 +2,7 @@ services:
|
|
|
2
2
|
grist:
|
|
3
3
|
image: gristlabs/grist:latest
|
|
4
4
|
environment:
|
|
5
|
-
# (
|
|
5
|
+
# (example from grist-local-testing; suitable only for CI/local)
|
|
6
6
|
GRIST_API_KEY: "e30d2f274a538c05fecd14510887f8a3b7eab718"
|
|
7
7
|
GRIST_DATA_MODEL_DOC_ID: "aAco4qS9Dvf8"
|
|
8
8
|
GRIST_DATA_DOC_ID: "kB1iVADLPGU5"
|
|
@@ -14,7 +14,7 @@ services:
|
|
|
14
14
|
volumes:
|
|
15
15
|
# Where to store persistent data, such as documents.
|
|
16
16
|
- ${PERSIST_DIR:-./infra/persist}/grist:/persist
|
|
17
|
-
# CSV
|
|
17
|
+
# CSV fixtures, read by script
|
|
18
18
|
- ./fixtures/grist:/seed:ro
|
|
19
19
|
healthcheck:
|
|
20
20
|
test: [ "CMD", "bash", "-lc", "wget -qO- http://localhost:8484/api/status || exit 1" ]
|
|
@@ -20,7 +20,7 @@ services:
|
|
|
20
20
|
grist:
|
|
21
21
|
image: gristlabs/grist:latest
|
|
22
22
|
environment:
|
|
23
|
-
# (
|
|
23
|
+
# (example from grist-local-testing; for CI/local dev)
|
|
24
24
|
GRIST_API_KEY: "e30d2f274a538c05fecd14510887f8a3b7eab718"
|
|
25
25
|
GRIST_DATA_MODEL_DOC_ID: "aAco4qS9Dvf8"
|
|
26
26
|
GRIST_DATA_DOC_ID: "kB1iVADLPGU5"
|
|
@@ -32,7 +32,7 @@ services:
|
|
|
32
32
|
volumes:
|
|
33
33
|
# Where to store persistent data, such as documents.
|
|
34
34
|
- ${PERSIST_DIR:-./persist}/grist:/persist
|
|
35
|
-
# CSV
|
|
35
|
+
# CSV fixtures (mounted read-only at /seed)
|
|
36
36
|
- ../fixtures/grist:/seed:ro
|
|
37
37
|
healthcheck:
|
|
38
38
|
test: ["CMD", "bash", "-lc", "wget -qO- http://localhost:8484/api/status || exit 1"]
|
|
@@ -14,9 +14,9 @@ load_dotenv()
|
|
|
14
14
|
# -------- live Grist fixtures (NO mocks) --------
|
|
15
15
|
@pytest.fixture(scope="session")
|
|
16
16
|
def dm_dfs():
|
|
17
|
-
"""Data Model
|
|
17
|
+
"""Data Model from live Grist: Anchors, Attributes, Links."""
|
|
18
18
|
anchors_df, a_attrs_df, l_attrs_df, links_df, q_df, p_df, cl_df = next(steps.get_data_model())
|
|
19
|
-
# sanity:
|
|
19
|
+
# sanity: "embeddable" is bool and "embed_threshold" is present, as the pipeline code expects
|
|
20
20
|
assert a_attrs_df["embeddable"].dtype == bool and l_attrs_df["embeddable"].dtype == bool
|
|
21
21
|
assert "embed_threshold" in a_attrs_df.columns and "embed_threshold" in l_attrs_df.columns
|
|
22
22
|
return anchors_df, a_attrs_df, l_attrs_df, links_df, q_df, p_df, cl_df
|
|
@@ -24,7 +24,7 @@ def dm_dfs():
|
|
|
24
24
|
|
|
25
25
|
@pytest.fixture(scope="session")
|
|
26
26
|
def raw_graph_dfs():
|
|
27
|
-
"""
|
|
27
|
+
"""Raw nodes/edges from live Grist."""
|
|
28
28
|
nodes, edges = next(steps.get_grist_data())
|
|
29
29
|
return nodes, edges
|
|
30
30
|
|
|
@@ -48,12 +48,12 @@ def live_memgraph_available():
|
|
|
48
48
|
return _ping_memgraph()
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
#
|
|
51
|
+
# deterministic embeddings provider for reproducible tests
|
|
52
52
|
@pytest.fixture
|
|
53
53
|
def dummy_llm(monkeypatch):
|
|
54
54
|
class DummyProv:
|
|
55
55
|
def create_embeddings_sync(self, texts):
|
|
56
|
-
#
|
|
56
|
+
# fixed one-hot vector whose length is core_settings.embeddings_dim (falls back to 8 if the setting is absent)
|
|
57
57
|
return [[1.0] + [0.0] * (getattr(core_settings, "embeddings_dim", 8) - 1) for _ in texts]
|
|
58
58
|
|
|
59
59
|
orig = steps.LLMProvider
|
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Integration test: anchor_attribute_filtering
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
-
|
|
6
|
-
-
|
|
7
|
-
|
|
4
|
+
What we check:
|
|
5
|
+
- ONLY attributes described in Data Model are included in the graph.
|
|
6
|
+
- Specifically verify that "document_random_attr" (present in test data)
|
|
7
|
+
is completely removed from "document" type nodes.
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
1)
|
|
11
|
-
2)
|
|
12
|
-
3)
|
|
13
|
-
-
|
|
14
|
-
|
|
15
|
-
-
|
|
9
|
+
Steps:
|
|
10
|
+
1) Load Data Model (Anchors/Attributes/Links) from live Grist.
|
|
11
|
+
2) Load raw data from live Grist.
|
|
12
|
+
3) Verify:
|
|
13
|
+
- attributes (except service DataModel) are a subset of Data Model attributes,
|
|
14
|
+
with the sole exception of *_embedding keys (which may appear later).
|
|
15
|
+
- for "document" type, key "document_random_attr" is absent.
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
18
|
from typing import Dict, Set
|
|
@@ -26,25 +26,25 @@ load_dotenv()
|
|
|
26
26
|
|
|
27
27
|
def test_anchor_attribute_filtering_removes_unknown() -> None:
|
|
28
28
|
"""
|
|
29
|
-
|
|
29
|
+
Main verification of node attribute filtering.
|
|
30
30
|
"""
|
|
31
31
|
|
|
32
|
-
# 1)
|
|
32
|
+
# 1) Verify Data Model
|
|
33
33
|
anchors_df, a_attrs_df, _l_attrs_df, _links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
|
|
34
34
|
assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
|
|
35
35
|
|
|
36
|
-
# 2)
|
|
36
|
+
# 2) Verify data from Grist
|
|
37
37
|
nodes_df, _ = next(steps.get_grist_data())
|
|
38
38
|
assert not nodes_df.empty, "No nodes fetched from Grist."
|
|
39
39
|
|
|
40
|
-
# 3)
|
|
40
|
+
# 3) Allowed attributes from Data Model
|
|
41
41
|
allowed_attrs: Set[str] = set(a_attrs_df["attribute_name"].astype(str))
|
|
42
42
|
|
|
43
|
-
# 3.1)
|
|
43
|
+
# 3.1) For each node (except DataModel), attribute keys should be a subset of Data Model attributes (plus *_embedding)
|
|
44
44
|
for _, row in nodes_df[nodes_df["node_type"] != "DataModel"].iterrows():
|
|
45
45
|
attr_dict: Dict[str, object] = row["attributes"] or {}
|
|
46
46
|
keys = set(map(str, attr_dict.keys()))
|
|
47
|
-
#
|
|
47
|
+
# allow generated embeddings that may appear later
|
|
48
48
|
embedding_keys = {k for k in keys if k.endswith("_embedding")}
|
|
49
49
|
unknown = keys - allowed_attrs - embedding_keys
|
|
50
50
|
assert not unknown, f"""
|
|
@@ -52,7 +52,7 @@ def test_anchor_attribute_filtering_removes_unknown() -> None:
|
|
|
52
52
|
Data Model: {sorted(unknown)}
|
|
53
53
|
"""
|
|
54
54
|
|
|
55
|
-
# 3.2)
|
|
55
|
+
# 3.2) Special case: document_random_attr should be removed from document nodes
|
|
56
56
|
docs = nodes_df[nodes_df["node_type"] == "document"]
|
|
57
57
|
assert not docs.empty, "Expected at least one 'document' node in test data."
|
|
58
58
|
still_has_random = [
|
|
@@ -62,7 +62,7 @@ def test_anchor_attribute_filtering_removes_unknown() -> None:
|
|
|
62
62
|
not still_has_random
|
|
63
63
|
), f"Unexpected attribute 'document_random_attr' is still present in document nodes: {still_has_random}"
|
|
64
64
|
|
|
65
|
-
#
|
|
66
|
-
#
|
|
65
|
+
# Ensure document nodes still have at least one valid field from DM, so the test doesn't pass
|
|
66
|
+
# with an "empty" attribute set.
|
|
67
67
|
any_valid_left = any(bool(set((row["attributes"] or {}).keys()) & allowed_attrs) for _, row in docs.iterrows())
|
|
68
68
|
assert any_valid_left, "After filtering, document nodes should still have at least one attribute from Data Model."
|
{vedana_etl-0.6.0.dev1 → vedana_etl-0.6.2}/tests/integ/test_anchor_attributes_formula_type_column.py
RENAMED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Integration test: anchor_attributes_formula_type_column
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
-
|
|
6
|
-
|
|
7
|
-
-
|
|
4
|
+
Goal:
|
|
5
|
+
- Grist columns with "Formula" data type appear in raw data (get_grist_data)
|
|
6
|
+
as computed values (strings/numbers etc., not expressions).
|
|
7
|
+
- If such a column is described in Data Model, it should be preserved.
|
|
8
8
|
|
|
9
9
|
Test data:
|
|
10
|
-
-
|
|
10
|
+
- Formula attribute: `document_filepath` (for nodes of type "document").
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
from typing import Any, Dict, Optional
|
|
@@ -21,30 +21,30 @@ load_dotenv()
|
|
|
21
21
|
|
|
22
22
|
def test_anchor_attributes_formula_type_column() -> None:
|
|
23
23
|
"""
|
|
24
|
-
|
|
25
|
-
-
|
|
26
|
-
-
|
|
24
|
+
Verify the behavior of formula column `document_filepath`:
|
|
25
|
+
- in raw nodes it appears as a computed value (non-empty);
|
|
26
|
+
- described in Data Model and not filtered out
|
|
27
27
|
"""
|
|
28
28
|
|
|
29
|
-
# 1)
|
|
29
|
+
# 1) Live Data Model
|
|
30
30
|
anchors_df, a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
|
|
31
31
|
assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
|
|
32
32
|
|
|
33
33
|
dm_attr_names = set(a_attrs_df["attribute_name"].astype(str))
|
|
34
34
|
|
|
35
|
-
#
|
|
35
|
+
# In this case we expect the formula attribute to be described in Data Model.
|
|
36
36
|
assert (
|
|
37
37
|
"document_filepath" in dm_attr_names
|
|
38
38
|
), "Test precondition failed: 'document_filepath' must be present in Data Model."
|
|
39
39
|
|
|
40
|
-
# 2)
|
|
40
|
+
# 2) Data from live Grist
|
|
41
41
|
nodes_df, _ = next(steps.get_grist_data())
|
|
42
42
|
assert not nodes_df.empty, "No nodes fetched from Grist."
|
|
43
43
|
|
|
44
44
|
documents = nodes_df[nodes_df["node_type"] == "document"]
|
|
45
45
|
assert not documents.empty, "Expected at least one 'document' node in raw data."
|
|
46
46
|
|
|
47
|
-
# 3)
|
|
47
|
+
# 3) Find at least one non-empty document_filepath value in raw
|
|
48
48
|
raw_value: Optional[Any] = None
|
|
49
49
|
raw_node_id: Optional[str] = None
|
|
50
50
|
for _, row in documents.iterrows():
|
|
@@ -59,11 +59,11 @@ def test_anchor_attributes_formula_type_column() -> None:
|
|
|
59
59
|
in at least one 'document' node (raw data).
|
|
60
60
|
"""
|
|
61
61
|
|
|
62
|
-
# 4)
|
|
62
|
+
# 4) After filtering by Data Model the attribute should be preserved
|
|
63
63
|
docs_f = nodes_df[nodes_df["node_type"] == "document"]
|
|
64
64
|
assert not docs_f.empty, "Filtered graph should still contain 'document' nodes."
|
|
65
65
|
|
|
66
|
-
#
|
|
66
|
+
# Verify that the same node (if it remained) still has the field and it's non-empty
|
|
67
67
|
if raw_node_id is not None and (docs_f["node_id"] == raw_node_id).any():
|
|
68
68
|
row = docs_f.loc[docs_f["node_id"] == raw_node_id].iloc[0]
|
|
69
69
|
attrs_f: Dict[str, Any] = row["attributes"] or {}
|
|
@@ -71,7 +71,7 @@ def test_anchor_attributes_formula_type_column() -> None:
|
|
|
71
71
|
"document_filepath" in attrs_f and str(attrs_f["document_filepath"]).strip()
|
|
72
72
|
), "Expected 'document_filepath' to be preserved by filtering logic because it is present in Data Model."
|
|
73
73
|
else:
|
|
74
|
-
#
|
|
74
|
+
# Otherwise just verify that some document node has the field
|
|
75
75
|
found_any = False
|
|
76
76
|
for _, row in docs_f.iterrows():
|
|
77
77
|
attrs_f = row["attributes"] or {}
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Integration test: anchor_attributes_reference_type_column (SQL Reference)
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
-
|
|
6
|
-
|
|
7
|
-
-
|
|
4
|
+
Test:
|
|
5
|
+
- When reading via Grist SQL provider, a Reference column comes as <ref_id> + gristHelper_<col>,
|
|
6
|
+
but the final attributes should contain **string value** (as in UI), without gristHelper_* keys.
|
|
7
|
+
- If a Reference column is described in Data Model, it should be preserved during filtering.
|
|
8
8
|
|
|
9
|
-
|
|
10
|
-
- reference
|
|
9
|
+
Test data:
|
|
10
|
+
- reference field: document_reference_attr (exists in Data Model)
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
13
|
from typing import Any, Dict, Optional
|
|
@@ -21,14 +21,14 @@ load_dotenv()
|
|
|
21
21
|
|
|
22
22
|
def test_anchor_attributes_reference_type_column() -> None:
|
|
23
23
|
"""
|
|
24
|
-
1)
|
|
25
|
-
`document_reference_attr`
|
|
26
|
-
|
|
27
|
-
2)
|
|
28
|
-
- `document_reference_attr`
|
|
24
|
+
1) In raw nodes (get_grist_data) at least one document node has
|
|
25
|
+
`document_reference_attr` as a non-empty string.
|
|
26
|
+
There should be no gristHelper_* keys in attributes.
|
|
27
|
+
2) After filtering:
|
|
28
|
+
- `document_reference_attr` remains (because it exists in Data Model).
|
|
29
29
|
"""
|
|
30
30
|
|
|
31
|
-
# --- 1)
|
|
31
|
+
# --- 1) Live Data Model
|
|
32
32
|
anchors_df, a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
|
|
33
33
|
assert not anchors_df.empty and not a_attrs_df.empty, "Data Model must not be empty (Anchors)."
|
|
34
34
|
|
|
@@ -37,14 +37,14 @@ def test_anchor_attributes_reference_type_column() -> None:
|
|
|
37
37
|
"document_reference_attr" in dm_attr_names
|
|
38
38
|
), "Precondition: 'document_reference_attr' must be present in Data Model."
|
|
39
39
|
|
|
40
|
-
# --- 2)
|
|
40
|
+
# --- 2) Data from live Grist
|
|
41
41
|
nodes_df, _ = next(steps.get_grist_data())
|
|
42
42
|
assert not nodes_df.empty, "No nodes fetched from Grist."
|
|
43
43
|
|
|
44
44
|
documents = nodes_df[nodes_df["node_type"] == "document"]
|
|
45
45
|
assert not documents.empty, "Expected at least one 'document' node in raw data."
|
|
46
46
|
|
|
47
|
-
# --- 2.1)
|
|
47
|
+
# --- 2.1) Find document with string document_reference_attr
|
|
48
48
|
ref_node_attrs: Optional[Dict[str, Any]] = None
|
|
49
49
|
for _, row in documents.iterrows():
|
|
50
50
|
attrs: Dict[str, Any] = row["attributes"] or {}
|
|
@@ -58,13 +58,13 @@ def test_anchor_attributes_reference_type_column() -> None:
|
|
|
58
58
|
in at least one 'document' node (raw data).
|
|
59
59
|
"""
|
|
60
60
|
|
|
61
|
-
# --- 2.2)
|
|
61
|
+
# --- 2.2) Verify that gristHelper_* keys did not leak into attributes
|
|
62
62
|
assert not any(k.startswith("gristHelper_") for k in ref_node_attrs.keys()), """
|
|
63
63
|
gristHelper_* keys leaked into attributes; the SQL provider should have
|
|
64
64
|
used them to reconstruct the final string value and NOT keep helper keys.
|
|
65
65
|
"""
|
|
66
66
|
|
|
67
|
-
# --- 3)
|
|
67
|
+
# --- 3) Verify filtering by Data Model
|
|
68
68
|
docs_f = nodes_df[nodes_df["node_type"] == "document"]
|
|
69
69
|
assert not docs_f.empty, "Filtered graph should still contain 'document' nodes."
|
|
70
70
|
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Integration test: anchor_link_columns
|
|
3
|
+
|
|
4
|
+
Description:
|
|
5
|
+
Reference / Reference List columns in Anchor_* tables should generate edges between
|
|
6
|
+
nodes according to Data Model.
|
|
7
|
+
|
|
8
|
+
Data:
|
|
9
|
+
- link document <-> document_chunk is defined by a column in Anchor_document:
|
|
10
|
+
link_document_has_document_chunk
|
|
11
|
+
- document 'document:1' in test data has references to chunks:
|
|
12
|
+
document_chunk:01, document_chunk:02, document_chunk:03, document_chunk:05
|
|
13
|
+
|
|
14
|
+
Checks:
|
|
15
|
+
1) Find link document -> document_chunk in DM and get its sentence.
|
|
16
|
+
2) Get edges from pipeline (steps.get_grist_data()).
|
|
17
|
+
3) Verify that edges_df contains edges between 'document:1' and the listed chunks
|
|
18
|
+
with the required edge_label (from DM). Direction doesn't matter.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from typing import Set, Tuple
|
|
22
|
+
|
|
23
|
+
import pandas as pd
|
|
24
|
+
from dotenv import load_dotenv
|
|
25
|
+
|
|
26
|
+
from vedana_etl import steps
|
|
27
|
+
|
|
28
|
+
load_dotenv()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _unordered(a: str, b: str) -> Tuple[str, str]:
|
|
32
|
+
return (a, b) if a <= b else (b, a)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_anchor_link_columns() -> None:
|
|
36
|
+
# 1) Get sentence from Data Model for document <-> document_chunk
|
|
37
|
+
_anchors_df, _a_attrs_df, _l_attrs_df, links_df, _q_df, _p_df, _cl_df = next(steps.get_data_model())
|
|
38
|
+
assert not links_df.empty, "Data Model Links is empty."
|
|
39
|
+
|
|
40
|
+
dm = links_df.copy()
|
|
41
|
+
a1 = dm["anchor1"].astype(str).str.lower().str.strip()
|
|
42
|
+
a2 = dm["anchor2"].astype(str).str.lower().str.strip()
|
|
43
|
+
row = dm[(a1 == "document") & (a2 == "document_chunk")]
|
|
44
|
+
assert not row.empty, "No link document -> document_chunk in Data Model."
|
|
45
|
+
sentence = str(row.iloc[0]["sentence"]).strip()
|
|
46
|
+
assert sentence, "Empty sentence for document <-> document_chunk in Data Model."
|
|
47
|
+
|
|
48
|
+
# 2) Get edges from pipeline
|
|
49
|
+
nodes_df, edges_df = next(steps.get_grist_data())
|
|
50
|
+
assert isinstance(edges_df, pd.DataFrame) and not edges_df.empty, "edges_df is empty."
|
|
51
|
+
|
|
52
|
+
# Filter required edges: document <-> document_chunk, required label
|
|
53
|
+
ft = edges_df["from_node_type"].astype(str).str.lower().str.strip()
|
|
54
|
+
tt = edges_df["to_node_type"].astype(str).str.lower().str.strip()
|
|
55
|
+
lbl = edges_df["edge_label"].astype(str).str.lower().str.strip()
|
|
56
|
+
|
|
57
|
+
target_edges = edges_df[
|
|
58
|
+
(((ft == "document") & (tt == "document_chunk")) | ((ft == "document_chunk") & (tt == "document")))
|
|
59
|
+
& (lbl == sentence.lower())
|
|
60
|
+
].copy()
|
|
61
|
+
|
|
62
|
+
assert not target_edges.empty, f"No edges document <-> document_chunk with label '{sentence}' found."
|
|
63
|
+
|
|
64
|
+
# Build set of actual undirected pairs
|
|
65
|
+
actual_pairs: Set[Tuple[str, str]] = set(
|
|
66
|
+
_unordered(str(r["from_node_id"]).strip(), str(r["to_node_id"]).strip()) for _, r in target_edges.iterrows()
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# 3) Expected pairs for 'document:1' from test data
|
|
70
|
+
expected_pairs: Set[Tuple[str, str]] = {
|
|
71
|
+
_unordered("document:1", "document_chunk:01"),
|
|
72
|
+
_unordered("document:1", "document_chunk:02"),
|
|
73
|
+
_unordered("document:1", "document_chunk:03"),
|
|
74
|
+
_unordered("document:1", "document_chunk:05"),
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
missing = sorted(p for p in expected_pairs if p not in actual_pairs)
|
|
78
|
+
assert not missing, f"Not all links from Anchor_document reference column are in the graph. Missing pairs: {missing}"
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Integration test: duplicate_anchor_id_references
|
|
3
|
+
|
|
4
|
+
Description:
|
|
5
|
+
When duplicates of the same logical node exist (e.g., two rows in
|
|
6
|
+
Anchor_document with id "document:1"), only one node should be included
|
|
7
|
+
in the graph (dedup by node_id), and any Reference/Reference List pointing
|
|
8
|
+
to "duplicates" should be recognized as references to the node that remained in the graph.
|
|
9
|
+
|
|
10
|
+
Data:
|
|
11
|
+
- Anchor_document has duplicates "document:1".
|
|
12
|
+
- In Anchor_document_chunk, nodes "document_chunk:02/03/05" reference different
|
|
13
|
+
"document:1" records (at DP-ID level), but in the graph links should be
|
|
14
|
+
with the single node node_id == "document:1".
|
|
15
|
+
|
|
16
|
+
Checks:
|
|
17
|
+
1) In nodes there exists EXACTLY one node with node_id == "document:1".
|
|
18
|
+
2) Between "document:1" and each of {"document_chunk:02","document_chunk:03","document_chunk:05"}
|
|
19
|
+
there exists at least one edge (direction doesn't matter).
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from typing import Set
|
|
23
|
+
|
|
24
|
+
import pandas as pd
|
|
25
|
+
from dotenv import load_dotenv
|
|
26
|
+
|
|
27
|
+
from vedana_etl import steps
|
|
28
|
+
|
|
29
|
+
load_dotenv()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _has_edge_between(
|
|
33
|
+
edges_df: pd.DataFrame,
|
|
34
|
+
a_id: str,
|
|
35
|
+
a_type: str,
|
|
36
|
+
b_id: str,
|
|
37
|
+
b_type: str,
|
|
38
|
+
) -> bool:
|
|
39
|
+
"""
|
|
40
|
+
Check for edge existence between (a_id, a_type) and (b_id, b_type) in any direction.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
if edges_df.empty:
|
|
44
|
+
return False
|
|
45
|
+
from_id = edges_df["from_node_id"].astype(str)
|
|
46
|
+
to_id = edges_df["to_node_id"].astype(str)
|
|
47
|
+
from_t = edges_df["from_node_type"].astype(str).str.lower()
|
|
48
|
+
to_t = edges_df["to_node_type"].astype(str).str.lower()
|
|
49
|
+
|
|
50
|
+
mask_ab = (from_id == a_id) & (from_t == a_type.lower()) & (to_id == b_id) & (to_t == b_type.lower())
|
|
51
|
+
mask_ba = (from_id == b_id) & (from_t == b_type.lower()) & (to_id == a_id) & (to_t == a_type.lower())
|
|
52
|
+
return bool(edges_df[mask_ab | mask_ba].shape[0])
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_duplicate_anchor_id_references() -> None:
|
|
56
|
+
"""
|
|
57
|
+
Test that when node IDs are duplicated, any Reference to duplicates should be parsed
|
|
58
|
+
as Reference to the node that ends up in the graph.
|
|
59
|
+
"""
|
|
60
|
+
|
|
61
|
+
# 1) Load raw graph tables
|
|
62
|
+
nodes_df, edges_df = next(steps.get_grist_data())
|
|
63
|
+
assert not nodes_df.empty, "No nodes received from Grist."
|
|
64
|
+
assert isinstance(edges_df, pd.DataFrame), "edges_df should be a DataFrame."
|
|
65
|
+
|
|
66
|
+
# 2) Dedup by "document:1": only one node with this node_id should remain in the graph
|
|
67
|
+
doc1_rows = nodes_df[nodes_df["node_id"].astype(str) == "document:1"]
|
|
68
|
+
assert not doc1_rows.empty, "No 'document:1' node in the graph. Check test data."
|
|
69
|
+
assert (
|
|
70
|
+
doc1_rows.shape[0] == 1
|
|
71
|
+
), f"Expected exactly one 'document: 1' node after deduplication, got {doc1_rows.shape[0]}."
|
|
72
|
+
assert doc1_rows.iloc[0]["node_type"] == "document", "Node 'document:1' should be of type 'document'."
|
|
73
|
+
|
|
74
|
+
# 3) Inter-node links: document:1 <-> document_chunk:{02,03,05}
|
|
75
|
+
required_chunks: Set[str] = {"document_chunk:02", "document_chunk:03", "document_chunk:05"}
|
|
76
|
+
missing = [
|
|
77
|
+
ch for ch in required_chunks if not _has_edge_between(edges_df, "document:1", "document", ch, "document_chunk")
|
|
78
|
+
]
|
|
79
|
+
assert not missing, "Expected links between 'document:1' and specified chunks, but not found: " + ", ".join(
|
|
80
|
+
sorted(missing)
|
|
81
|
+
)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Integration test: duplicate_anchor_ids
|
|
3
|
+
|
|
4
|
+
Description:
|
|
5
|
+
When node IDs are duplicated, the pipeline should not break.
|
|
6
|
+
The first described node for a given node_id should be included in the graph.
|
|
7
|
+
|
|
8
|
+
Test data:
|
|
9
|
+
Anchor_document has two objects with the same node_id "document:1".
|
|
10
|
+
After drop_duplicates by "node_id", the first record is expected to remain
|
|
11
|
+
(in the test Grist snapshot this is the document with document_name = "doc_a").
|
|
12
|
+
|
|
13
|
+
Checks:
|
|
14
|
+
1) The final nodes_df contains exactly one node with node_id == "document:1".
|
|
15
|
+
2) Its attributes correspond to the first record (document_name == "doc_a").
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from typing import Dict
|
|
19
|
+
|
|
20
|
+
from dotenv import load_dotenv
|
|
21
|
+
|
|
22
|
+
from vedana_etl import steps
|
|
23
|
+
|
|
24
|
+
load_dotenv()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def test_duplicate_anchor_ids_keep_first() -> None:
|
|
28
|
+
"""
|
|
29
|
+
Verify that when duplicates by node_id exist, the first record is kept.
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
# 1) Load graph tables from live Grist via standard pipeline step.
|
|
33
|
+
nodes_df, _ = next(steps.get_grist_data())
|
|
34
|
+
assert not nodes_df.empty, "No nodes fetched from Grist; check test data and GRIST_* env."
|
|
35
|
+
|
|
36
|
+
# 2) Filter by the problematic identifier from test data.
|
|
37
|
+
masked = nodes_df[nodes_df["node_id"] == "document:1"]
|
|
38
|
+
assert not masked.empty, "Expected at least one node with node_id == 'document:1' in test data."
|
|
39
|
+
|
|
40
|
+
# 3) Duplicates should be collapsed: exactly one row remains.
|
|
41
|
+
assert len(masked) == 1, f"Duplicate node_id 'document: 1' wasn't deduplicated. Found {len(masked)} rows."
|
|
42
|
+
|
|
43
|
+
# 4) Verify that the first record was kept (expected name 'doc_a').
|
|
44
|
+
attrs: Dict[str, object] = masked.iloc[0]["attributes"] or {}
|
|
45
|
+
got_name = attrs.get("document_name")
|
|
46
|
+
assert got_name == "doc_a", f"""
|
|
47
|
+
Deduplication didn't preserve the first record for 'document:1'.
|
|
48
|
+
Expected document_name == 'doc_a', got {got_name!r}.
|
|
49
|
+
"""
|