vedana-etl 0.1.0.dev3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vedana_etl/store.py ADDED
@@ -0,0 +1,208 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple
+
+ import pandas as pd
+ from grist_api import GristDocAPI
+ from sqlalchemy import Column
+
+ from datapipe.run_config import RunConfig
+ from datapipe.store.database import MetaKey
+ from datapipe.store.table_store import TableStore, TableStoreCaps
+ from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema
+
+
+ @dataclass(frozen=True)
+ class _GristAuth:
+     server: str
+     api_key: str
+     doc_id: str
+
+
+ def _row_dicts_from_df(df: DataDF, columns: Sequence[str]) -> List[Dict[str, Any]]:
+     # Ensure JSON-serializable values: convert NaN to None
+     clean_df = df.copy()
+     if not clean_df.empty:
+         clean_df = clean_df.where(pd.notna(clean_df), None)
+     return [
+         {col: row.get(col) for col in columns}
+         for row in clean_df.to_dict(orient="records")  # type: ignore
+     ]
+
+
+ def _pk_tuple_from_fields(fields: Dict[str, Any], pk_columns: Sequence[str]) -> Tuple[Any, ...]:
+     return tuple(fields.get(col) for col in pk_columns)
+
+
+ class GristStore(TableStore):
39
+ """Grist document table backend.
40
+ Provided table is expected to exist and its schema is expected to match the provided schema.
41
+ Based on Grist Python Client: https://github.com/gristlabs/py_grist_api
42
+ """
+
+     caps = TableStoreCaps(
+         supports_delete=True,
+         supports_get_schema=True,
+         supports_read_all_rows=True,
+         supports_read_nonexistent_rows=True,
+         supports_read_meta_pseudo_df=True,
+     )
+
+     def __init__(
+         self,
+         server: str,
+         api_key: str,
+         doc_id: str,
+         table: str,
+         data_sql_schema: List[Column],
+         page_size: int = 1000,
+     ) -> None:
+         super().__init__()
+         self._auth = _GristAuth(server=server.rstrip("/"), api_key=api_key, doc_id=doc_id)
+         self.table = table
+         self.data_sql_schema = data_sql_schema
+         self._pk_columns: List[str] = [c.name for c in self.data_sql_schema if c.primary_key]
+         self._value_columns: List[str] = [c.name for c in self.data_sql_schema if not c.primary_key]
+         self._page_size = page_size
+
+         self._api = GristDocAPI(self._auth.doc_id, api_key=self._auth.api_key, server=self._auth.server)
+
+     # Basic metadata interface
+     def get_schema(self) -> DataSchema:
+         return self.data_sql_schema
+
+     def get_primary_schema(self) -> DataSchema:
+         return [c for c in self.data_sql_schema if c.primary_key]
+
+     def get_meta_schema(self) -> MetaSchema:
+         meta_key_prop = MetaKey.get_property_name()
+         return [column for column in self.data_sql_schema if hasattr(column, meta_key_prop)]
+
+     # Endpoints (for low-level access via client.call)
+     @property
+     def _records_path(self) -> str:
+         return f"tables/{self.table}/records"
+
+     # Low-level helpers
+     def _iter_all_records(self) -> Iterator[Dict[str, Any]]:
+         offset = 0
+         while True:
+             records = (
+                 self._api.call(self._records_path, {"limit": self._page_size, "offset": offset}, method="GET")
+                 .json()
+                 .get("records")
+             )
+             if not records:
+                 break
+             for rec in records:
+                 yield rec
+             if len(records) < self._page_size:
+                 break
+             offset += self._page_size
+
+     def _map_pk_to_record_id(self) -> Dict[Tuple[Any, ...], int]:
+         mapping: Dict[Tuple[Any, ...], int] = {}
+         for rec in self._iter_all_records():
+             rec_id_any = rec.get("id")
+             if not isinstance(rec_id_any, int):
+                 continue
+             rec_id: int = rec_id_any
+             fields: Dict[str, Any] = rec.get("fields", {})
+             pk = _pk_tuple_from_fields(fields, self._pk_columns)
+             mapping[pk] = rec_id
+         return mapping
+
+     def _fetch_all_rows(self) -> List[Dict[str, Any]]:
+         data = self._api.fetch_table(self.table)
+         return [r._asdict() for r in data]  # namedtuples -> dict
+
+     # CRUD
+     def insert_rows(self, df: DataDF) -> None:
+         if df.empty:
+             return
+         all_columns = self._pk_columns + self._value_columns
+         rows = _row_dicts_from_df(df, all_columns)
+         self._api.add_records(self.table, rows)
+
+     def update_rows(self, df: DataDF) -> None:
+         if df.empty:
+             return
+
+         # Upsert using sync_table helper
+         all_columns = self._pk_columns + self._value_columns
+         new_data = [type("Row", (), row) for row in _row_dicts_from_df(df, all_columns)]
+         key_cols = [(c, c) for c in self._pk_columns]
+         other_cols = [(c, c) for c in self._value_columns]
+         self._api.sync_table(self.table, new_data, key_cols, other_cols)
+
+     def delete_rows(self, idx: IndexDF) -> None:
+         if idx is None or idx.empty:
+             return
+
+         # Map primary keys to Grist record ids, then delete via the records API
+         pk_to_id = self._map_pk_to_record_id()
+
+         ids_to_delete: List[int] = []
+         for row in idx[self._pk_columns].to_dict(orient="records"):
+             pk = tuple(row.get(col) for col in self._pk_columns)
+             rec_id = pk_to_id.get(pk)
+             if rec_id is not None:
+                 ids_to_delete.append(rec_id)
+
+         if not ids_to_delete:
+             return
+
+         self._api.delete_records(self.table, ids_to_delete)
+
+     def read_rows(self, idx: Optional[IndexDF] = None) -> DataDF:
+         if idx is None:  # Read full table
+             rows = self._fetch_all_rows()
+             if rows:
+                 return pd.DataFrame.from_records(rows)[[c.name for c in self.data_sql_schema]]
+             return pd.DataFrame(columns=[c.name for c in self.data_sql_schema])
+
+         if idx.empty:
+             return pd.DataFrame(columns=[c.name for c in self.data_sql_schema])
+
+         # Build a local index of existing rows and filter
+         wanted: set[Tuple[Any, ...]] = set()
+         for row in idx[self._pk_columns].to_dict(orient="records"):
+             wanted.add(tuple(row.get(col) for col in self._pk_columns))
+
+         rows = self._fetch_all_rows()
+         if not rows:
+             return pd.DataFrame(columns=[c.name for c in self.data_sql_schema])
+         df_all = pd.DataFrame.from_records(rows)
+         if self._pk_columns:
+             tuples = list(map(tuple, df_all[self._pk_columns].astype(object).itertuples(index=False, name=None)))
+             mask = [t in wanted for t in tuples]
+             df_sel = df_all.loc[mask]
+         else:
+             df_sel = df_all.iloc[0:0]
+         if df_sel.empty:
+             return pd.DataFrame(columns=[c.name for c in self.data_sql_schema])
+         return df_sel[[c.name for c in self.data_sql_schema]]
+
+     def read_rows_meta_pseudo_df(
+         self,
+         chunksize: int = 1000,
+         run_config: Optional[RunConfig] = None,
+     ) -> Iterator[DataDF]:
+         # Stream records in pages and yield as DataFrames
+         buffer: List[Dict[str, Any]] = []
+
+         def flush() -> Iterator[DataDF]:
+             nonlocal buffer
+             if buffer:
+                 df = pd.DataFrame.from_records(buffer)
+                 buffer = []
+                 yield df
+
+         rows = self._fetch_all_rows()
+         for row in rows:
+             buffer.append(row)
+             if len(buffer) >= chunksize:
+                 yield from flush()
+
+         yield from flush()
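+
+
+ if __name__ == "__main__":
+     # Minimal usage sketch. The server URL, API key, doc id, table and column
+     # names below are hypothetical placeholders, not values shipped with this
+     # package; point them at a real Grist document before running.
+     from sqlalchemy import Integer, String
+
+     store = GristStore(
+         server="https://grist.example.com",
+         api_key="YOUR_GRIST_API_KEY",
+         doc_id="your-doc-id",
+         table="Items",
+         data_sql_schema=[
+             Column("item_id", Integer, primary_key=True),
+             Column("name", String),
+         ],
+     )
+     df = store.read_rows()  # reads the full table as a pandas DataFrame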
vedana_etl-0.1.0.dev3.dist-info/METADATA ADDED
@@ -0,0 +1,51 @@
+ Metadata-Version: 2.4
+ Name: vedana-etl
+ Version: 0.1.0.dev3
+ Summary: Pipeline template for Vedana
+ Author-email: Andrey Tatarinov <a@tatarinov.co>, Timur Sheydaev <tsheyd@epoch8.co>
+ Requires-Python: >=3.12
+ Requires-Dist: alembic>=1.16.1
+ Requires-Dist: datapipe-app>=0.5.4
+ Requires-Dist: datapipe-core>=0.14.3
+ Requires-Dist: grist-api>=0.1.1
+ Requires-Dist: neo4j>=5.28.1
+ Requires-Dist: openai>=2.8.0
+ Requires-Dist: pandas>=1.2.0
+ Requires-Dist: pgvector>=0.4.2
+ Requires-Dist: pytest>=8.4.1
+ Requires-Dist: requests>=2.32.4
+ Requires-Dist: sqlalchemy>=2.0.41
+ Requires-Dist: vedana-core>=0.5.0
+ Description-Content-Type: text/markdown
+
+ # Basic pipeline for all Vedana projects
+
+ This pipeline:
+
+ - Parses Grist Data & Data Model
+ - Ensures that the Memgraph index/vector index structure is in sync with the data model
+ - Updates the Memgraph database incrementally
+
+ To add steps:
+ 1. Pass extra transformations to [get_pipeline](src/pipeline.py) (see the sketch below)
+ 2. Create a new app configuration from [app.py](src/app.py)
+
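+ A minimal sketch of step 1. The keyword name `extra_steps` and the step list are illustrative assumptions only;
+ check `get_pipeline`'s actual signature in src/pipeline.py:
+
+ ```python
+ from vedana_etl.pipeline import get_pipeline
+
+ my_extra_steps = [
+     # additional datapipe transformations go here
+ ]
+ pipeline = get_pipeline(extra_steps=my_extra_steps)  # hypothetical keyword argument
+ ```
+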
+ ## Pipeline Labels Hierarchy
+
+ ### Pipeline
+
+ `labels=("pipeline", "pipeline_name")` defines a set of operations as a standalone unit, much like a DAG in Airflow
+ or a Dagster Job. Its purpose is to render the pipeline as a separate tab on the ETL page of Backoffice, so that it
+ can be inspected independently of other transformations.
+
+ ### Stage
+
+ `labels=("stage", "stage_name")` defines a stage of a `pipeline`. Currently, stages are useful for creating and managing
+ observability features, such as the [main dashboard's](/libs/vedana-backoffice/vedana_backoffice/pages/main_dashboard.py)
+ Ingest table, which displays the DataTables of all transformations with `labels=("stage", "extract")`.
+ Stages are also useful when running the pipeline manually.
+
+ ### Flow
+
+ `labels=("flow", "flow_name")` helps execute a `pipeline` (or possibly several pipelines) conveniently; it is used
+ when defining cron jobs and similar scheduled runs. A combined example of all three label levels is sketched below.
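+
+ A minimal sketch of the label hierarchy on a single transformation. The function, table names, label values, and the
+ `BatchTransform` import path are illustrative assumptions; adjust them to your datapipe version and catalog:
+
+ ```python
+ from datapipe.step.batch_transform import BatchTransform  # import path may differ between datapipe versions
+
+ step = BatchTransform(
+     enrich_items,  # hypothetical transform function
+     inputs=["grist_items"],
+     outputs=["items_enriched"],
+     labels=[
+         ("pipeline", "grist_to_memgraph"),
+         ("stage", "extract"),
+         ("flow", "hourly"),
+     ],
+ )
+ ```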
vedana_etl-0.1.0.dev3.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ vedana_etl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vedana_etl/app.py,sha256=-nD70kEUdMYpZSt6gW0rCUNCakCnbM47McD7E_VGlGQ,318
+ vedana_etl/catalog.py,sha256=oNDSXQcOUG-D4IfleGgWGhe4H2oCpmG0Hxdj1TU2ekA,7432
+ vedana_etl/config.py,sha256=z9L_EY_UsHK5lfmaNSkTj-f9Kxx43hZfZ5hURDZSnOs,603
+ vedana_etl/pipeline.py,sha256=AcDcWBgauucIPe9mnL8Uar6Vv3jUqBXg0kINLwR5s9I,4145
+ vedana_etl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vedana_etl/schemas.py,sha256=OaV7zop6bkzIHZQh56oeH29wz7KVtm7wSEvBMJTsaIA,948
+ vedana_etl/settings.py,sha256=9phXeq3-YlTsUnBkMvoMemzMXAQjMEYrouEpoXSbUxk,612
+ vedana_etl/steps.py,sha256=27mS155c7CkuZPKQlao1cSnLhOy9qLja6MQ7Oqa2tKY,26081
+ vedana_etl/store.py,sha256=5JbxJJ3Yhbzw_klZOwtImdW9jGLOiMU6VAGHXDNxf6Q,7386
+ vedana_etl-0.1.0.dev3.dist-info/METADATA,sha256=mSiht1wRjN_cFkTzGBzQsH3Zndtm4JgOv2G_2KKDMug,1873
+ vedana_etl-0.1.0.dev3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ vedana_etl-0.1.0.dev3.dist-info/RECORD,,
vedana_etl-0.1.0.dev3.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.28.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any