vedana-etl 0.1.0.dev3__py3-none-any.whl
This diff shows the contents of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
- vedana_etl/__init__.py +0 -0
- vedana_etl/app.py +10 -0
- vedana_etl/catalog.py +266 -0
- vedana_etl/config.py +22 -0
- vedana_etl/pipeline.py +142 -0
- vedana_etl/py.typed +0 -0
- vedana_etl/schemas.py +31 -0
- vedana_etl/settings.py +23 -0
- vedana_etl/steps.py +685 -0
- vedana_etl/store.py +208 -0
- vedana_etl-0.1.0.dev3.dist-info/METADATA +51 -0
- vedana_etl-0.1.0.dev3.dist-info/RECORD +13 -0
- vedana_etl-0.1.0.dev3.dist-info/WHEEL +4 -0
vedana_etl/store.py
ADDED
@@ -0,0 +1,208 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple

import pandas as pd
from grist_api import GristDocAPI
from sqlalchemy import Column

from datapipe.run_config import RunConfig
from datapipe.store.database import MetaKey
from datapipe.store.table_store import TableStore, TableStoreCaps
from datapipe.types import DataDF, DataSchema, IndexDF, MetaSchema


@dataclass(frozen=True)
class _GristAuth:
    server: str
    api_key: str
    doc_id: str


def _row_dicts_from_df(df: DataDF, columns: Sequence[str]) -> List[Dict[str, Any]]:
    # Ensure JSON-serializable values: convert NaN to None
    clean_df = df.copy()
    if not clean_df.empty:
        clean_df = clean_df.where(pd.notna(clean_df), None)
    return [
        {col: row.get(col) for col in columns}
        for row in clean_df.to_dict(orient="records")  # type: ignore
    ]


def _pk_tuple_from_fields(fields: Dict[str, Any], pk_columns: Sequence[str]) -> Tuple[Any, ...]:
    return tuple(fields.get(col) for col in pk_columns)


class GristStore(TableStore):
    """Grist document table backend.
    Provided table is expected to exist and its schema is expected to match the provided schema.
    Based on Grist Python Client: https://github.com/gristlabs/py_grist_api
    """

    caps = TableStoreCaps(
        supports_delete=True,
        supports_get_schema=True,
        supports_read_all_rows=True,
        supports_read_nonexistent_rows=True,
        supports_read_meta_pseudo_df=True,
    )

    def __init__(
        self,
        server: str,
        api_key: str,
        doc_id: str,
        table: str,
        data_sql_schema: List[Column],
        page_size: int = 1000,
    ) -> None:
        super().__init__()
        self._auth = _GristAuth(server=server.rstrip("/"), api_key=api_key, doc_id=doc_id)
        self.table = table
        self.data_sql_schema = data_sql_schema
        self._pk_columns: List[str] = [c.name for c in self.data_sql_schema if c.primary_key]
        self._value_columns: List[str] = [c.name for c in self.data_sql_schema if not c.primary_key]
        self._page_size = page_size

        self._api = GristDocAPI(self._auth.doc_id, api_key=self._auth.api_key, server=self._auth.server)

    # Basic metadata interface
    def get_schema(self) -> DataSchema:
        return self.data_sql_schema

    def get_primary_schema(self) -> DataSchema:
        return [c for c in self.data_sql_schema if c.primary_key]

    def get_meta_schema(self) -> MetaSchema:
        meta_key_prop = MetaKey.get_property_name()
        return [column for column in self.data_sql_schema if hasattr(column, meta_key_prop)]

    # Endpoints (for low-level access via client.call)
    @property
    def _records_path(self) -> str:
        return f"tables/{self.table}/records"

    # Low-level helpers
    def _iter_all_records(self) -> Iterator[Dict[str, Any]]:
        offset = 0
        while True:
            records = (
                self._api.call(self._records_path, {"limit": self._page_size, "offset": offset}, method="GET")
                .json()
                .get("records")
            )
            if not records:
                break
            for rec in records:
                yield rec
            if len(records) < self._page_size:
                break
            offset += self._page_size

    def _map_pk_to_record_id(self) -> Dict[Tuple[Any, ...], int]:
        mapping: Dict[Tuple[Any, ...], int] = {}
        for rec in self._iter_all_records():
            rec_id_any = rec.get("id")
            if not isinstance(rec_id_any, int):
                continue
            rec_id: int = rec_id_any
            fields: Dict[str, Any] = rec.get("fields", {})
            pk = _pk_tuple_from_fields(fields, self._pk_columns)
            mapping[pk] = rec_id
        return mapping

    def _fetch_all_rows(self) -> List[Dict[str, Any]]:
        data = self._api.fetch_table(self.table)
        return [r._asdict() for r in data]  # namedtuples -> dict

    # CRUD
    def insert_rows(self, df: DataDF) -> None:
        if df.empty:
            return
        all_columns = self._pk_columns + self._value_columns
        rows = _row_dicts_from_df(df, all_columns)
        self._api.add_records(self.table, rows)

    def update_rows(self, df: DataDF) -> None:
        if df.empty:
            return

        # Upsert using sync_table helper
        all_columns = self._pk_columns + self._value_columns
        new_data = [type("Row", (), row) for row in _row_dicts_from_df(df, all_columns)]
        key_cols = [(c, c) for c in self._pk_columns]
        other_cols = [(c, c) for c in self._value_columns]
        self._api.sync_table(self.table, new_data, key_cols, other_cols)

    def delete_rows(self, idx: IndexDF) -> None:
        if idx is None or idx.empty:
            return

        # Map keys to Grist record ids, then delete those records
        pk_to_id = self._map_pk_to_record_id()

        ids_to_delete: List[int] = []
        for row in idx[self._pk_columns].to_dict(orient="records"):
            pk = tuple(row.get(col) for col in self._pk_columns)
            rec_id = pk_to_id.get(pk)
            if rec_id is not None:
                ids_to_delete.append(rec_id)

        if not ids_to_delete:
            return

        self._api.delete_records(self.table, ids_to_delete)

    def read_rows(self, idx: Optional[IndexDF] = None) -> DataDF:
        if idx is None:  # Read full table
            rows = self._fetch_all_rows()
            if rows:
                return pd.DataFrame.from_records(rows)[[c.name for c in self.data_sql_schema]]
            return pd.DataFrame(columns=[c.name for c in self.data_sql_schema])

        if idx.empty:
            return pd.DataFrame(columns=[c.name for c in self.data_sql_schema])

        # Build a local index of existing rows and filter
        wanted: set[Tuple[Any, ...]] = set()
        for row in idx[self._pk_columns].to_dict(orient="records"):
            wanted.add(tuple(row.get(col) for col in self._pk_columns))

        rows = self._fetch_all_rows()
        if not rows:
            return pd.DataFrame(columns=[c.name for c in self.data_sql_schema])
        df_all = pd.DataFrame.from_records(rows)
        if self._pk_columns:
            tuples = list(map(tuple, df_all[self._pk_columns].astype(object).itertuples(index=False, name=None)))
            mask = [t in wanted for t in tuples]
            df_sel = df_all.loc[mask]
        else:
            df_sel = df_all.iloc[0:0]
        if df_sel.empty:
            return pd.DataFrame(columns=[c.name for c in self.data_sql_schema])
        return df_sel[[c.name for c in self.data_sql_schema]]

    def read_rows_meta_pseudo_df(
        self,
        chunksize: int = 1000,
        run_config: Optional[RunConfig] = None,
    ) -> Iterator[DataDF]:
        # Stream records in pages and yield as DataFrames
        buffer: List[Dict[str, Any]] = []

        def flush() -> Iterator[DataDF]:
            nonlocal buffer
            if buffer:
                df = pd.DataFrame.from_records(buffer)
                buffer = []
                yield df

        rows = self._fetch_all_rows()
        for row in rows:
            buffer.append(row)
            if len(buffer) >= chunksize:
                yield from flush()

        yield from flush()
vedana_etl-0.1.0.dev3.dist-info/METADATA
ADDED
@@ -0,0 +1,51 @@
Metadata-Version: 2.4
Name: vedana-etl
Version: 0.1.0.dev3
Summary: Pipeline template for Vedana
Author-email: Andrey Tatarinov <a@tatarinov.co>, Timur Sheydaev <tsheyd@epoch8.co>
Requires-Python: >=3.12
Requires-Dist: alembic>=1.16.1
Requires-Dist: datapipe-app>=0.5.4
Requires-Dist: datapipe-core>=0.14.3
Requires-Dist: grist-api>=0.1.1
Requires-Dist: neo4j>=5.28.1
Requires-Dist: openai>=2.8.0
Requires-Dist: pandas>=1.2.0
Requires-Dist: pgvector>=0.4.2
Requires-Dist: pytest>=8.4.1
Requires-Dist: requests>=2.32.4
Requires-Dist: sqlalchemy>=2.0.41
Requires-Dist: vedana-core>=0.5.0
Description-Content-Type: text/markdown

# Basic pipeline for all Vedana projects

This pipeline:

- Parses Grist Data & Data Model
- Ensures that the Memgraph index/vector index structure is in sync with the data model
- Updates the Memgraph database incrementally

To add steps:
1. Pass extra transformations to [get_pipeline](src/pipeline.py)
2. Create a new app configuration from [app.py](src/app.py)

## Pipeline Labels Hierarchy

### Pipeline

`labels=("pipeline", "pipeline_name")` defines a set of operations as a standalone unit, similar to a DAG in Airflow
or a Job in Dagster. Its purpose is to let the ETL page of Backoffice render the pipeline as a separate tab so it can
be inspected independently of other transformations.

### Stage

`labels=("stage", "stage_name")` defines a stage of a `pipeline`. Currently, stages are used for observability
features, such as the [main dashboard's](/libs/vedana-backoffice/vedana_backoffice/pages/main_dashboard.py)
Ingest table, which displays the DataTables of all transformations with `labels=("stage", "extract")`.
Stages are also useful when running the pipeline manually.

### Flow

`labels=("flow", "flow_name")` groups a `pipeline` (or several pipelines) for convenient execution, e.g. when
defining cron jobs.
vedana_etl-0.1.0.dev3.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
vedana_etl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
vedana_etl/app.py,sha256=-nD70kEUdMYpZSt6gW0rCUNCakCnbM47McD7E_VGlGQ,318
vedana_etl/catalog.py,sha256=oNDSXQcOUG-D4IfleGgWGhe4H2oCpmG0Hxdj1TU2ekA,7432
vedana_etl/config.py,sha256=z9L_EY_UsHK5lfmaNSkTj-f9Kxx43hZfZ5hURDZSnOs,603
vedana_etl/pipeline.py,sha256=AcDcWBgauucIPe9mnL8Uar6Vv3jUqBXg0kINLwR5s9I,4145
vedana_etl/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
vedana_etl/schemas.py,sha256=OaV7zop6bkzIHZQh56oeH29wz7KVtm7wSEvBMJTsaIA,948
vedana_etl/settings.py,sha256=9phXeq3-YlTsUnBkMvoMemzMXAQjMEYrouEpoXSbUxk,612
vedana_etl/steps.py,sha256=27mS155c7CkuZPKQlao1cSnLhOy9qLja6MQ7Oqa2tKY,26081
vedana_etl/store.py,sha256=5JbxJJ3Yhbzw_klZOwtImdW9jGLOiMU6VAGHXDNxf6Q,7386
vedana_etl-0.1.0.dev3.dist-info/METADATA,sha256=mSiht1wRjN_cFkTzGBzQsH3Zndtm4JgOv2G_2KKDMug,1873
vedana_etl-0.1.0.dev3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
vedana_etl-0.1.0.dev3.dist-info/RECORD,,