vedana-etl 0.1.0.dev3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. vedana_etl-0.1.0.dev3/.gitignore +499 -0
  2. vedana_etl-0.1.0.dev3/CHANGELOG.md +22 -0
  3. vedana_etl-0.1.0.dev3/PKG-INFO +51 -0
  4. vedana_etl-0.1.0.dev3/README.md +31 -0
  5. vedana_etl-0.1.0.dev3/pyproject.toml +76 -0
  6. vedana_etl-0.1.0.dev3/src/vedana_etl/__init__.py +0 -0
  7. vedana_etl-0.1.0.dev3/src/vedana_etl/app.py +10 -0
  8. vedana_etl-0.1.0.dev3/src/vedana_etl/catalog.py +266 -0
  9. vedana_etl-0.1.0.dev3/src/vedana_etl/config.py +22 -0
  10. vedana_etl-0.1.0.dev3/src/vedana_etl/pipeline.py +142 -0
  11. vedana_etl-0.1.0.dev3/src/vedana_etl/py.typed +0 -0
  12. vedana_etl-0.1.0.dev3/src/vedana_etl/schemas.py +31 -0
  13. vedana_etl-0.1.0.dev3/src/vedana_etl/settings.py +23 -0
  14. vedana_etl-0.1.0.dev3/src/vedana_etl/steps.py +685 -0
  15. vedana_etl-0.1.0.dev3/src/vedana_etl/store.py +208 -0
  16. vedana_etl-0.1.0.dev3/tests/.env.example +18 -0
  17. vedana_etl-0.1.0.dev3/tests/__init__.py +0 -0
  18. vedana_etl-0.1.0.dev3/tests/docker-compose.yml +54 -0
  19. vedana_etl-0.1.0.dev3/tests/fixtures/grist/Data Model.grist +3 -0
  20. vedana_etl-0.1.0.dev3/tests/fixtures/grist/Data.grist +3 -0
  21. vedana_etl-0.1.0.dev3/tests/infra/docker-compose.ci.yml +41 -0
  22. vedana_etl-0.1.0.dev3/tests/infra/persist/grist/docs/qAxQ1gcBKcW7kGYq8ayUp7.grist +3 -0
  23. vedana_etl-0.1.0.dev3/tests/infra/persist/grist/docs/wEEmPY3UiwMDVXv6dr4cFs.grist +3 -0
  24. vedana_etl-0.1.0.dev3/tests/infra/persist/grist/grist-sessions.db +3 -0
  25. vedana_etl-0.1.0.dev3/tests/infra/persist/grist/home.sqlite3 +3 -0
  26. vedana_etl-0.1.0.dev3/tests/integ/__init__.py +0 -0
  27. vedana_etl-0.1.0.dev3/tests/integ/conftest.py +68 -0
  28. vedana_etl-0.1.0.dev3/tests/integ/test_anchor_attribute_filtering.py +68 -0
  29. vedana_etl-0.1.0.dev3/tests/integ/test_anchor_attributes_formula_type_column.py +84 -0
  30. vedana_etl-0.1.0.dev3/tests/integ/test_anchor_attributes_reference_type_column.py +79 -0
  31. vedana_etl-0.1.0.dev3/tests/integ/test_anchor_link_columns.py +78 -0
  32. vedana_etl-0.1.0.dev3/tests/integ/test_duplicate_anchor_id_references.py +81 -0
  33. vedana_etl-0.1.0.dev3/tests/integ/test_duplicate_anchor_ids.py +49 -0
  34. vedana_etl-0.1.0.dev3/tests/integ/test_duplicate_edges.py +92 -0
  35. vedana_etl-0.1.0.dev3/tests/integ/test_edge_attribute_dtype.py +91 -0
  36. vedana_etl-0.1.0.dev3/tests/integ/test_edge_attribute_filtering.py +93 -0
  37. vedana_etl-0.1.0.dev3/tests/integ/test_edge_bidirectional.py +104 -0
  38. vedana_etl-0.1.0.dev3/tests/integ/test_edge_node_types.py +86 -0
  39. vedana_etl-0.1.0.dev3/tests/integ/test_table_filtering.py +64 -0
  40. vedana_etl-0.1.0.dev3/tests/unit/__init__.py +0 -0
  41. vedana_etl-0.1.0.dev3/tests/unit/conftest.py +61 -0
  42. vedana_etl-0.1.0.dev3/tests/unit/test_steps_pure.py +149 -0
@@ -0,0 +1,499 @@
1
+ # File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig
2
+ # Created by https://www.toptal.com/developers/gitignore/api/windows,python,pycharm+iml,pycharm+all,pycharm,macos,linux,visualstudiocode
3
+ # Edit at https://www.toptal.com/developers/gitignore?templates=windows,python,pycharm+iml,pycharm+all,pycharm,macos,linux,visualstudiocode
4
+
5
+ ### Linux ###
6
+ *~
7
+
8
+ # temporary files which can be created if a process still has a handle open of a deleted file
9
+ .fuse_hidden*
10
+
11
+ # KDE directory preferences
12
+ .directory
13
+
14
+ # Linux trash folder which might appear on any partition or disk
15
+ .Trash-*
16
+
17
+ # .nfs files are created when an open file is removed but is still being accessed
18
+ .nfs*
19
+
20
+ ### macOS ###
21
+ # General
22
+ .DS_Store
23
+ .AppleDouble
24
+ .LSOverride
25
+
26
+ # Icon must end with two \r
27
+ Icon
28
+
29
+ # Thumbnails
30
+ ._*
31
+
32
+ # Files that might appear in the root of a volume
33
+ .DocumentRevisions-V100
34
+ .fseventsd
35
+ .Spotlight-V100
36
+ .TemporaryItems
37
+ .Trashes
38
+ .VolumeIcon.icns
39
+ .com.apple.timemachine.donotpresent
40
+
41
+ # Directories potentially created on remote AFP share
42
+ .AppleDB
43
+ .AppleDesktop
44
+ Network Trash Folder
45
+ Temporary Items
46
+ .apdisk
47
+
48
+ ### macOS Patch ###
49
+ # iCloud generated files
50
+ *.icloud
51
+
52
+ ### PyCharm ###
53
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
54
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
55
+
56
+ # User-specific stuff
57
+ .idea/**/workspace.xml
58
+ .idea/**/tasks.xml
59
+ .idea/**/usage.statistics.xml
60
+ .idea/**/dictionaries
61
+ .idea/**/shelf
62
+
63
+ # AWS User-specific
64
+ .idea/**/aws.xml
65
+
66
+ # Generated files
67
+ .idea/**/contentModel.xml
68
+
69
+ # Sensitive or high-churn files
70
+ .idea/**/dataSources/
71
+ .idea/**/dataSources.ids
72
+ .idea/**/dataSources.local.xml
73
+ .idea/**/sqlDataSources.xml
74
+ .idea/**/dynamic.xml
75
+ .idea/**/uiDesigner.xml
76
+ .idea/**/dbnavigator.xml
77
+
78
+ # Gradle
79
+ .idea/**/gradle.xml
80
+ .idea/**/libraries
81
+
82
+ # Gradle and Maven with auto-import
83
+ # When using Gradle or Maven with auto-import, you should exclude module files,
84
+ # since they will be recreated, and may cause churn. Uncomment if using
85
+ # auto-import.
86
+ # .idea/artifacts
87
+ # .idea/compiler.xml
88
+ # .idea/jarRepositories.xml
89
+ # .idea/modules.xml
90
+ # .idea/*.iml
91
+ # .idea/modules
92
+ # *.iml
93
+ # *.ipr
94
+
95
+ # CMake
96
+ cmake-build-*/
97
+
98
+ # Mongo Explorer plugin
99
+ .idea/**/mongoSettings.xml
100
+
101
+ # File-based project format
102
+ *.iws
103
+
104
+ # IntelliJ
105
+ out/
106
+
107
+ # mpeltonen/sbt-idea plugin
108
+ .idea_modules/
109
+
110
+ # JIRA plugin
111
+ atlassian-ide-plugin.xml
112
+
113
+ # Cursive Clojure plugin
114
+ .idea/replstate.xml
115
+
116
+ # SonarLint plugin
117
+ .idea/sonarlint/
118
+
119
+ # Crashlytics plugin (for Android Studio and IntelliJ)
120
+ com_crashlytics_export_strings.xml
121
+ crashlytics.properties
122
+ crashlytics-build.properties
123
+ fabric.properties
124
+
125
+ # Editor-based Rest Client
126
+ .idea/httpRequests
127
+
128
+ # Android studio 3.1+ serialized cache file
129
+ .idea/caches/build_file_checksums.ser
130
+
131
+ ### PyCharm Patch ###
132
+ # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
133
+
134
+ # *.iml
135
+ # modules.xml
136
+ # .idea/misc.xml
137
+ # *.ipr
138
+
139
+ # Sonarlint plugin
140
+ # https://plugins.jetbrains.com/plugin/7973-sonarlint
141
+ .idea/**/sonarlint/
142
+
143
+ # SonarQube Plugin
144
+ # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
145
+ .idea/**/sonarIssues.xml
146
+
147
+ # Markdown Navigator plugin
148
+ # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
149
+ .idea/**/markdown-navigator.xml
150
+ .idea/**/markdown-navigator-enh.xml
151
+ .idea/**/markdown-navigator/
152
+
153
+ # Cache file creation bug
154
+ # See https://youtrack.jetbrains.com/issue/JBR-2257
155
+ .idea/$CACHE_FILE$
156
+
157
+ # CodeStream plugin
158
+ # https://plugins.jetbrains.com/plugin/12206-codestream
159
+ .idea/codestream.xml
160
+
161
+ # Azure Toolkit for IntelliJ plugin
162
+ # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
163
+ .idea/**/azureSettings.xml
164
+
165
+ ### PyCharm+all ###
166
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
167
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
168
+
169
+ # User-specific stuff
170
+
171
+ # AWS User-specific
172
+
173
+ # Generated files
174
+
175
+ # Sensitive or high-churn files
176
+
177
+ # Gradle
178
+
179
+ # Gradle and Maven with auto-import
180
+ # When using Gradle or Maven with auto-import, you should exclude module files,
181
+ # since they will be recreated, and may cause churn. Uncomment if using
182
+ # auto-import.
183
+ # .idea/artifacts
184
+ # .idea/compiler.xml
185
+ # .idea/jarRepositories.xml
186
+ # .idea/modules.xml
187
+ # .idea/*.iml
188
+ # .idea/modules
189
+ # *.iml
190
+ # *.ipr
191
+
192
+ # CMake
193
+
194
+ # Mongo Explorer plugin
195
+
196
+ # File-based project format
197
+
198
+ # IntelliJ
199
+
200
+ # mpeltonen/sbt-idea plugin
201
+
202
+ # JIRA plugin
203
+
204
+ # Cursive Clojure plugin
205
+
206
+ # SonarLint plugin
207
+
208
+ # Crashlytics plugin (for Android Studio and IntelliJ)
209
+
210
+ # Editor-based Rest Client
211
+
212
+ # Android studio 3.1+ serialized cache file
213
+
214
+ ### PyCharm+all Patch ###
215
+ # Ignore everything but code style settings and run configurations
216
+ # that are supposed to be shared within teams.
217
+
218
+ .idea/*
219
+
220
+ !.idea/codeStyles
221
+ !.idea/runConfigurations
222
+
223
+ ### PyCharm+iml ###
224
+ # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
225
+ # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
226
+
227
+ # User-specific stuff
228
+
229
+ # AWS User-specific
230
+
231
+ # Generated files
232
+
233
+ # Sensitive or high-churn files
234
+
235
+ # Gradle
236
+
237
+ # Gradle and Maven with auto-import
238
+ # When using Gradle or Maven with auto-import, you should exclude module files,
239
+ # since they will be recreated, and may cause churn. Uncomment if using
240
+ # auto-import.
241
+ # .idea/artifacts
242
+ # .idea/compiler.xml
243
+ # .idea/jarRepositories.xml
244
+ # .idea/modules.xml
245
+ # .idea/*.iml
246
+ # .idea/modules
247
+ # *.iml
248
+ # *.ipr
249
+
250
+ # CMake
251
+
252
+ # Mongo Explorer plugin
253
+
254
+ # File-based project format
255
+
256
+ # IntelliJ
257
+
258
+ # mpeltonen/sbt-idea plugin
259
+
260
+ # JIRA plugin
261
+
262
+ # Cursive Clojure plugin
263
+
264
+ # SonarLint plugin
265
+
266
+ # Crashlytics plugin (for Android Studio and IntelliJ)
267
+
268
+ # Editor-based Rest Client
269
+
270
+ # Android studio 3.1+ serialized cache file
271
+
272
+ ### PyCharm+iml Patch ###
273
+ # Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-249601023
274
+
275
+ *.iml
276
+ modules.xml
277
+ .idea/misc.xml
278
+ *.ipr
279
+
280
+ ### Python ###
281
+ # Byte-compiled / optimized / DLL files
282
+ __pycache__/
283
+ *.py[cod]
284
+ *$py.class
285
+
286
+ # C extensions
287
+ *.so
288
+
289
+ # Distribution / packaging
290
+ .Python
291
+ build/
292
+ develop-eggs/
293
+ dist/
294
+ downloads/
295
+ eggs/
296
+ .eggs/
297
+ lib/
298
+ lib64/
299
+ parts/
300
+ sdist/
301
+ var/
302
+ wheels/
303
+ share/python-wheels/
304
+ *.egg-info/
305
+ .installed.cfg
306
+ *.egg
307
+ MANIFEST
308
+
309
+ # PyInstaller
310
+ # Usually these files are written by a python script from a template
311
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
312
+ *.manifest
313
+ *.spec
314
+
315
+ # Installer logs
316
+ pip-log.txt
317
+ pip-delete-this-directory.txt
318
+
319
+ # Unit test / coverage reports
320
+ htmlcov/
321
+ .tox/
322
+ .nox/
323
+ .coverage
324
+ .coverage.*
325
+ .cache
326
+ nosetests.xml
327
+ coverage.xml
328
+ *.cover
329
+ *.py,cover
330
+ .hypothesis/
331
+ .pytest_cache/
332
+ cover/
333
+
334
+ # Translations
335
+ *.mo
336
+ *.pot
337
+
338
+ # Django stuff:
339
+ *.log
340
+ local_settings.py
341
+ db.sqlite3
342
+ db.sqlite3-journal
343
+
344
+ # Flask stuff:
345
+ instance/
346
+ .webassets-cache
347
+
348
+ # Scrapy stuff:
349
+ .scrapy
350
+
351
+ # Sphinx documentation
352
+ docs/_build/
353
+
354
+ # PyBuilder
355
+ .pybuilder/
356
+ target/
357
+
358
+ # Jupyter Notebook
359
+ .ipynb_checkpoints
360
+
361
+ # IPython
362
+ profile_default/
363
+ ipython_config.py
364
+
365
+ # pyenv
366
+ # For a library or package, you might want to ignore these files since the code is
367
+ # intended to run in multiple environments; otherwise, check them in:
368
+ # .python-version
369
+
370
+ # pipenv
371
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
372
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
373
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
374
+ # install all needed dependencies.
375
+ #Pipfile.lock
376
+
377
+ # poetry
378
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
379
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
380
+ # commonly ignored for libraries.
381
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
382
+ #poetry.lock
383
+
384
+ # pdm
385
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
386
+ #pdm.lock
387
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
388
+ # in version control.
389
+ # https://pdm.fming.dev/#use-with-ide
390
+ .pdm.toml
391
+
392
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
393
+ __pypackages__/
394
+
395
+ # Celery stuff
396
+ celerybeat-schedule
397
+ celerybeat.pid
398
+
399
+ # SageMath parsed files
400
+ *.sage.py
401
+
402
+ # Environments
403
+ .env
404
+ .venv
405
+ env/
406
+ venv/
407
+ ENV/
408
+ env.bak/
409
+ venv.bak/
410
+
411
+ # Spyder project settings
412
+ .spyderproject
413
+ .spyproject
414
+
415
+ # Rope project settings
416
+ .ropeproject
417
+
418
+ # mkdocs documentation
419
+ /site
420
+
421
+ # mypy
422
+ .mypy_cache/
423
+ .dmypy.json
424
+ dmypy.json
425
+
426
+ # Pyre type checker
427
+ .pyre/
428
+
429
+ # pytype static type analyzer
430
+ .pytype/
431
+
432
+ # Cython debug symbols
433
+ cython_debug/
434
+
435
+ # PyCharm
436
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
437
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
438
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
439
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
440
+ #.idea/
441
+
442
+ ### Python Patch ###
443
+ # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
444
+ poetry.toml
445
+
446
+ # ruff
447
+ .ruff_cache/
448
+
449
+ # LSP config files
450
+ pyrightconfig.json
451
+
452
+ ### VisualStudioCode ###
453
+ .vscode/*
454
+ !.vscode/settings.json
455
+ !.vscode/tasks.json
456
+ !.vscode/launch.json
457
+ !.vscode/extensions.json
458
+ !.vscode/*.code-snippets
459
+
460
+ # Local History for Visual Studio Code
461
+ .history/
462
+
463
+ # Built Visual Studio Code Extensions
464
+ *.vsix
465
+
466
+ ### VisualStudioCode Patch ###
467
+ # Ignore all local history of files
468
+ .history
469
+ .ionide
470
+
471
+ ### Windows ###
472
+ # Windows thumbnail cache files
473
+ Thumbs.db
474
+ Thumbs.db:encryptable
475
+ ehthumbs.db
476
+ ehthumbs_vista.db
477
+
478
+ # Dump file
479
+ *.stackdump
480
+
481
+ # Folder config file
482
+ [Dd]esktop.ini
483
+
484
+ # Recycle Bin used on file shares
485
+ $RECYCLE.BIN/
486
+
487
+ # Windows Installer files
488
+ *.cab
489
+ *.msi
490
+ *.msix
491
+ *.msm
492
+ *.msp
493
+
494
+ # Windows shortcuts
495
+ *.lnk
496
+
497
+ # End of https://www.toptal.com/developers/gitignore/api/windows,python,pycharm+iml,pycharm+all,pycharm,macos,linux,visualstudiocode
498
+
499
+ # Custom rules (everything added below won't be overridden by 'Generate .gitignore File' if you use 'Update' option)
@@ -0,0 +1,22 @@
1
+ # WIP
2
+
3
+ * handle vedana-core's DataModel through ETL
4
+ * move Eval away from ETL, processing tests using JIMS & Backoffice now
5
+ * store embeddings in tables for PGVector
6
+
7
+ # 2025-09-19
8
+
9
+ * Refactored evaluation script into a Datapipe pipeline
10
+
11
+ # 2025-09-04
12
+
13
+ * Covered get_grist_data / filter_grist_data with tests
14
+ * More fixes in processing "reference" type columns for both links and attrs
15
+
16
+ # 2025-09-02
17
+
18
+ * Refactor to var-style Table declaration in pipeline
19
+
20
+ # 0.2.0
21
+
22
+ * Initial pipeline
@@ -0,0 +1,51 @@
1
+ Metadata-Version: 2.4
2
+ Name: vedana-etl
3
+ Version: 0.1.0.dev3
4
+ Summary: Pipeline template for Vedana
5
+ Author-email: Andrey Tatarinov <a@tatarinov.co>, Timur Sheydaev <tsheyd@epoch8.co>
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: alembic>=1.16.1
8
+ Requires-Dist: datapipe-app>=0.5.4
9
+ Requires-Dist: datapipe-core>=0.14.3
10
+ Requires-Dist: grist-api>=0.1.1
11
+ Requires-Dist: neo4j>=5.28.1
12
+ Requires-Dist: openai>=2.8.0
13
+ Requires-Dist: pandas>=1.2.0
14
+ Requires-Dist: pgvector>=0.4.2
15
+ Requires-Dist: pytest>=8.4.1
16
+ Requires-Dist: requests>=2.32.4
17
+ Requires-Dist: sqlalchemy>=2.0.41
18
+ Requires-Dist: vedana-core>=0.5.0
19
+ Description-Content-Type: text/markdown
20
+
21
+ # Basic pipeline for all vedana projects.
22
+
23
+ This pipeline:
24
+
25
+ - Parses Grist Data & Data Model
26
+ - Ensures that Memgraph index/vector index structure is in sync with data model
27
+ - Updates Memgraph database in incremental fashion
28
+
29
+ To add steps:
30
+ 1. Pass extra transformations to [get_pipeline](src/vedana_etl/pipeline.py)
31
+ 2. Create a new app configuration based on [app.py](src/vedana_etl/app.py)
32
+
33
+ ## Pipeline Labels Hierarchy
34
+
35
+ ### Pipeline
36
+
37
+ `labels=("pipeline", "pipeline_name")` defines a set of operations as standalone, sort of like a DAG in Airflow
38
+ or a Dagster Job. Its purpose is to be able to render it as a separate tab on the ETL page of Backoffice in order to
39
+ look at it independently of other transformations.
40
+
41
+ ### Stage
42
+
43
+ `labels=("stage", "stage_name")` defines a stage of `pipeline`. Currently, stages are useful for creating and managing
44
+ observability features, such as [main dashboard's](/libs/vedana-backoffice/vedana_backoffice/pages/main_dashboard.py)
45
+ Ingest table, which displays the DataTables of all transformations with `labels=("stage", "extract")`.
46
+ Stages are also useful when running the pipeline manually.
47
+
48
+ ### Flow
49
+
50
+ `labels=("flow", "flow_name")` helps execute a `pipeline` (or possibly several pipelines) in a nice fashion,
51
+ used in defining cron jobs, etc.
@@ -0,0 +1,31 @@
1
+ # Basic pipeline for all vedana projects.
2
+
3
+ This pipeline:
4
+
5
+ - Parses Grist Data & Data Model
6
+ - Ensures that Memgraph index/vector index structure is in sync with data model
7
+ - Updates Memgraph database in incremental fashion
8
+
9
+ To add steps:
10
+ 1. Pass extra transformations to [get_pipeline](src/vedana_etl/pipeline.py)
11
+ 2. Create a new app configuration based on [app.py](src/vedana_etl/app.py)
12
+
13
+ ## Pipeline Labels Hierarchy
14
+
15
+ ### Pipeline
16
+
17
+ `labels=("pipeline", "pipeline_name")` defines a set of operations as standalone, sort of like a DAG in Airflow
18
+ or a Dagster Job. Its purpose is to be able to render it as a separate tab on the ETL page of Backoffice in order to
19
+ look at it independently of other transformations.
20
+
21
+ ### Stage
22
+
23
+ `labels=("stage", "stage_name")` defines a stage of `pipeline`. Currently, stages are useful for creating and managing
24
+ observability features, such as [main dashboard's](/libs/vedana-backoffice/vedana_backoffice/pages/main_dashboard.py)
25
+ Ingest table, which displays the DataTables of all transformations with `labels=("stage", "extract")`.
26
+ Stages are also useful when running the pipeline manually.
27
+
28
+ ### Flow
29
+
30
+ `labels=("flow", "flow_name")` helps execute a `pipeline` (or possibly several pipelines) in a nice fashion,
31
+ used in defining cron jobs, etc.
@@ -0,0 +1,76 @@
1
+ [project]
2
+ name = "vedana-etl"
3
+ dynamic = ["version"]
4
+ description = "Pipeline template for Vedana"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "Andrey Tatarinov", email = "a@tatarinov.co" },
8
+ { name = "Timur Sheydaev", email = "tsheyd@epoch8.co" },
9
+ ]
10
+ requires-python = ">=3.12"
11
+ dependencies = [
12
+ "alembic>=1.16.1",
13
+ "datapipe-core>=0.14.3",
14
+ "datapipe-app>=0.5.4",
15
+ "pandas>=1.2.0 ",
16
+ "requests>=2.32.4",
17
+ "sqlalchemy>=2.0.41",
18
+ "pgvector>=0.4.2",
19
+ "grist-api>=0.1.1",
20
+ "neo4j>=5.28.1",
21
+ "openai>=2.8.0",
22
+ "vedana-core>=0.5.0",
23
+ "pytest>=8.4.1",
24
+ ]
25
+
26
+ [build-system]
27
+ requires = ["hatchling", "uv-dynamic-versioning"]
28
+ build-backend = "hatchling.build"
29
+
30
+ [tool.hatch.version]
31
+ source = "uv-dynamic-versioning"
32
+
33
+ [tool.uv-dynamic-versioning]
34
+ enable = true
35
+ vcs = "git"
36
+ pattern = "default"
37
+
38
+ [dependency-groups]
39
+ dev = [
40
+ "mypy>=1.19.0",
41
+ "pytest>=8.4.1",
42
+ "ruff>=0.14.10",
43
+ "types-pyyaml>=6.0.12.20250822",
44
+ ]
45
+
46
+ [tool.ruff]
47
+ line-length = 120
48
+
49
+ [tool.uv-workspace-codegen]
50
+ generate = true
51
+ template_type = ["lib", "publish"]
52
+ generate_standard_pytest_step = false
53
+ typechecker = "mypy"
54
+ custom_steps = """
55
+ - name: Start infra (Grist + Memgraph)
56
+ run: |
57
+ docker compose -f libs/vedana-etl/tests/infra/docker-compose.ci.yml up -d
58
+ echo "Waiting containers..."
59
+ docker ps
60
+
61
+ - name: Run unit + integration tests
62
+ env:
63
+ GRIST_SERVER_URL: "http://0.0.0.0:8484"
64
+ GRIST_API_KEY: "e30d2f274a538c05fecd14510887f8a3b7eab718"
65
+ GRIST_DATA_MODEL_DOC_ID: "wEEmPY3UiwMD"
66
+ GRIST_DATA_DOC_ID: "qAxQ1gcBKcW7"
67
+ GRIST_ORG_IN_PATH: "true"
68
+ GRIST_DEFAULT_EMAIL: "ci@example.com"
69
+ GRIST_SESSION_SECRET: "dev-secret"
70
+ MEMGRAPH_URI: "bolt://localhost:7687"
71
+ MEMGRAPH_USER: ""
72
+ MEMGRAPH_PWD: ""
73
+ EMBEDDINGS_DIM: "1536"
74
+ run: |
75
+ uv run pytest libs/vedana-etl/
76
+ """
File without changes
@@ -0,0 +1,10 @@
1
+ from datapipe.compute import Catalog
2
+ from datapipe_app import DatapipeAPI
3
+
4
+ from vedana_etl.config import ds
5
+ from vedana_etl.pipeline import default_custom_steps, get_pipeline
6
+
7
+ # base app - no extra tables / steps
8
+ pipeline = get_pipeline(custom_steps=default_custom_steps)
9
+
10
+ app = DatapipeAPI(ds, Catalog({}), pipeline)