datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
@@ -1,18 +1,11 @@
1
1
  import logging
2
2
  import os
3
3
  import sqlite3
4
- from collections.abc import Iterable, Sequence
4
+ from collections.abc import Callable, Iterable, Sequence
5
5
  from contextlib import contextmanager
6
6
  from functools import cached_property, wraps
7
7
  from time import sleep
8
- from typing import (
9
- TYPE_CHECKING,
10
- Any,
11
- Callable,
12
- ClassVar,
13
- Optional,
14
- Union,
15
- )
8
+ from typing import TYPE_CHECKING, Any, ClassVar, Union
16
9
 
17
10
  import sqlalchemy
18
11
  from sqlalchemy import (
@@ -27,16 +20,19 @@ from sqlalchemy import (
27
20
  from sqlalchemy.dialects import sqlite
28
21
  from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
29
22
  from sqlalchemy.sql import func
30
- from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList
23
+ from sqlalchemy.sql.elements import (
24
+ BinaryExpression,
25
+ BooleanClauseList,
26
+ )
31
27
  from sqlalchemy.sql.expression import bindparam, cast
32
28
  from sqlalchemy.sql.selectable import Select
33
29
  from tqdm.auto import tqdm
34
30
 
35
31
  import datachain.sql.sqlite
36
- from datachain import semver
37
32
  from datachain.data_storage import AbstractDBMetastore, AbstractWarehouse
38
33
  from datachain.data_storage.db_engine import DatabaseEngine
39
34
  from datachain.data_storage.schema import DefaultSchema
35
+ from datachain.data_storage.warehouse import INSERT_BATCH_SIZE
40
36
  from datachain.dataset import DatasetRecord, StorageURI
41
37
  from datachain.error import DataChainError, OutdatedDatabaseSchemaError
42
38
  from datachain.namespace import Namespace
@@ -44,9 +40,10 @@ from datachain.project import Project
44
40
  from datachain.sql.sqlite import create_user_defined_sql_functions, sqlite_dialect
45
41
  from datachain.sql.sqlite.base import load_usearch_extension
46
42
  from datachain.sql.types import SQLType
47
- from datachain.utils import DataChainDir, batched_it
43
+ from datachain.utils import DataChainDir, batched, batched_it
48
44
 
49
45
  if TYPE_CHECKING:
46
+ from sqlalchemy import CTE, Subquery
50
47
  from sqlalchemy.dialects.sqlite import Insert
51
48
  from sqlalchemy.engine.base import Engine
52
49
  from sqlalchemy.schema import SchemaItem
@@ -104,8 +101,8 @@ def retry_sqlite_locks(func):
104
101
 
105
102
 
106
103
  def get_db_file_in_memory(
107
- db_file: Optional[str] = None, in_memory: bool = False
108
- ) -> Optional[str]:
104
+ db_file: str | None = None, in_memory: bool = False
105
+ ) -> str | None:
109
106
  """Get in-memory db_file and check that conflicting arguments are not provided."""
110
107
  if in_memory:
111
108
  if db_file and db_file != ":memory:":
@@ -118,7 +115,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
118
115
  dialect = sqlite_dialect
119
116
 
120
117
  db: sqlite3.Connection
121
- db_file: Optional[str]
118
+ db_file: str | None
122
119
  is_closed: bool
123
120
 
124
121
  def __init__(
@@ -126,8 +123,8 @@ class SQLiteDatabaseEngine(DatabaseEngine):
126
123
  engine: "Engine",
127
124
  metadata: "MetaData",
128
125
  db: sqlite3.Connection,
129
- db_file: Optional[str] = None,
130
- max_variable_number: Optional[int] = 999,
126
+ db_file: str | None = None,
127
+ max_variable_number: int | None = 999,
131
128
  ):
132
129
  self.engine = engine
133
130
  self.metadata = metadata
@@ -137,12 +134,12 @@ class SQLiteDatabaseEngine(DatabaseEngine):
137
134
  self.max_variable_number = max_variable_number
138
135
 
139
136
  @classmethod
140
- def from_db_file(cls, db_file: Optional[str] = None) -> "SQLiteDatabaseEngine":
137
+ def from_db_file(cls, db_file: str | None = None) -> "SQLiteDatabaseEngine":
141
138
  return cls(*cls._connect(db_file=db_file))
142
139
 
143
140
  @staticmethod
144
141
  def _connect(
145
- db_file: Optional[str] = None,
142
+ db_file: str | None = None,
146
143
  ) -> tuple["Engine", "MetaData", sqlite3.Connection, str, int]:
147
144
  try:
148
145
  if db_file == ":memory:":
@@ -200,10 +197,14 @@ class SQLiteDatabaseEngine(DatabaseEngine):
200
197
  """
201
198
  return (
202
199
  SQLiteDatabaseEngine.from_db_file,
203
- [self.db_file],
200
+ [str(self.db_file)],
204
201
  {},
205
202
  )
206
203
 
204
+ @classmethod
205
+ def serialize_callable_name(cls) -> str:
206
+ return "sqlite.from_db_file"
207
+
207
208
  def _reconnect(self) -> None:
208
209
  if not self.is_closed:
209
210
  raise RuntimeError("Cannot reconnect on still-open DB!")
@@ -227,7 +228,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
227
228
  def execute(
228
229
  self,
229
230
  query,
230
- cursor: Optional[sqlite3.Cursor] = None,
231
+ cursor: sqlite3.Cursor | None = None,
231
232
  conn=None,
232
233
  ) -> sqlite3.Cursor:
233
234
  if self.is_closed:
@@ -246,7 +247,7 @@ class SQLiteDatabaseEngine(DatabaseEngine):
246
247
 
247
248
  @retry_sqlite_locks
248
249
  def executemany(
249
- self, query, params, cursor: Optional[sqlite3.Cursor] = None, conn=None
250
+ self, query, params, cursor: sqlite3.Cursor | None = None, conn=None
250
251
  ) -> sqlite3.Cursor:
251
252
  if cursor:
252
253
  return cursor.executemany(self.compile(query).string, params)
@@ -290,6 +291,8 @@ class SQLiteDatabaseEngine(DatabaseEngine):
290
291
  return self.db.cursor(factory)
291
292
 
292
293
  def close(self) -> None:
294
+ if self.is_closed:
295
+ return
293
296
  self.db.close()
294
297
  self.is_closed = True
295
298
 
@@ -326,7 +329,13 @@ class SQLiteDatabaseEngine(DatabaseEngine):
326
329
  query = "SELECT name FROM sqlite_master WHERE type='table';"
327
330
  return [r[0] for r in self.execute_str(query).fetchall()]
328
331
 
329
- def create_table(self, table: "Table", if_not_exists: bool = True) -> None:
332
+ def create_table(
333
+ self,
334
+ table: "Table",
335
+ if_not_exists: bool = True,
336
+ *,
337
+ kind: str | None = None,
338
+ ) -> None:
330
339
  self.execute(CreateTable(table, if_not_exists=if_not_exists))
331
340
 
332
341
  def drop_table(self, table: "Table", if_exists: bool = False) -> None:
@@ -346,13 +355,13 @@ class SQLiteMetastore(AbstractDBMetastore):
346
355
 
347
356
  META_TABLE = "meta"
348
357
 
349
- db: "SQLiteDatabaseEngine"
358
+ db: SQLiteDatabaseEngine
350
359
 
351
360
  def __init__(
352
361
  self,
353
- uri: Optional[StorageURI] = None,
354
- db: Optional["SQLiteDatabaseEngine"] = None,
355
- db_file: Optional[str] = None,
362
+ uri: StorageURI | None = None,
363
+ db: SQLiteDatabaseEngine | None = None,
364
+ db_file: str | None = None,
356
365
  in_memory: bool = False,
357
366
  ):
358
367
  uri = uri or StorageURI("")
@@ -367,11 +376,12 @@ class SQLiteMetastore(AbstractDBMetastore):
367
376
 
368
377
  self.db = db or SQLiteDatabaseEngine.from_db_file(db_file)
369
378
 
370
- self._init_meta_table()
371
- self._init_meta_schema_value()
372
- self._check_schema_version()
373
- self._init_tables()
374
- self._init_namespaces_projects()
379
+ with self._init_guard():
380
+ self._init_meta_table()
381
+ self._init_meta_schema_value()
382
+ self._check_schema_version()
383
+ self._init_tables()
384
+ self._init_namespaces_projects()
375
385
 
376
386
  def __exit__(self, exc_type, exc_value, traceback) -> None:
377
387
  """Close connection upon exit from context manager."""
@@ -379,7 +389,7 @@ class SQLiteMetastore(AbstractDBMetastore):
379
389
 
380
390
  def clone(
381
391
  self,
382
- uri: Optional[StorageURI] = None,
392
+ uri: StorageURI | None = None,
383
393
  use_new_connection: bool = False,
384
394
  ) -> "SQLiteMetastore":
385
395
  uri = uri or StorageURI("")
@@ -402,6 +412,10 @@ class SQLiteMetastore(AbstractDBMetastore):
402
412
  },
403
413
  )
404
414
 
415
+ @classmethod
416
+ def serialize_callable_name(cls) -> str:
417
+ return "sqlite.metastore.init_after_clone"
418
+
405
419
  @classmethod
406
420
  def init_after_clone(
407
421
  cls,
@@ -458,6 +472,10 @@ class SQLiteMetastore(AbstractDBMetastore):
458
472
  self.default_table_names.append(self._datasets_dependencies.name)
459
473
  self.db.create_table(self._jobs, if_not_exists=True)
460
474
  self.default_table_names.append(self._jobs.name)
475
+ self.db.create_table(self._checkpoints, if_not_exists=True)
476
+ self.default_table_names.append(self._checkpoints.name)
477
+ self.db.create_table(self._dataset_version_jobs, if_not_exists=True)
478
+ self.default_table_names.append(self._dataset_version_jobs.name)
461
479
 
462
480
  def _init_namespaces_projects(self) -> None:
463
481
  """
@@ -535,6 +553,26 @@ class SQLiteMetastore(AbstractDBMetastore):
535
553
  self._datasets_versions.c.created_at,
536
554
  ]
537
555
 
556
+ def _dataset_dependency_nodes_select_columns(
557
+ self,
558
+ namespaces_subquery: "Subquery",
559
+ dependency_tree_cte: "CTE",
560
+ datasets_subquery: "Subquery",
561
+ ) -> list["ColumnElement"]:
562
+ return [
563
+ namespaces_subquery.c.name,
564
+ self._projects.c.name,
565
+ dependency_tree_cte.c.id,
566
+ dependency_tree_cte.c.dataset_id,
567
+ dependency_tree_cte.c.dataset_version_id,
568
+ datasets_subquery.c.name,
569
+ self._datasets_versions.c.version,
570
+ self._datasets_versions.c.created_at,
571
+ dependency_tree_cte.c.source_dataset_id,
572
+ dependency_tree_cte.c.source_dataset_version_id,
573
+ dependency_tree_cte.c.depth,
574
+ ]
575
+
538
576
  #
539
577
  # Jobs
540
578
  #
@@ -542,6 +580,15 @@ class SQLiteMetastore(AbstractDBMetastore):
542
580
  def _jobs_insert(self) -> "Insert":
543
581
  return sqlite.insert(self._jobs)
544
582
 
583
+ #
584
+ # Checkpoints
585
+ #
586
+ def _checkpoints_insert(self) -> "Insert":
587
+ return sqlite.insert(self._checkpoints)
588
+
589
+ def _dataset_version_jobs_insert(self) -> "Insert":
590
+ return sqlite.insert(self._dataset_version_jobs)
591
+
545
592
  #
546
593
  # Namespaces
547
594
  #
@@ -565,15 +612,15 @@ class SQLiteWarehouse(AbstractWarehouse):
565
612
  This is currently used for the local cli.
566
613
  """
567
614
 
568
- db: "SQLiteDatabaseEngine"
615
+ db: SQLiteDatabaseEngine
569
616
 
570
617
  # Cache for our defined column types to dialect specific TypeEngine relations
571
618
  _col_python_type: ClassVar[dict[type, "TypeEngine"]] = {}
572
619
 
573
620
  def __init__(
574
621
  self,
575
- db: Optional["SQLiteDatabaseEngine"] = None,
576
- db_file: Optional[str] = None,
622
+ db: SQLiteDatabaseEngine | None = None,
623
+ db_file: str | None = None,
577
624
  in_memory: bool = False,
578
625
  ):
579
626
  self.schema: DefaultSchema = DefaultSchema()
@@ -601,6 +648,10 @@ class SQLiteWarehouse(AbstractWarehouse):
601
648
  {"db_clone_params": self.db.clone_params()},
602
649
  )
603
650
 
651
+ @classmethod
652
+ def serialize_callable_name(cls) -> str:
653
+ return "sqlite.warehouse.init_after_clone"
654
+
604
655
  @classmethod
605
656
  def init_after_clone(
606
657
  cls,
@@ -624,7 +675,7 @@ class SQLiteWarehouse(AbstractWarehouse):
624
675
  only=filter_tables,
625
676
  )
626
677
 
627
- def is_ready(self, timeout: Optional[int] = None) -> bool:
678
+ def is_ready(self, timeout: int | None = None) -> bool:
628
679
  return True
629
680
 
630
681
  def create_dataset_rows_table(
@@ -654,77 +705,24 @@ class SQLiteWarehouse(AbstractWarehouse):
654
705
  for row in self.db.execute(query, cursor=cur)
655
706
  ]
656
707
 
657
- def merge_dataset_rows(
658
- self,
659
- src: DatasetRecord,
660
- dst: DatasetRecord,
661
- src_version: str,
662
- dst_version: str,
663
- ) -> None:
664
- dst_empty = False
665
-
666
- if not self.db.has_table(self.dataset_table_name(src, src_version)):
667
- # source table doesn't exist, nothing to do
668
- return
669
-
670
- src_dr = self.dataset_rows(src, src_version).table
671
-
672
- if not self.db.has_table(self.dataset_table_name(dst, dst_version)):
673
- # destination table doesn't exist, create it
674
- self.create_dataset_rows_table(
675
- self.dataset_table_name(dst, dst_version),
676
- columns=src_dr.columns,
677
- )
678
- dst_empty = True
679
-
680
- dst_dr = self.dataset_rows(dst, dst_version).table
681
- merge_fields = [c.name for c in src_dr.columns if c.name != "sys__id"]
682
- select_src = select(*(getattr(src_dr.columns, f) for f in merge_fields))
683
-
684
- if dst_empty:
685
- # we don't need union, but just select from source to destination
686
- insert_query = sqlite.insert(dst_dr).from_select(merge_fields, select_src)
687
- else:
688
- dst_version_latest = None
689
- # find the previous version of the destination dataset
690
- dst_previous_versions = [
691
- v.version
692
- for v in dst.versions # type: ignore [union-attr]
693
- if semver.compare(v.version, dst_version) == -1
694
- ]
695
- if dst_previous_versions:
696
- dst_version_latest = max(dst_previous_versions)
697
-
698
- dst_dr_latest = self.dataset_rows(dst, dst_version_latest).table
699
-
700
- select_dst_latest = select(
701
- *(getattr(dst_dr_latest.c, f) for f in merge_fields)
702
- )
703
- union_query = sqlalchemy.union(select_src, select_dst_latest)
704
- insert_query = (
705
- sqlite.insert(dst_dr)
706
- .from_select(merge_fields, union_query)
707
- .prefix_with("OR IGNORE")
708
- )
709
-
710
- self.db.execute(insert_query)
711
-
712
708
  def prepare_entries(self, entries: "Iterable[File]") -> Iterable[dict[str, Any]]:
713
709
  return (e.model_dump() for e in entries)
714
710
 
715
- def insert_rows(self, table: Table, rows: Iterable[dict[str, Any]]) -> None:
716
- rows = list(rows)
717
- if not rows:
718
- return
719
-
720
- with self.db.transaction() as conn:
721
- # transactions speeds up inserts significantly as there is no separate
722
- # transaction created for each insert row
723
- self.db.executemany(
724
- table.insert().values({f: bindparam(f) for f in rows[0]}),
725
- rows,
726
- conn=conn,
727
- )
711
+ def insert_rows(
712
+ self,
713
+ table: Table,
714
+ rows: Iterable[dict[str, Any]],
715
+ batch_size: int = INSERT_BATCH_SIZE,
716
+ ) -> None:
717
+ for row_chunk in batched(rows, batch_size):
718
+ with self.db.transaction() as conn:
719
+ # transactions speeds up inserts significantly as there is no separate
720
+ # transaction created for each insert row
721
+ self.db.executemany(
722
+ table.insert().values({f: bindparam(f) for f in row_chunk[0]}),
723
+ row_chunk,
724
+ conn=conn,
725
+ )
728
726
 
729
727
  def insert_dataset_rows(self, df, dataset: DatasetRecord, version: str) -> int:
730
728
  dr = self.dataset_rows(dataset, version)
@@ -768,7 +766,7 @@ class SQLiteWarehouse(AbstractWarehouse):
768
766
  self,
769
767
  table: Table,
770
768
  query: Select,
771
- progress_cb: Optional[Callable[[int], None]] = None,
769
+ progress_cb: Callable[[int], None] | None = None,
772
770
  ) -> None:
773
771
  col_id = (
774
772
  query.selected_columns.sys__id
@@ -797,7 +795,7 @@ class SQLiteWarehouse(AbstractWarehouse):
797
795
  .limit(None)
798
796
  )
799
797
 
800
- for batch in batched_it(ids, 10_000):
798
+ for batch in batched_it(ids, INSERT_BATCH_SIZE):
801
799
  batch_ids = [row[0] for row in batch]
802
800
  select_q._where_criteria = (col_id.in_(batch_ids),)
803
801
  q = table.insert().from_select(list(select_q.selected_columns), select_q)
@@ -852,18 +850,20 @@ class SQLiteWarehouse(AbstractWarehouse):
852
850
  if isinstance(c, BinaryExpression):
853
851
  right_left_join = add_left_rows_filter(c)
854
852
 
855
- union = sqlalchemy.union(left_right_join, right_left_join).subquery()
856
- return sqlalchemy.select(*union.c).select_from(union)
853
+ union_cte = sqlalchemy.union(left_right_join, right_left_join).cte()
854
+ return sqlalchemy.select(*union_cte.c).select_from(union_cte)
855
+
856
+ def _system_row_number_expr(self):
857
+ return func.row_number().over()
858
+
859
+ def _system_random_expr(self):
860
+ return self._system_row_number_expr() * 1103515245 + 12345
857
861
 
858
862
  def create_pre_udf_table(self, query: "Select") -> "Table":
859
863
  """
860
864
  Create a temporary table from a query for use in a UDF.
861
865
  """
862
- columns = [
863
- sqlalchemy.Column(c.name, c.type)
864
- for c in query.selected_columns
865
- if c.name != "sys__id"
866
- ]
866
+ columns = [sqlalchemy.Column(c.name, c.type) for c in query.selected_columns]
867
867
  table = self.create_udf_table(columns)
868
868
 
869
869
  with tqdm(desc="Preparing", unit=" rows", leave=False) as pbar: