datachain 0.3.15__py3-none-any.whl → 0.3.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/data_storage/sqlite.py +19 -0
- datachain/data_storage/warehouse.py +19 -3
- datachain/query/dataset.py +38 -19
- {datachain-0.3.15.dist-info → datachain-0.3.16.dist-info}/METADATA +1 -1
- {datachain-0.3.15.dist-info → datachain-0.3.16.dist-info}/RECORD +9 -9
- {datachain-0.3.15.dist-info → datachain-0.3.16.dist-info}/WHEEL +1 -1
- {datachain-0.3.15.dist-info → datachain-0.3.16.dist-info}/LICENSE +0 -0
- {datachain-0.3.15.dist-info → datachain-0.3.16.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.15.dist-info → datachain-0.3.16.dist-info}/top_level.txt +0 -0
datachain/data_storage/sqlite.py
CHANGED
|
@@ -40,7 +40,9 @@ if TYPE_CHECKING:
|
|
|
40
40
|
from sqlalchemy.dialects.sqlite import Insert
|
|
41
41
|
from sqlalchemy.engine.base import Engine
|
|
42
42
|
from sqlalchemy.schema import SchemaItem
|
|
43
|
+
from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
|
|
43
44
|
from sqlalchemy.sql.elements import ColumnElement
|
|
45
|
+
from sqlalchemy.sql.selectable import Join
|
|
44
46
|
from sqlalchemy.types import TypeEngine
|
|
45
47
|
|
|
46
48
|
from datachain.lib.file import File
|
|
@@ -788,6 +790,23 @@ class SQLiteWarehouse(AbstractWarehouse):
|
|
|
788
790
|
if progress_cb:
|
|
789
791
|
progress_cb(len(batch_ids))
|
|
790
792
|
|
|
793
|
+
def join(
    self,
    left: "_FromClauseArgument",
    right: "_FromClauseArgument",
    onclause: "_OnClauseArgument",
    inner: bool = True,
) -> "Join":
    """
    Build a SQLAlchemy join of ``left`` and ``right``.

    Args:
        left: Left-hand side of the join.
        right: Right-hand side of the join.
        onclause: Expression used as the ON condition.
        inner: When True (the default) an inner join is produced; when
            False the join is built with ``isouter=True``.

    Returns:
        The join construct produced by ``sqlalchemy.join``.
    """
    # SQLAlchemy expresses the join kind through ``isouter``, which is the
    # inverse of this method's ``inner`` flag.
    is_outer = not inner
    return sqlalchemy.join(left, right, onclause, isouter=is_outer)
|
|
809
|
+
|
|
791
810
|
def create_pre_udf_table(self, query: "Select") -> "Table":
|
|
792
811
|
"""
|
|
793
812
|
Create a temporary table from a query for use in a UDF.
|
|
datachain/data_storage/warehouse.py
CHANGED
|
@@ -27,8 +27,12 @@ from datachain.storage import StorageURI
|
|
|
27
27
|
from datachain.utils import sql_escape_like
|
|
28
28
|
|
|
29
29
|
if TYPE_CHECKING:
|
|
30
|
-
from sqlalchemy.sql._typing import _ColumnsClauseArgument
|
|
31
|
-
from sqlalchemy.sql.selectable import Select
|
30
|
+
from sqlalchemy.sql._typing import (
|
|
31
|
+
_ColumnsClauseArgument,
|
|
32
|
+
_FromClauseArgument,
|
|
33
|
+
_OnClauseArgument,
|
|
34
|
+
)
|
|
35
|
+
from sqlalchemy.sql.selectable import Join, Select
|
|
32
36
|
from sqlalchemy.types import TypeEngine
|
|
33
37
|
|
|
34
38
|
from datachain.data_storage import AbstractIDGenerator, schema
|
|
@@ -894,6 +898,18 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
894
898
|
Copy the results of a query into a table.
|
|
895
899
|
"""
|
|
896
900
|
|
|
901
|
+
@abstractmethod
def join(
    self,
    left: "_FromClauseArgument",
    right: "_FromClauseArgument",
    onclause: "_OnClauseArgument",
    inner: bool = True,
) -> "Join":
    """
    Join two tables together.

    Abstract hook: each warehouse implementation supplies its own way of
    building the join construct.

    Args:
        left: Left-hand side of the join.
        right: Right-hand side of the join.
        onclause: Expression used as the ON condition.
        inner: Whether to build an inner join (the default). The SQLite
            implementation maps ``inner=False`` to ``isouter=True``.

    Returns:
        The join construct for ``left`` and ``right``.
    """
|
|
912
|
+
|
|
897
913
|
@abstractmethod
|
|
898
914
|
def create_pre_udf_table(self, query: "Select") -> "Table":
|
|
899
915
|
"""
|
|
@@ -922,7 +938,7 @@ class AbstractWarehouse(ABC, Serializable):
|
|
|
922
938
|
are cleaned up as soon as they are no longer needed.
|
|
923
939
|
"""
|
|
924
940
|
with tqdm(desc="Cleanup", unit=" tables") as pbar:
|
|
925
|
-
for name in names:
|
|
941
|
+
for name in set(names):
|
|
926
942
|
self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
|
|
927
943
|
pbar.update(1)
|
|
928
944
|
|
datachain/query/dataset.py
CHANGED
|
@@ -33,7 +33,6 @@ from sqlalchemy.sql.elements import ColumnClause, ColumnElement
|
|
|
33
33
|
from sqlalchemy.sql.expression import label
|
|
34
34
|
from sqlalchemy.sql.schema import TableClause
|
|
35
35
|
from sqlalchemy.sql.selectable import Select
|
|
36
|
-
from tqdm import tqdm
|
|
37
36
|
|
|
38
37
|
from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
|
|
39
38
|
from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
|
|
@@ -899,12 +898,36 @@ class SQLUnion(Step):
|
|
|
899
898
|
|
|
900
899
|
@frozen
|
|
901
900
|
class SQLJoin(Step):
|
|
901
|
+
catalog: "Catalog"
|
|
902
902
|
query1: "DatasetQuery"
|
|
903
903
|
query2: "DatasetQuery"
|
|
904
904
|
predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
|
|
905
905
|
inner: bool
|
|
906
906
|
rname: str
|
|
907
907
|
|
|
908
|
+
def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
    """
    Turn one side of the join into a subquery named after its table.

    The steps of ``dq`` are applied and every temp table they created is
    recorded in ``temp_tables``. If ``dq`` has no join or union step, the
    resulting select is wrapped as a subquery directly. Otherwise the
    select is first copied into a fresh temp table (also recorded in
    ``temp_tables``) and the subquery reads from that table instead.

    Args:
        dq: Dataset query forming one side of the join.
        temp_tables: Running list of temp table names; extended in place.

    Returns:
        A subquery named ``dq.table.name``.
    """
    selected = dq.apply_steps().select()
    temp_tables.extend(dq.temp_table_names)

    has_join_or_union = any(
        isinstance(step, (SQLJoin, SQLUnion)) for step in dq.steps
    )
    if not has_join_or_union:
        return selected.subquery(dq.table.name)

    # NOTE(review): presumably the result is materialized so that chained
    # joins/unions do not nest inside one another in the final SQL - confirm.
    warehouse = self.catalog.warehouse

    # Reuse genuine Column objects; rebuild anything else from name and type.
    table_columns = []
    for col in selected.subquery().columns:
        if not isinstance(col, Column):
            col = Column(col.name, col.type)
        table_columns.append(col)

    materialized = warehouse.create_dataset_rows_table(
        warehouse.temp_table_name(),
        columns=table_columns,
    )
    # Record the table before filling it, so it is already listed in
    # temp_tables if the copy fails.
    temp_tables.append(materialized.name)

    warehouse.copy_table(materialized, selected)

    return materialized.select().subquery(dq.table.name)
|
|
930
|
+
|
|
908
931
|
def validate_expression(self, exp: "ClauseElement", q1, q2):
|
|
909
932
|
"""
|
|
910
933
|
Checking if columns used in expression actually exist in left / right
|
|
@@ -937,10 +960,8 @@ class SQLJoin(Step):
|
|
|
937
960
|
def apply(
|
|
938
961
|
self, query_generator: QueryGenerator, temp_tables: list[str]
|
|
939
962
|
) -> StepResult:
|
|
940
|
-
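q1 = self.query1.apply_steps().select().subquery(self.query1.table.name)
|
|
941
|
-
temp_tables.extend(self.query1.temp_table_names)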
|
942
|
-
q2 = self.query2.apply_steps().select().subquery(self.query2.table.name)
|
|
943
|
-
temp_tables.extend(self.query2.temp_table_names)
|
|
963
|
+
q1 = self.get_query(self.query1, temp_tables)
|
|
964
|
+
q2 = self.get_query(self.query2, temp_tables)
|
|
944
965
|
|
|
945
966
|
q1_columns = list(q1.c)
|
|
946
967
|
q1_column_names = {c.name for c in q1_columns}
|
|
@@ -951,7 +972,12 @@ class SQLJoin(Step):
|
|
|
951
972
|
continue
|
|
952
973
|
|
|
953
974
|
if c.name in q1_column_names:
|
|
954
|
-
|
|
975
|
+
new_name = self.rname.format(name=c.name)
|
|
976
|
+
new_name_idx = 0
|
|
977
|
+
while new_name in q1_column_names:
|
|
978
|
+
new_name_idx += 1
|
|
979
|
+
new_name = self.rname.format(name=f"{c.name}_{new_name_idx}")
|
|
980
|
+
c = c.label(new_name)
|
|
955
981
|
q2_columns.append(c)
|
|
956
982
|
|
|
957
983
|
res_columns = q1_columns + q2_columns
|
|
@@ -979,16 +1005,14 @@ class SQLJoin(Step):
|
|
|
979
1005
|
self.validate_expression(join_expression, q1, q2)
|
|
980
1006
|
|
|
981
1007
|
def q(*columns):
|
|
982
|
-
join_query =
|
|
1008
|
+
join_query = self.catalog.warehouse.join(
|
|
983
1009
|
q1,
|
|
984
1010
|
q2,
|
|
985
1011
|
join_expression,
|
|
986
|
-
|
|
1012
|
+
inner=self.inner,
|
|
987
1013
|
)
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
subquery = res.subquery()
|
|
991
|
-
return sqlalchemy.select(*subquery.c).select_from(subquery)
|
|
1014
|
+
return sqlalchemy.select(*columns).select_from(join_query)
|
|
1015
|
+
# return sqlalchemy.select(*subquery.c).select_from(subquery)
|
|
992
1016
|
|
|
993
1017
|
return step_result(
|
|
994
1018
|
q,
|
|
@@ -1511,7 +1535,7 @@ class DatasetQuery:
|
|
|
1511
1535
|
if isinstance(predicates, (str, ColumnClause, ColumnElement))
|
|
1512
1536
|
else tuple(predicates)
|
|
1513
1537
|
)
|
|
1514
|
-
new_query.steps = [SQLJoin(left, right, predicates, inner, rname)]
|
|
1538
|
+
new_query.steps = [SQLJoin(self.catalog, left, right, predicates, inner, rname)]
|
|
1515
1539
|
return new_query
|
|
1516
1540
|
|
|
1517
1541
|
@detach
|
|
@@ -1687,12 +1711,7 @@ class DatasetQuery:
|
|
|
1687
1711
|
|
|
1688
1712
|
dr = self.catalog.warehouse.dataset_rows(dataset)
|
|
1689
1713
|
|
|
1690
|
-
|
|
1691
|
-
self.catalog.warehouse.copy_table(
|
|
1692
|
-
dr.get_table(),
|
|
1693
|
-
query.select(),
|
|
1694
|
-
progress_cb=pbar.update,
|
|
1695
|
-
)
|
|
1714
|
+
self.catalog.warehouse.copy_table(dr.get_table(), query.select())
|
|
1696
1715
|
|
|
1697
1716
|
self.catalog.metastore.update_dataset_status(
|
|
1698
1717
|
dataset, DatasetStatus.COMPLETE, version=version
|
|
{datachain-0.3.15.dist-info → datachain-0.3.16.dist-info}/RECORD
CHANGED
|
@@ -35,8 +35,8 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
|
|
|
35
35
|
datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
|
|
36
36
|
datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
|
|
37
37
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
38
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
39
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
38
|
+
datachain/data_storage/sqlite.py,sha256=3OehNpYb4WJYt4RhPxZrQn9UL1yiHX7Fp1W53o-Y1NA,28788
|
|
39
|
+
datachain/data_storage/warehouse.py,sha256=g_yWXpw5iC-VYi8gH0ctDlwO3Mo6AT-32j3Nw6TFgqw,32857
|
|
40
40
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
41
|
datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
|
|
42
42
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
@@ -70,7 +70,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
|
|
|
70
70
|
datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
|
|
71
71
|
datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
|
|
72
72
|
datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
|
|
73
|
-
datachain/query/dataset.py,sha256=
|
|
73
|
+
datachain/query/dataset.py,sha256=tBmAlcz6orJbKWkcvGVE4wom-EWInFaXHJYMSpVZnhA,58892
|
|
74
74
|
datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
|
|
75
75
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
76
76
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
@@ -97,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
|
|
|
97
97
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
98
98
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
99
99
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
100
|
-
datachain-0.3.
|
|
101
|
-
datachain-0.3.
|
|
102
|
-
datachain-0.3.
|
|
103
|
-
datachain-0.3.
|
|
104
|
-
datachain-0.3.
|
|
105
|
-
datachain-0.3.
|
|
100
|
+
datachain-0.3.16.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
101
|
+
datachain-0.3.16.dist-info/METADATA,sha256=EjMy4f4OVbwVttlWRzzXRLr-uAEAGNMPMmge96_CI2o,17073
|
|
102
|
+
datachain-0.3.16.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
103
|
+
datachain-0.3.16.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
104
|
+
datachain-0.3.16.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
105
|
+
datachain-0.3.16.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|