datachain 0.3.15__py3-none-any.whl → 0.3.16__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -40,7 +40,9 @@ if TYPE_CHECKING:
40
40
  from sqlalchemy.dialects.sqlite import Insert
41
41
  from sqlalchemy.engine.base import Engine
42
42
  from sqlalchemy.schema import SchemaItem
43
+ from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
43
44
  from sqlalchemy.sql.elements import ColumnElement
45
+ from sqlalchemy.sql.selectable import Join
44
46
  from sqlalchemy.types import TypeEngine
45
47
 
46
48
  from datachain.lib.file import File
@@ -788,6 +790,23 @@ class SQLiteWarehouse(AbstractWarehouse):
788
790
  if progress_cb:
789
791
  progress_cb(len(batch_ids))
790
792
 
793
+ def join(
794
+ self,
795
+ left: "_FromClauseArgument",
796
+ right: "_FromClauseArgument",
797
+ onclause: "_OnClauseArgument",
798
+ inner: bool = True,
799
+ ) -> "Join":
800
+ """
801
+ Join two tables together.
802
+ """
803
+ return sqlalchemy.join(
804
+ left,
805
+ right,
806
+ onclause,
807
+ isouter=not inner,
808
+ )
809
+
791
810
  def create_pre_udf_table(self, query: "Select") -> "Table":
792
811
  """
793
812
  Create a temporary table from a query for use in a UDF.
@@ -27,8 +27,12 @@ from datachain.storage import StorageURI
27
27
  from datachain.utils import sql_escape_like
28
28
 
29
29
  if TYPE_CHECKING:
30
- from sqlalchemy.sql._typing import _ColumnsClauseArgument
31
- from sqlalchemy.sql.selectable import Select
30
+ from sqlalchemy.sql._typing import (
31
+ _ColumnsClauseArgument,
32
+ _FromClauseArgument,
33
+ _OnClauseArgument,
34
+ )
35
+ from sqlalchemy.sql.selectable import Join, Select
32
36
  from sqlalchemy.types import TypeEngine
33
37
 
34
38
  from datachain.data_storage import AbstractIDGenerator, schema
@@ -894,6 +898,18 @@ class AbstractWarehouse(ABC, Serializable):
894
898
  Copy the results of a query into a table.
895
899
  """
896
900
 
901
+ @abstractmethod
902
+ def join(
903
+ self,
904
+ left: "_FromClauseArgument",
905
+ right: "_FromClauseArgument",
906
+ onclause: "_OnClauseArgument",
907
+ inner: bool = True,
908
+ ) -> "Join":
909
+ """
910
+ Join two tables together.
911
+ """
912
+
897
913
  @abstractmethod
898
914
  def create_pre_udf_table(self, query: "Select") -> "Table":
899
915
  """
@@ -922,7 +938,7 @@ class AbstractWarehouse(ABC, Serializable):
922
938
  are cleaned up as soon as they are no longer needed.
923
939
  """
924
940
  with tqdm(desc="Cleanup", unit=" tables") as pbar:
925
- for name in names:
941
+ for name in set(names):
926
942
  self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
927
943
  pbar.update(1)
928
944
 
@@ -33,7 +33,6 @@ from sqlalchemy.sql.elements import ColumnClause, ColumnElement
33
33
  from sqlalchemy.sql.expression import label
34
34
  from sqlalchemy.sql.schema import TableClause
35
35
  from sqlalchemy.sql.selectable import Select
36
- from tqdm import tqdm
37
36
 
38
37
  from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
39
38
  from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
@@ -899,12 +898,36 @@ class SQLUnion(Step):
899
898
 
900
899
  @frozen
901
900
  class SQLJoin(Step):
901
+ catalog: "Catalog"
902
902
  query1: "DatasetQuery"
903
903
  query2: "DatasetQuery"
904
904
  predicates: Union[JoinPredicateType, tuple[JoinPredicateType, ...]]
905
905
  inner: bool
906
906
  rname: str
907
907
 
908
+ def get_query(self, dq: "DatasetQuery", temp_tables: list[str]) -> sa.Subquery:
909
+ query = dq.apply_steps().select()
910
+ temp_tables.extend(dq.temp_table_names)
911
+
912
+ if not any(isinstance(step, (SQLJoin, SQLUnion)) for step in dq.steps):
913
+ return query.subquery(dq.table.name)
914
+
915
+ warehouse = self.catalog.warehouse
916
+
917
+ columns = [
918
+ c if isinstance(c, Column) else Column(c.name, c.type)
919
+ for c in query.subquery().columns
920
+ ]
921
+ temp_table = warehouse.create_dataset_rows_table(
922
+ warehouse.temp_table_name(),
923
+ columns=columns,
924
+ )
925
+ temp_tables.append(temp_table.name)
926
+
927
+ warehouse.copy_table(temp_table, query)
928
+
929
+ return temp_table.select().subquery(dq.table.name)
930
+
908
931
  def validate_expression(self, exp: "ClauseElement", q1, q2):
909
932
  """
910
933
  Checking if columns used in expression actually exist in left / right
@@ -937,10 +960,8 @@ class SQLJoin(Step):
937
960
  def apply(
938
961
  self, query_generator: QueryGenerator, temp_tables: list[str]
939
962
  ) -> StepResult:
940
- q1 = self.query1.apply_steps().select().subquery(self.query1.table.name)
941
- temp_tables.extend(self.query1.temp_table_names)
942
- q2 = self.query2.apply_steps().select().subquery(self.query2.table.name)
943
- temp_tables.extend(self.query2.temp_table_names)
963
+ q1 = self.get_query(self.query1, temp_tables)
964
+ q2 = self.get_query(self.query2, temp_tables)
944
965
 
945
966
  q1_columns = list(q1.c)
946
967
  q1_column_names = {c.name for c in q1_columns}
@@ -951,7 +972,12 @@ class SQLJoin(Step):
951
972
  continue
952
973
 
953
974
  if c.name in q1_column_names:
954
- c = c.label(self.rname.format(name=c.name))
975
+ new_name = self.rname.format(name=c.name)
976
+ new_name_idx = 0
977
+ while new_name in q1_column_names:
978
+ new_name_idx += 1
979
+ new_name = self.rname.format(name=f"{c.name}_{new_name_idx}")
980
+ c = c.label(new_name)
955
981
  q2_columns.append(c)
956
982
 
957
983
  res_columns = q1_columns + q2_columns
@@ -979,16 +1005,14 @@ class SQLJoin(Step):
979
1005
  self.validate_expression(join_expression, q1, q2)
980
1006
 
981
1007
  def q(*columns):
982
- join_query = sqlalchemy.join(
1008
+ join_query = self.catalog.warehouse.join(
983
1009
  q1,
984
1010
  q2,
985
1011
  join_expression,
986
- isouter=not self.inner,
1012
+ inner=self.inner,
987
1013
  )
988
-
989
- res = sqlalchemy.select(*columns).select_from(join_query)
990
- subquery = res.subquery()
991
- return sqlalchemy.select(*subquery.c).select_from(subquery)
1014
+ return sqlalchemy.select(*columns).select_from(join_query)
1015
+ # return sqlalchemy.select(*subquery.c).select_from(subquery)
992
1016
 
993
1017
  return step_result(
994
1018
  q,
@@ -1511,7 +1535,7 @@ class DatasetQuery:
1511
1535
  if isinstance(predicates, (str, ColumnClause, ColumnElement))
1512
1536
  else tuple(predicates)
1513
1537
  )
1514
- new_query.steps = [SQLJoin(left, right, predicates, inner, rname)]
1538
+ new_query.steps = [SQLJoin(self.catalog, left, right, predicates, inner, rname)]
1515
1539
  return new_query
1516
1540
 
1517
1541
  @detach
@@ -1687,12 +1711,7 @@ class DatasetQuery:
1687
1711
 
1688
1712
  dr = self.catalog.warehouse.dataset_rows(dataset)
1689
1713
 
1690
- with tqdm(desc="Saving", unit=" rows") as pbar:
1691
- self.catalog.warehouse.copy_table(
1692
- dr.get_table(),
1693
- query.select(),
1694
- progress_cb=pbar.update,
1695
- )
1714
+ self.catalog.warehouse.copy_table(dr.get_table(), query.select())
1696
1715
 
1697
1716
  self.catalog.metastore.update_dataset_status(
1698
1717
  dataset, DatasetStatus.COMPLETE, version=version
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.15
3
+ Version: 0.3.16
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -35,8 +35,8 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
35
35
  datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
36
36
  datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
37
37
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
38
- datachain/data_storage/sqlite.py,sha256=yooLHQXrpoqDguGlF0SGcCiMU1T82OEc4wr1ra8eBHo,28285
39
- datachain/data_storage/warehouse.py,sha256=Pq6Nt3fyz1WFv6Mdtv2ZUr0_GFCNbafbtS4PdibblUg,32507
38
+ datachain/data_storage/sqlite.py,sha256=3OehNpYb4WJYt4RhPxZrQn9UL1yiHX7Fp1W53o-Y1NA,28788
39
+ datachain/data_storage/warehouse.py,sha256=g_yWXpw5iC-VYi8gH0ctDlwO3Mo6AT-32j3Nw6TFgqw,32857
40
40
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
41
  datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
42
42
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
@@ -70,7 +70,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
70
70
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
71
71
  datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
72
72
  datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
73
- datachain/query/dataset.py,sha256=9lhcgccavqypVParE4pvd_Hgg8gmoDAN6m1IkpSwXhE,58219
73
+ datachain/query/dataset.py,sha256=tBmAlcz6orJbKWkcvGVE4wom-EWInFaXHJYMSpVZnhA,58892
74
74
  datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
75
75
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
76
76
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -97,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
97
97
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
98
98
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
99
99
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
100
- datachain-0.3.15.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
- datachain-0.3.15.dist-info/METADATA,sha256=E3jImGtRTyvMPTSqFsgwhsHsnZn_9SRVeThmrDXRuf0,17073
102
- datachain-0.3.15.dist-info/WHEEL,sha256=5Mi1sN9lKoFv_gxcPtisEVrJZihrm_beibeg5R6xb4I,91
103
- datachain-0.3.15.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
- datachain-0.3.15.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
- datachain-0.3.15.dist-info/RECORD,,
100
+ datachain-0.3.16.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
+ datachain-0.3.16.dist-info/METADATA,sha256=EjMy4f4OVbwVttlWRzzXRLr-uAEAGNMPMmge96_CI2o,17073
102
+ datachain-0.3.16.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
103
+ datachain-0.3.16.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
+ datachain-0.3.16.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
+ datachain-0.3.16.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.0.0)
2
+ Generator: setuptools (75.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5