datachain-0.7.1-py3-none-any.whl → datachain-0.7.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (42)
  1. datachain/__init__.py +0 -2
  2. datachain/catalog/catalog.py +8 -6
  3. datachain/cli.py +1 -1
  4. datachain/client/fsspec.py +9 -9
  5. datachain/data_storage/schema.py +2 -2
  6. datachain/data_storage/sqlite.py +5 -4
  7. datachain/data_storage/warehouse.py +18 -18
  8. datachain/func/__init__.py +49 -0
  9. datachain/{lib/func → func}/aggregate.py +13 -11
  10. datachain/func/array.py +176 -0
  11. datachain/func/base.py +23 -0
  12. datachain/func/conditional.py +81 -0
  13. datachain/func/func.py +384 -0
  14. datachain/func/path.py +110 -0
  15. datachain/func/random.py +23 -0
  16. datachain/func/string.py +154 -0
  17. datachain/func/window.py +49 -0
  18. datachain/lib/arrow.py +24 -12
  19. datachain/lib/data_model.py +25 -9
  20. datachain/lib/dataset_info.py +2 -2
  21. datachain/lib/dc.py +94 -56
  22. datachain/lib/hf.py +1 -1
  23. datachain/lib/signal_schema.py +1 -1
  24. datachain/lib/utils.py +1 -0
  25. datachain/lib/webdataset_laion.py +5 -5
  26. datachain/model/bbox.py +2 -2
  27. datachain/model/pose.py +5 -5
  28. datachain/model/segment.py +2 -2
  29. datachain/nodes_fetcher.py +2 -2
  30. datachain/query/dataset.py +57 -34
  31. datachain/sql/__init__.py +0 -2
  32. datachain/sql/functions/__init__.py +0 -26
  33. datachain/sql/selectable.py +11 -5
  34. datachain/sql/sqlite/base.py +11 -2
  35. {datachain-0.7.1.dist-info → datachain-0.7.2.dist-info}/METADATA +1 -1
  36. {datachain-0.7.1.dist-info → datachain-0.7.2.dist-info}/RECORD +40 -33
  37. datachain/lib/func/__init__.py +0 -32
  38. datachain/lib/func/func.py +0 -152
  39. {datachain-0.7.1.dist-info → datachain-0.7.2.dist-info}/LICENSE +0 -0
  40. {datachain-0.7.1.dist-info → datachain-0.7.2.dist-info}/WHEEL +0 -0
  41. {datachain-0.7.1.dist-info → datachain-0.7.2.dist-info}/entry_points.txt +0 -0
  42. {datachain-0.7.1.dist-info → datachain-0.7.2.dist-info}/top_level.txt +0 -0
datachain/query/dataset.py CHANGED
@@ -43,9 +43,10 @@ from datachain.data_storage.schema import (
43
43
  )
44
44
  from datachain.dataset import DatasetStatus, RowDict
45
45
  from datachain.error import DatasetNotFoundError, QueryScriptCancelError
46
+ from datachain.func.base import Function
46
47
  from datachain.lib.udf import UDFAdapter
47
48
  from datachain.progress import CombinedDownloadCallback
48
- from datachain.sql.functions import rand
49
+ from datachain.sql.functions.random import rand
49
50
  from datachain.utils import (
50
51
  batched,
51
52
  determine_processes,
@@ -65,15 +66,16 @@ if TYPE_CHECKING:
65
66
  from datachain.catalog import Catalog
66
67
  from datachain.data_storage import AbstractWarehouse
67
68
  from datachain.dataset import DatasetRecord
68
-
69
- from .udf import UDFResult
69
+ from datachain.lib.udf import UDFResult
70
70
 
71
71
  P = ParamSpec("P")
72
72
 
73
73
 
74
74
  INSERT_BATCH_SIZE = 10000
75
75
 
76
- PartitionByType = Union[ColumnElement, Sequence[ColumnElement]]
76
+ PartitionByType = Union[
77
+ Function, ColumnElement, Sequence[Union[Function, ColumnElement]]
78
+ ]
77
79
  JoinPredicateType = Union[str, ColumnClause, ColumnElement]
78
80
  DatasetDependencyType = tuple[str, int]
79
81
 
@@ -457,18 +459,15 @@ class UDFStep(Step, ABC):
457
459
  # Run the UDFDispatcher in another process to avoid needing
458
460
  # if __name__ == '__main__': in user scripts
459
461
  exec_cmd = get_datachain_executable()
462
+ cmd = [*exec_cmd, "internal-run-udf"]
460
463
  envs = dict(os.environ)
461
464
  envs.update({"PYTHONPATH": os.getcwd()})
462
465
  process_data = filtered_cloudpickle_dumps(udf_info)
463
- result = subprocess.run( # noqa: S603
464
- [*exec_cmd, "internal-run-udf"],
465
- input=process_data,
466
- check=False,
467
- env=envs,
468
- )
469
- if result.returncode != 0:
470
- raise RuntimeError("UDF Execution Failed!")
471
466
 
467
+ with subprocess.Popen(cmd, env=envs, stdin=subprocess.PIPE) as process: # noqa: S603
468
+ process.communicate(process_data)
469
+ if process.poll():
470
+ raise RuntimeError("UDF Execution Failed!")
472
471
  else:
473
472
  # Otherwise process single-threaded (faster for smaller UDFs)
474
473
  warehouse = self.catalog.warehouse
@@ -520,13 +519,17 @@ class UDFStep(Step, ABC):
520
519
  else:
521
520
  list_partition_by = [self.partition_by]
522
521
 
522
+ partition_by = [
523
+ p.get_column() if isinstance(p, Function) else p for p in list_partition_by
524
+ ]
525
+
523
526
  # create table with partitions
524
527
  tbl = self.catalog.warehouse.create_udf_table(partition_columns())
525
528
 
526
529
  # fill table with partitions
527
530
  cols = [
528
531
  query.selected_columns.sys__id,
529
- f.dense_rank().over(order_by=list_partition_by).label(PARTITION_COLUMN_ID),
532
+ f.dense_rank().over(order_by=partition_by).label(PARTITION_COLUMN_ID),
530
533
  ]
531
534
  self.catalog.warehouse.db.execute(
532
535
  tbl.insert().from_select(cols, query.with_only_columns(*cols))
@@ -683,6 +686,12 @@ class SQLClause(Step, ABC):
683
686
 
684
687
  return step_result(q, new_query.selected_columns)
685
688
 
689
+ def parse_cols(
690
+ self,
691
+ cols: Sequence[Union[Function, ColumnElement]],
692
+ ) -> tuple[ColumnElement, ...]:
693
+ return tuple(c.get_column() if isinstance(c, Function) else c for c in cols)
694
+
686
695
  @abstractmethod
687
696
  def apply_sql_clause(self, query):
688
697
  pass
@@ -690,12 +699,14 @@ class SQLClause(Step, ABC):
690
699
 
691
700
  @frozen
692
701
  class SQLSelect(SQLClause):
693
- args: tuple[Union[str, ColumnElement], ...]
702
+ args: tuple[Union[Function, ColumnElement], ...]
694
703
 
695
704
  def apply_sql_clause(self, query) -> Select:
696
705
  subquery = query.subquery()
697
-
698
- args = [subquery.c[str(c)] if isinstance(c, (str, C)) else c for c in self.args]
706
+ args = [
707
+ subquery.c[str(c)] if isinstance(c, (str, C)) else c
708
+ for c in self.parse_cols(self.args)
709
+ ]
699
710
  if not args:
700
711
  args = subquery.c
701
712
 
@@ -704,22 +715,25 @@ class SQLSelect(SQLClause):
704
715
 
705
716
  @frozen
706
717
  class SQLSelectExcept(SQLClause):
707
- args: tuple[str, ...]
718
+ args: tuple[Union[Function, ColumnElement], ...]
708
719
 
709
720
  def apply_sql_clause(self, query: Select) -> Select:
710
721
  subquery = query.subquery()
711
- names = set(self.args)
712
- args = [c for c in subquery.c if c.name not in names]
722
+ args = [c for c in subquery.c if c.name not in set(self.parse_cols(self.args))]
713
723
  return sqlalchemy.select(*args).select_from(subquery)
714
724
 
715
725
 
716
726
  @frozen
717
727
  class SQLMutate(SQLClause):
718
- args: tuple[ColumnElement, ...]
728
+ args: tuple[Union[Function, ColumnElement], ...]
719
729
 
720
730
  def apply_sql_clause(self, query: Select) -> Select:
721
731
  original_subquery = query.subquery()
722
- to_mutate = {c.name for c in self.args}
732
+ args = [
733
+ original_subquery.c[str(c)] if isinstance(c, (str, C)) else c
734
+ for c in self.parse_cols(self.args)
735
+ ]
736
+ to_mutate = {c.name for c in args}
723
737
 
724
738
  prefix = f"mutate{token_hex(8)}_"
725
739
  cols = [
@@ -729,9 +743,7 @@ class SQLMutate(SQLClause):
729
743
  # this is needed for new column to be used in clauses
730
744
  # like ORDER BY, otherwise new column is not recognized
731
745
  subquery = (
732
- sqlalchemy.select(*cols, *self.args)
733
- .select_from(original_subquery)
734
- .subquery()
746
+ sqlalchemy.select(*cols, *args).select_from(original_subquery).subquery()
735
747
  )
736
748
 
737
749
  return sqlalchemy.select(*subquery.c).select_from(subquery)
@@ -739,21 +751,24 @@ class SQLMutate(SQLClause):
739
751
 
740
752
  @frozen
741
753
  class SQLFilter(SQLClause):
742
- expressions: tuple[ColumnElement, ...]
754
+ expressions: tuple[Union[Function, ColumnElement], ...]
743
755
 
744
756
  def __and__(self, other):
745
- return self.__class__(self.expressions + other)
757
+ expressions = self.parse_cols(self.expressions)
758
+ return self.__class__(expressions + other)
746
759
 
747
760
  def apply_sql_clause(self, query: Select) -> Select:
748
- return query.filter(*self.expressions)
761
+ expressions = self.parse_cols(self.expressions)
762
+ return query.filter(*expressions)
749
763
 
750
764
 
751
765
  @frozen
752
766
  class SQLOrderBy(SQLClause):
753
- args: tuple[ColumnElement, ...]
767
+ args: tuple[Union[Function, ColumnElement], ...]
754
768
 
755
769
  def apply_sql_clause(self, query: Select) -> Select:
756
- return query.order_by(*self.args)
770
+ args = self.parse_cols(self.args)
771
+ return query.order_by(*args)
757
772
 
758
773
 
759
774
  @frozen
@@ -948,8 +963,8 @@ class SQLJoin(Step):
948
963
 
949
964
  @frozen
950
965
  class SQLGroupBy(SQLClause):
951
- cols: Sequence[Union[str, ColumnElement]]
952
- group_by: Sequence[Union[str, ColumnElement]]
966
+ cols: Sequence[Union[str, Function, ColumnElement]]
967
+ group_by: Sequence[Union[str, Function, ColumnElement]]
953
968
 
954
969
  def apply_sql_clause(self, query) -> Select:
955
970
  if not self.cols:
@@ -959,12 +974,20 @@ class SQLGroupBy(SQLClause):
959
974
 
960
975
  subquery = query.subquery()
961
976
 
977
+ group_by = [
978
+ c.get_column() if isinstance(c, Function) else c for c in self.group_by
979
+ ]
980
+
962
981
  cols = [
963
- subquery.c[str(c)] if isinstance(c, (str, C)) else c
964
- for c in [*self.group_by, *self.cols]
982
+ c.get_column()
983
+ if isinstance(c, Function)
984
+ else subquery.c[str(c)]
985
+ if isinstance(c, (str, C))
986
+ else c
987
+ for c in (*group_by, *self.cols)
965
988
  ]
966
989
 
967
- return sqlalchemy.select(*cols).select_from(subquery).group_by(*self.group_by)
990
+ return sqlalchemy.select(*cols).select_from(subquery).group_by(*group_by)
968
991
 
969
992
 
970
993
  def _validate_columns(
datachain/sql/__init__.py CHANGED
@@ -1,13 +1,11 @@
1
1
  from sqlalchemy.sql.elements import literal
2
2
  from sqlalchemy.sql.expression import column
3
3
 
4
- from . import functions
5
4
  from .default import setup as default_setup
6
5
  from .selectable import select, values
7
6
 
8
7
  __all__ = [
9
8
  "column",
10
- "functions",
11
9
  "literal",
12
10
  "select",
13
11
  "values",
datachain/sql/functions/__init__.py CHANGED
@@ -1,26 +0,0 @@
1
- from sqlalchemy.sql.expression import func
2
-
3
- from . import array, path, string
4
- from .aggregate import avg
5
- from .conditional import greatest, least
6
- from .random import rand
7
-
8
- count = func.count
9
- sum = func.sum
10
- min = func.min
11
- max = func.max
12
-
13
- __all__ = [
14
- "array",
15
- "avg",
16
- "count",
17
- "func",
18
- "greatest",
19
- "least",
20
- "max",
21
- "min",
22
- "path",
23
- "rand",
24
- "string",
25
- "sum",
26
- ]
datachain/sql/selectable.py CHANGED
@@ -9,7 +9,9 @@ class Values(selectable.Values):
9
9
  columns = [expression.column(f"c{i}") for i in range(1, num_columns + 1)]
10
10
  else:
11
11
  columns = [
12
- expression.column(c) if isinstance(c, str) else c for c in columns
12
+ process_column_expression(c)
13
+ for c in columns
14
+ # expression.column(c) if isinstance(c, str) else c for c in columns
13
15
  ]
14
16
  super().__init__(*columns, **kwargs)
15
17
  self._data += tuple(data)
@@ -19,13 +21,17 @@ def values(data, columns=None, **kwargs) -> Values:
19
21
  return Values(data, columns=columns, **kwargs)
20
22
 
21
23
 
22
- def process_column_expressions(columns):
23
- return [expression.column(c) if isinstance(c, str) else c for c in columns]
24
+ def process_column_expression(col):
25
+ if hasattr(col, "get_column"):
26
+ return col.get_column()
27
+ if isinstance(col, str):
28
+ return expression.column(col)
29
+ return col
24
30
 
25
31
 
26
32
  def select(*columns, **kwargs) -> "expression.Select":
27
- columns = process_column_expressions(columns)
28
- return expression.select(*columns, **kwargs)
33
+ columns_processed = [process_column_expression(c) for c in columns]
34
+ return expression.select(*columns_processed, **kwargs)
29
35
 
30
36
 
31
37
  def base_values_compiler(column_name_func, element, compiler, **kwargs):
datachain/sql/sqlite/base.py CHANGED
@@ -1,6 +1,7 @@
1
1
  import logging
2
2
  import re
3
3
  import sqlite3
4
+ import warnings
4
5
  from collections.abc import Iterable
5
6
  from datetime import MAXYEAR, MINYEAR, datetime, timezone
6
7
  from types import MappingProxyType
@@ -418,14 +419,22 @@ def compile_collect(element, compiler, **kwargs):
418
419
  return compiler.process(func.json_group_array(*element.clauses.clauses), **kwargs)
419
420
 
420
421
 
421
- def load_usearch_extension(conn) -> bool:
422
+ def load_usearch_extension(conn: sqlite3.Connection) -> bool:
422
423
  try:
423
424
  # usearch is part of the vector optional dependencies
424
425
  # we use the extension's cosine and euclidean distance functions
425
426
  from usearch import sqlite_path
426
427
 
427
428
  conn.enable_load_extension(True)
428
- conn.load_extension(sqlite_path())
429
+
430
+ with warnings.catch_warnings():
431
+ # usearch binary is not available for Windows, see: https://github.com/unum-cloud/usearch/issues/427.
432
+ # and, sometimes fail to download the binary in other platforms
433
+ # triggering UserWarning.
434
+
435
+ warnings.filterwarnings("ignore", category=UserWarning, module="usearch")
436
+ conn.load_extension(sqlite_path())
437
+
429
438
  conn.enable_load_extension(False)
430
439
  return True
431
440
 
{datachain-0.7.1.dist-info → datachain-0.7.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.7.1
3
+ Version: 0.7.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
{datachain-0.7.1.dist-info → datachain-0.7.2.dist-info}/RECORD CHANGED
@@ -1,8 +1,8 @@
1
- datachain/__init__.py,sha256=OGzc8xZWtwqxiiutjU4AxCRPY0lrX_csgERiTrq4G0o,908
1
+ datachain/__init__.py,sha256=ofPJ6B-d-ybSDRrE7J6wqF_ZRAB2W9U8l-eeuBtqPLg,865
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=5aKrjnUxk0mtnZeFKNJd1DCE0MsnSoyJBZkr0y9H_a0,9313
4
4
  datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
5
- datachain/cli.py,sha256=hdVt_HJumQVgtaBAtBVJm-uPyYVogMXNVLmRcZyWHgk,36677
5
+ datachain/cli.py,sha256=weZDEj4Kkgi9vqzqJdQcX_jSymSINHbbZjjTqu1RHa4,36685
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
8
8
  datachain/dataset.py,sha256=0IN-5y723y-bnFlieKtOFZLCjwX_yplFo3q0DV7LRPw,14821
@@ -10,7 +10,7 @@ datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
10
10
  datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
11
11
  datachain/listing.py,sha256=TgKg25ZWAP5enzKgw2_2GUPJVdnQUh6uySHB5SJrUY4,7773
12
12
  datachain/node.py,sha256=o8Sqy92QkzzcLK6XmIFLyDSE6Rw6kUTmGRhEmfLFdhg,5211
13
- datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
13
+ datachain/nodes_fetcher.py,sha256=ILMzUW5o4_6lUOVrLDC9gJPCXfcgKnMG68plrc7dAOA,1113
14
14
  datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
15
15
  datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
16
16
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -18,13 +18,13 @@ datachain/studio.py,sha256=6kxF7VxPAbh9D7_Bk8_SghS5OXrwUwSpDaw19eNCTP4,4083
18
18
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
19
19
  datachain/utils.py,sha256=-mSFowjIidJ4_sMXInvNHLn4rK_QnHuIlLuH1_lMGmI,13897
20
20
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
21
- datachain/catalog/catalog.py,sha256=J1nUWLI4RYCvvR6fB4neQBtB7V-CTh4PM71irhNmJc4,57817
21
+ datachain/catalog/catalog.py,sha256=sWljYCIpvUR3eCeYg4GTZXfyn5ropZVkfEPocc9m7KE,57941
22
22
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
23
23
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
24
24
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
25
25
  datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
26
26
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
27
- datachain/client/fsspec.py,sha256=Ai5m7alkAnv-RWXuLbZ95SKEPaQ3Pyk5ujDy50JDX5w,12692
27
+ datachain/client/fsspec.py,sha256=KDGLhJMnive73hI8GABeP_aQZv1w5M_6rxz6KRRxaHI,12712
28
28
  datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
29
29
  datachain/client/hf.py,sha256=XeVJVbiNViZCpn3sfb90Fr8SYO3BdLmfE3hOWMoqInE,951
30
30
  datachain/client/local.py,sha256=vwbgCwZ7IqY2voj2l7tLJjgov7Dp--fEUvUwUBsMbls,4457
@@ -34,18 +34,28 @@ datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kT
34
34
  datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
35
35
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
36
36
  datachain/data_storage/metastore.py,sha256=5b7o_CSHC2djottebYn-Hq5q0yaSLOKPIRCnaVRvjsU,36056
37
- datachain/data_storage/schema.py,sha256=scANMQqozita3HjEtq7eupMgh6yYkrZHoXtfuL2RoQg,9879
37
+ datachain/data_storage/schema.py,sha256=-QVlRvD0dfu-ZFUxylEoSnLJLnleMEjVlcAb2OGu-AY,9895
38
38
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
39
- datachain/data_storage/sqlite.py,sha256=CspRUlYsIcubgzvcQxTACnmcuKESSLZcqCl0dcrtRiA,27471
40
- datachain/data_storage/warehouse.py,sha256=yXNU0U3exzR1E6dqbYYmL4RhXWsbYWVdZ3jONGcVniY,30914
39
+ datachain/data_storage/sqlite.py,sha256=nF-2B-n8YZh9cJlZv4XnbahAJDW6pvrp1h9L-140M7A,27538
40
+ datachain/data_storage/warehouse.py,sha256=kFLhYEFkpsfl65Lr1c4t4HJt3nO1Ez_QQ76aQNN30fc,30966
41
+ datachain/func/__init__.py,sha256=4VUt5BaLdBAl_BnAku0Jb8plqd7kDOiYrQTMG3pN0c4,794
42
+ datachain/func/aggregate.py,sha256=7_IPrIwb2XSs3zG4iOr1eTvzn6kNVe2mkzvNzjusDHk,10942
43
+ datachain/func/array.py,sha256=zHDNWuWLA7HVa9FEvQeHhVi00_xqenyleTqcLwkXWBI,5477
44
+ datachain/func/base.py,sha256=wA0sBQAVyN9LPxoo7Ox83peS0zUVnyuKxukwAcjGLfY,534
45
+ datachain/func/conditional.py,sha256=mQroxsoExpBW84Zm5dAYP4OpBblWmzfnF2qJq9rba54,2223
46
+ datachain/func/func.py,sha256=9wqdxxisoDL0w8qKGQmL6sNdgJeIOzotEUPlxu9t2IQ,12326
47
+ datachain/func/path.py,sha256=mqN_mfkwv44z2II7DMTp_fGGw95hmTCNls_TOFNpr4k,3155
48
+ datachain/func/random.py,sha256=pENOLj9rSmWfGCnOsUIaCsVC5486zQb66qfQvXaz9Z4,452
49
+ datachain/func/string.py,sha256=NQzaXXYu7yb72HPADy4WrFlcgvTS77L9x7-qvCKJtnk,4522
50
+ datachain/func/window.py,sha256=0MB1yjpVbwOrl_WNLZ8V3jkJz3o0XlYinpAcZQJuxiA,1688
41
51
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- datachain/lib/arrow.py,sha256=-hu9tic79a01SY2UBqkA3U6wUr6tnE3T3q5q_BnO93A,9156
52
+ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
43
53
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
44
- datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
45
- datachain/lib/dataset_info.py,sha256=q0EW9tj5jXGSD9Lzct9zbH4P1lfIGd_cIWqhnMxv7Q0,2464
46
- datachain/lib/dc.py,sha256=u0RQJPG0zwxsoYS-4wrbDBPuLYZajwIi1YX37khKfkI,87942
54
+ datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
55
+ datachain/lib/dataset_info.py,sha256=3APfNYMWizIwXhgRYpMQKSeVntNAvQuBbbB25dV7mgY,2460
56
+ datachain/lib/dc.py,sha256=J7liATKQBJCkeHanVLr0s3d1t5wxiiiSJuSbuxKBbLg,89527
47
57
  datachain/lib/file.py,sha256=-XMkL6ED1sE7TMhWoMRTEuOXswZJw8X6AEmJDONFP74,15019
48
- datachain/lib/hf.py,sha256=BW2NPpqxkpPwkSaGlppT8Rbs8zPpyYC-tR6htY08c-0,5817
58
+ datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
49
59
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
50
60
  datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
51
61
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
@@ -53,35 +63,32 @@ datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU
53
63
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
54
64
  datachain/lib/pytorch.py,sha256=W-ARi2xH1f1DUkVfRuerW-YWYgSaJASmNCxtz2lrJGI,6072
55
65
  datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
56
- datachain/lib/signal_schema.py,sha256=xwkE5bxJxUhZTjrA6jqN87XbSXPikCbL6eOPL9WyrKM,24556
66
+ datachain/lib/signal_schema.py,sha256=_uh19nCKhiD9ua8oIN1Q8R9iYv1BZAuqTJCLYVmyW8k,24557
57
67
  datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
58
68
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
59
69
  datachain/lib/udf.py,sha256=-j0krjNAELTqRI0dB1N65AmawtcIY5vN---AuUcW8Us,13637
60
70
  datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
61
- datachain/lib/utils.py,sha256=6NwgWLl5JrgtD4rsSFEe-yR2ntEwJMJEtAZ3FIxK3fg,1529
71
+ datachain/lib/utils.py,sha256=om-MCiyYwvPHtFq3V2rBKrRDNkio9XXofj7RsUIlHKU,1586
62
72
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
73
  datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
64
- datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
74
+ datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
65
75
  datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
66
76
  datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
67
77
  datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
68
78
  datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
69
79
  datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
70
80
  datachain/lib/convert/values_to_tuples.py,sha256=varRCnSMT_pZmHznrd2Yi05qXLLz_v9YH_pOCpHSkdc,3921
71
- datachain/lib/func/__init__.py,sha256=wlAKhGV0QDg9y7reSwoUF8Vicfqh_YOUNIXLzxICGz4,403
72
- datachain/lib/func/aggregate.py,sha256=H1ziFQdaK9zvnxvttfnEzkkyGvEEmMAvmgCsBV6nfm8,10917
73
- datachain/lib/func/func.py,sha256=HAJZ_tpiRG2R-et7pr0WnoyNZYtpbPn3_HBuL3RQpbU,4800
74
81
  datachain/model/__init__.py,sha256=R9faX5OHV1xh2EW-g2MPedwbtEqt3LodJRyluB-QylI,189
75
- datachain/model/bbox.py,sha256=LLtzc8OiL-cxqqlPWXA4MFTo8HRS3GW2gOxA0Sf_cxI,3158
76
- datachain/model/pose.py,sha256=0URrnS99Ugq0yspCXC2z-hgpybEA5tWLJXpxqVLnAlI,3088
77
- datachain/model/segment.py,sha256=dMxtm-05fNseEoEKpZj9iDN7fwGK1udyAreN-V-cRks,1597
82
+ datachain/model/bbox.py,sha256=1Li1G3RdiQwLOAc2Mak2nQU0bcvdH-lXmXtA984CUWM,3154
83
+ datachain/model/pose.py,sha256=q9NgB8h66aKnYnLi7Pyf9bU-F_90W4cbvtSO3-_hkdk,3078
84
+ datachain/model/segment.py,sha256=iRWf0KieXfSM1eGD9Y7THx8L_EMB79Sk8WVebs3xSbQ,1593
78
85
  datachain/model/ultralytics/__init__.py,sha256=EvcNX9qUyxKXXlKCPpsXeRrabyXk5E9EkN-tyiYkfS4,750
79
86
  datachain/model/ultralytics/bbox.py,sha256=OZ9XBdyMOYc401P-RhfSN9QaYvMpnx2Phu9ptaJgZBY,4316
80
87
  datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15pEPtX5A,2959
81
88
  datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
82
89
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
83
90
  datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
84
- datachain/query/dataset.py,sha256=sQny-ZemB2HueC4mPg-7qSaqUD85MMO-DQyVVP8K1CA,53765
91
+ datachain/query/dataset.py,sha256=bQVG4WnJfBQpvnxouIdDlsJF2gB8V4lDp4Zu9JeZ-rc,54771
85
92
  datachain/query/dispatch.py,sha256=wjjTWw6sFQbB9SKRh78VbfvwSMgJXCfqJklS3-9KnCU,12025
86
93
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
87
94
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -90,13 +97,13 @@ datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,660
90
97
  datachain/query/session.py,sha256=50SOdLNCjqHHKI-L4xGXyzTVxzMWfANqKqjeYre-c2k,5959
91
98
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
92
99
  datachain/remote/studio.py,sha256=g88kHdlRhmruiWwoIxq_JJoymZUrtMAL937NWQyWyXI,9209
93
- datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
94
- datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
100
+ datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
101
+ datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
95
102
  datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
96
103
  datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
97
104
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
98
105
  datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
99
- datachain/sql/functions/__init__.py,sha256=-vIkU0AqwOW5FX6P89xYl-uBIUdt46CEnCtshmN85gM,400
106
+ datachain/sql/functions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
100
107
  datachain/sql/functions/aggregate.py,sha256=3AQdA8YHPFdtCEfwZKQXTT8SlQWdG9gD5PBtGN3Odqs,944
101
108
  datachain/sql/functions/array.py,sha256=rvH27SWN9gdh_mFnp0GIiXuCrNW6n8ZbY4I_JUS-_e0,1140
102
109
  datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
@@ -104,15 +111,15 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
104
111
  datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
105
112
  datachain/sql/functions/string.py,sha256=DYgiw8XSk7ge7GXvyRI1zbaMruIizNeI-puOjriQGZQ,1148
106
113
  datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
107
- datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,14375
114
+ datachain/sql/sqlite/base.py,sha256=X4iEynOAqqvqz8lmgUKvURleKO6aguULgG8RoufKrSk,14772
108
115
  datachain/sql/sqlite/types.py,sha256=lPXS1XbkmUtlkkiRxy_A_UzsgpPv2VSkXYOD4zIHM4w,1734
109
116
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
110
117
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
111
118
  datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
112
119
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
113
- datachain-0.7.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
114
- datachain-0.7.1.dist-info/METADATA,sha256=9ICI9nDBKNq39JJR2q_RxuYBCFkUD4o81T2FEO8LKDU,18006
115
- datachain-0.7.1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
116
- datachain-0.7.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
117
- datachain-0.7.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
118
- datachain-0.7.1.dist-info/RECORD,,
120
+ datachain-0.7.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
121
+ datachain-0.7.2.dist-info/METADATA,sha256=FuinZ-OIUuKz_b26-eirZl4hJdHJ4oOa8MO-LxzGywc,18006
122
+ datachain-0.7.2.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
123
+ datachain-0.7.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
124
+ datachain-0.7.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
125
+ datachain-0.7.2.dist-info/RECORD,,
datachain/lib/func/__init__.py DELETED
@@ -1,32 +0,0 @@
1
- from .aggregate import (
2
- any_value,
3
- avg,
4
- collect,
5
- concat,
6
- count,
7
- dense_rank,
8
- first,
9
- max,
10
- min,
11
- rank,
12
- row_number,
13
- sum,
14
- )
15
- from .func import Func, window
16
-
17
- __all__ = [
18
- "Func",
19
- "any_value",
20
- "avg",
21
- "collect",
22
- "concat",
23
- "count",
24
- "dense_rank",
25
- "first",
26
- "max",
27
- "min",
28
- "rank",
29
- "row_number",
30
- "sum",
31
- "window",
32
- ]
datachain/lib/func/func.py DELETED
@@ -1,152 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import TYPE_CHECKING, Callable, Optional
3
-
4
- from sqlalchemy import desc
5
-
6
- from datachain.lib.convert.python_to_sql import python_to_sql
7
- from datachain.lib.utils import DataChainColumnError, DataChainParamsError
8
- from datachain.query.schema import Column, ColumnMeta
9
-
10
- if TYPE_CHECKING:
11
- from datachain import DataType
12
- from datachain.lib.signal_schema import SignalSchema
13
-
14
-
15
- @dataclass
16
- class Window:
17
- """Represents a window specification for SQL window functions."""
18
-
19
- partition_by: str
20
- order_by: str
21
- desc: bool = False
22
-
23
-
24
- def window(partition_by: str, order_by: str, desc: bool = False) -> Window:
25
- """
26
- Defines a window specification for SQL window functions.
27
-
28
- The `window` function specifies how to partition and order the result set
29
- for the associated window function. It is used to define the scope of the rows
30
- that the window function will operate on.
31
-
32
- Args:
33
- partition_by (str): The column name by which to partition the result set.
34
- Rows with the same value in the partition column
35
- will be grouped together for the window function.
36
- order_by (str): The column name by which to order the rows
37
- within each partition. This determines the sequence in which
38
- the window function is applied.
39
- desc (bool, optional): If True, the rows will be ordered in descending order.
40
- Defaults to False, which orders the rows
41
- in ascending order.
42
-
43
- Returns:
44
- Window: A Window object representing the window specification.
45
-
46
- Example:
47
- ```py
48
- window = func.window(partition_by="signal.category", order_by="created_at")
49
- dc.mutate(
50
- row_number=func.row_number().over(window),
51
- )
52
- ```
53
- """
54
- return Window(
55
- ColumnMeta.to_db_name(partition_by),
56
- ColumnMeta.to_db_name(order_by),
57
- desc,
58
- )
59
-
60
-
61
- class Func:
62
- """Represents a function to be applied to a column in a SQL query."""
63
-
64
- def __init__(
65
- self,
66
- name: str,
67
- inner: Callable,
68
- col: Optional[str] = None,
69
- result_type: Optional["DataType"] = None,
70
- is_array: bool = False,
71
- is_window: bool = False,
72
- window: Optional[Window] = None,
73
- ) -> None:
74
- self.name = name
75
- self.inner = inner
76
- self.col = col
77
- self.result_type = result_type
78
- self.is_array = is_array
79
- self.is_window = is_window
80
- self.window = window
81
-
82
- def __str__(self) -> str:
83
- return self.name + "()"
84
-
85
- def over(self, window: Window) -> "Func":
86
- if not self.is_window:
87
- raise DataChainParamsError(f"{self} doesn't support window (over())")
88
-
89
- return Func(
90
- "over",
91
- self.inner,
92
- self.col,
93
- self.result_type,
94
- self.is_array,
95
- self.is_window,
96
- window,
97
- )
98
-
99
- @property
100
- def db_col(self) -> Optional[str]:
101
- return ColumnMeta.to_db_name(self.col) if self.col else None
102
-
103
- def db_col_type(self, signals_schema: "SignalSchema") -> Optional["DataType"]:
104
- if not self.db_col:
105
- return None
106
- col_type: type = signals_schema.get_column_type(self.db_col)
107
- return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
108
-
109
- def get_result_type(self, signals_schema: "SignalSchema") -> "DataType":
110
- if self.result_type:
111
- return self.result_type
112
-
113
- if col_type := self.db_col_type(signals_schema):
114
- return col_type
115
-
116
- raise DataChainColumnError(
117
- str(self),
118
- "Column name is required to infer result type",
119
- )
120
-
121
- def get_column(
122
- self, signals_schema: "SignalSchema", label: Optional[str] = None
123
- ) -> Column:
124
- col_type = self.get_result_type(signals_schema)
125
- sql_type = python_to_sql(col_type)
126
-
127
- if self.col:
128
- col = Column(self.db_col, sql_type)
129
- func_col = self.inner(col)
130
- else:
131
- func_col = self.inner()
132
-
133
- if self.is_window:
134
- if not self.window:
135
- raise DataChainParamsError(
136
- f"Window function {self} requires over() clause with a window spec",
137
- )
138
- func_col = func_col.over(
139
- partition_by=self.window.partition_by,
140
- order_by=(
141
- desc(self.window.order_by)
142
- if self.window.desc
143
- else self.window.order_by
144
- ),
145
- )
146
-
147
- func_col.type = sql_type
148
-
149
- if label:
150
- func_col = func_col.label(label)
151
-
152
- return func_col