PyPI - datachain - Versions diffs - 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl - Mend

datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (137)

datachain/__init__.py +20 -0
datachain/asyn.py +11 -12
datachain/cache.py +7 -7
datachain/catalog/__init__.py +2 -2
datachain/catalog/catalog.py +621 -507
datachain/catalog/dependency.py +164 -0
datachain/catalog/loader.py +28 -18
datachain/checkpoint.py +43 -0
datachain/cli/__init__.py +24 -33
datachain/cli/commands/__init__.py +1 -8
datachain/cli/commands/datasets.py +83 -52
datachain/cli/commands/ls.py +17 -17
datachain/cli/commands/show.py +4 -4
datachain/cli/parser/__init__.py +8 -74
datachain/cli/parser/job.py +95 -3
datachain/cli/parser/studio.py +11 -4
datachain/cli/parser/utils.py +1 -2
datachain/cli/utils.py +2 -15
datachain/client/azure.py +4 -4
datachain/client/fsspec.py +45 -28
datachain/client/gcs.py +6 -6
datachain/client/hf.py +29 -2
datachain/client/http.py +157 -0
datachain/client/local.py +15 -11
datachain/client/s3.py +17 -9
datachain/config.py +4 -8
datachain/data_storage/db_engine.py +12 -6
datachain/data_storage/job.py +5 -1
datachain/data_storage/metastore.py +1252 -186
datachain/data_storage/schema.py +58 -45
datachain/data_storage/serializer.py +105 -15
datachain/data_storage/sqlite.py +286 -127
datachain/data_storage/warehouse.py +250 -113
datachain/dataset.py +353 -148
datachain/delta.py +391 -0
datachain/diff/__init__.py +27 -29
datachain/error.py +60 -0
datachain/func/__init__.py +2 -1
datachain/func/aggregate.py +66 -42
datachain/func/array.py +242 -38
datachain/func/base.py +7 -4
datachain/func/conditional.py +110 -60
datachain/func/func.py +96 -45
datachain/func/numeric.py +55 -38
datachain/func/path.py +32 -20
datachain/func/random.py +2 -2
datachain/func/string.py +67 -37
datachain/func/window.py +7 -8
datachain/hash_utils.py +123 -0
datachain/job.py +11 -7
datachain/json.py +138 -0
datachain/lib/arrow.py +58 -22
datachain/lib/audio.py +245 -0
datachain/lib/clip.py +14 -13
datachain/lib/convert/flatten.py +5 -3
datachain/lib/convert/python_to_sql.py +6 -10
datachain/lib/convert/sql_to_python.py +8 -0
datachain/lib/convert/values_to_tuples.py +156 -51
datachain/lib/data_model.py +42 -20
datachain/lib/dataset_info.py +36 -8
datachain/lib/dc/__init__.py +8 -2
datachain/lib/dc/csv.py +25 -28
datachain/lib/dc/database.py +398 -0
datachain/lib/dc/datachain.py +1289 -425
datachain/lib/dc/datasets.py +320 -38
datachain/lib/dc/hf.py +38 -24
datachain/lib/dc/json.py +29 -32
datachain/lib/dc/listings.py +112 -8
datachain/lib/dc/pandas.py +16 -12
datachain/lib/dc/parquet.py +35 -23
datachain/lib/dc/records.py +31 -23
datachain/lib/dc/storage.py +154 -64
datachain/lib/dc/storage_pattern.py +251 -0
datachain/lib/dc/utils.py +24 -16
datachain/lib/dc/values.py +8 -9
datachain/lib/file.py +622 -89
datachain/lib/hf.py +69 -39
datachain/lib/image.py +14 -14
datachain/lib/listing.py +14 -11
datachain/lib/listing_info.py +1 -2
datachain/lib/meta_formats.py +3 -4
datachain/lib/model_store.py +39 -7
datachain/lib/namespaces.py +125 -0
datachain/lib/projects.py +130 -0
datachain/lib/pytorch.py +32 -21
datachain/lib/settings.py +192 -56
datachain/lib/signal_schema.py +427 -104
datachain/lib/tar.py +1 -2
datachain/lib/text.py +8 -7
datachain/lib/udf.py +164 -76
datachain/lib/udf_signature.py +60 -35
datachain/lib/utils.py +118 -4
datachain/lib/video.py +17 -9
datachain/lib/webdataset.py +61 -56
datachain/lib/webdataset_laion.py +15 -16
datachain/listing.py +22 -10
datachain/model/bbox.py +3 -1
datachain/model/ultralytics/bbox.py +16 -12
datachain/model/ultralytics/pose.py +16 -12
datachain/model/ultralytics/segment.py +16 -12
datachain/namespace.py +84 -0
datachain/node.py +6 -6
datachain/nodes_thread_pool.py +0 -1
datachain/plugins.py +24 -0
datachain/project.py +78 -0
datachain/query/batch.py +40 -41
datachain/query/dataset.py +604 -322
datachain/query/dispatch.py +261 -154
datachain/query/metrics.py +4 -6
datachain/query/params.py +2 -3
datachain/query/queue.py +3 -12
datachain/query/schema.py +11 -6
datachain/query/session.py +200 -33
datachain/query/udf.py +34 -2
datachain/remote/studio.py +171 -69
datachain/script_meta.py +12 -12
datachain/semver.py +68 -0
datachain/sql/__init__.py +2 -0
datachain/sql/functions/array.py +33 -1
datachain/sql/postgresql_dialect.py +9 -0
datachain/sql/postgresql_types.py +21 -0
datachain/sql/sqlite/__init__.py +5 -1
datachain/sql/sqlite/base.py +102 -29
datachain/sql/sqlite/types.py +8 -13
datachain/sql/types.py +70 -15
datachain/studio.py +223 -46
datachain/toolkit/split.py +31 -10
datachain/utils.py +101 -59
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
datachain-0.39.0.dist-info/RECORD +173 -0
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
datachain/cli/commands/query.py +0 -53
datachain/query/utils.py +0 -42
datachain-0.14.2.dist-info/RECORD +0 -158
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
{datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0

datachain/func/aggregate.py CHANGED Viewed

@@ -1,78 +1,87 @@
-from typing import Optional
 from sqlalchemy import func as sa_func
+from datachain.query.schema import Column
 from datachain.sql.functions import aggregate
 from .func import Func
-def count(col: Optional[str] = None) -> Func:
+def count(col: str | Column | None = None) -> Func:
     """
-    Returns the COUNT aggregate SQL function for the given column name.
+    Returns a COUNT aggregate SQL function for the specified column.
-    The COUNT function returns the number of rows in a table.
+    The COUNT function returns the number of rows, optionally filtered
+    by a specific column.
     Args:
-        col (str, optional): The name of the column for which to count rows.
-                             If not provided, it defaults to counting all rows.
+        col (str | Column, optional): The column to count.
+            If omitted, counts all rows.
+            The column can be specified as a string or a `Column` object.
     Returns:
-        Func: A Func object that represents the COUNT aggregate function.
+        Func: A `Func` object representing the COUNT aggregate function.
     Example:
         ```py
         dc.group_by(
-            count=func.count(),
+            count1=func.count(),
+            count2=func.count("signal.id"),
+            count3=func.count(dc.C("signal.category")),
             partition_by="signal.category",
         )
         ```
     Notes:
-        - Result column will always be of type int.
+        - The result column will always have an integer type.
     """
     return Func(
-        "count", inner=sa_func.count, cols=[col] if col else None, result_type=int
+        "count",
+        inner=sa_func.count,
+        cols=[col] if col is not None else None,
+        result_type=int,
     )
-def sum(col: str) -> Func:
+def sum(col: str | Column) -> Func:
     """
-    Returns the SUM aggregate SQL function for the given column name.
+    Returns the SUM aggregate SQL function for the specified column.
     The SUM function returns the total sum of a numeric column in a table.
     It sums up all the values for the specified column.
     Args:
-        col (str): The name of the column for which to calculate the sum.
+        col (str | Column): The name of the column for which to calculate the sum.
+            The column can be specified as a string or a `Column` object.
     Returns:
-        Func: A Func object that represents the SUM aggregate function.
+        Func: A `Func` object that represents the SUM aggregate function.
     Example:
         ```py
         dc.group_by(
             files_size=func.sum("file.size"),
+            total_size=func.sum(dc.C("size")),
             partition_by="signal.category",
         )
         ```
     Notes:
         - The `sum` function should be used on numeric columns.
-        - Result column type will be the same as the input column type.
+        - The result column type will be the same as the input column type.
     """
     return Func("sum", inner=sa_func.sum, cols=[col])
-def avg(col: str) -> Func:
+def avg(col: str | Column) -> Func:
     """
-    Returns the AVG aggregate SQL function for the given column name.
+    Returns the AVG aggregate SQL function for the specified column.
     The AVG function returns the average of a numeric column in a table.
     It calculates the mean of all values in the specified column.
     Args:
-        col (str): The name of the column for which to calculate the average.
+        col (str | Column): The name of the column for which to calculate the average.
+            Column can be specified as a string or a `Column` object.
     Returns:
         Func: A Func object that represents the AVG aggregate function.
@@ -81,26 +90,28 @@ def avg(col: str) -> Func:
         ```py
         dc.group_by(
             avg_file_size=func.avg("file.size"),
+            avg_signal_value=func.avg(dc.C("signal.value")),
             partition_by="signal.category",
         )
         ```
     Notes:
         - The `avg` function should be used on numeric columns.
-        - Result column will always be of type float.
+        - The result column will always be of type float.
     """
     return Func("avg", inner=aggregate.avg, cols=[col], result_type=float)
-def min(col: str) -> Func:
+def min(col: str | Column) -> Func:
     """
-    Returns the MIN aggregate SQL function for the given column name.
+    Returns the MIN aggregate SQL function for the specified column.
     The MIN function returns the smallest value in the specified column.
     It can be used on both numeric and non-numeric columns to find the minimum value.
     Args:
-        col (str): The name of the column for which to find the minimum value.
+        col (str | Column): The name of the column for which to find the minimum value.
+            Column can be specified as a string or a `Column` object.
     Returns:
         Func: A Func object that represents the MIN aggregate function.
@@ -109,18 +120,19 @@ def min(col: str) -> Func:
         ```py
         dc.group_by(
             smallest_file=func.min("file.size"),
+            min_signal=func.min(dc.C("signal")),
             partition_by="signal.category",
         )
         ```
     Notes:
         - The `min` function can be used with numeric, date, and string columns.
-        - Result column will have the same type as the input column.
+        - The result column will have the same type as the input column.
     """
     return Func("min", inner=sa_func.min, cols=[col])
-def max(col: str) -> Func:
+def max(col: str | Column) -> Func:
     """
     Returns the MAX aggregate SQL function for the given column name.
@@ -128,7 +140,8 @@ def max(col: str) -> Func:
     It can be used on both numeric and non-numeric columns to find the maximum value.
     Args:
-        col (str): The name of the column for which to find the maximum value.
+        col (str | Column): The name of the column for which to find the maximum value.
+            Column can be specified as a string or a `Column` object.
     Returns:
         Func: A Func object that represents the MAX aggregate function.
@@ -137,18 +150,19 @@ def max(col: str) -> Func:
         ```py
         dc.group_by(
             largest_file=func.max("file.size"),
+            max_signal=func.max(dc.C("signal")),
             partition_by="signal.category",
         )
         ```
     Notes:
         - The `max` function can be used with numeric, date, and string columns.
-        - Result column will have the same type as the input column.
+        - The result column will have the same type as the input column.
     """
     return Func("max", inner=sa_func.max, cols=[col])
-def any_value(col: str) -> Func:
+def any_value(col: str | Column) -> Func:
     """
     Returns the ANY_VALUE aggregate SQL function for the given column name.
@@ -157,7 +171,9 @@ def any_value(col: str) -> Func:
     as long as it comes from one of the rows in the group.
     Args:
-        col (str): The name of the column from which to return an arbitrary value.
+        col (str | Column): The name of the column from which to return
+            an arbitrary value.
+            Column can be specified as a string or a `Column` object.
     Returns:
         Func: A Func object that represents the ANY_VALUE aggregate function.
@@ -165,21 +181,22 @@ def any_value(col: str) -> Func:
     Example:
         ```py
         dc.group_by(
-            file_example=func.any_value("file.name"),
+            file_example=func.any_value("file.path"),
+            signal_example=func.any_value(dc.C("signal.value")),
             partition_by="signal.category",
         )
         ```
     Notes:
         - The `any_value` function can be used with any type of column.
-        - Result column will have the same type as the input column.
+        - The result column will have the same type as the input column.
         - The result of `any_value` is non-deterministic,
           meaning it may return different values for different executions.
     """
     return Func("any_value", inner=aggregate.any_value, cols=[col])
-def collect(col: str) -> Func:
+def collect(col: str | Column) -> Func:
     """
     Returns the COLLECT aggregate SQL function for the given column name.
@@ -188,7 +205,8 @@ def collect(col: str) -> Func:
     into a collection, often for further processing or aggregation.
     Args:
-        col (str): The name of the column from which to collect values.
+        col (str | Column): The name of the column from which to collect values.
+            Column can be specified as a string or a `Column` object.
     Returns:
         Func: A Func object that represents the COLLECT aggregate function.
@@ -197,18 +215,19 @@ def collect(col: str) -> Func:
         ```py
         dc.group_by(
             signals=func.collect("signal"),
+            file_paths=func.collect(dc.C("file.path")),
             partition_by="signal.category",
         )
         ```
     Notes:
         - The `collect` function can be used with numeric and string columns.
-        - Result column will have an array type.
+        - The result column will have an array type.
     """
     return Func("collect", inner=aggregate.collect, cols=[col], is_array=True)
-def concat(col: str, separator="") -> Func:
+def concat(col: str | Column, separator="") -> Func:
     """
     Returns the CONCAT aggregate SQL function for the given column name.
@@ -217,9 +236,10 @@ def concat(col: str, separator="") -> Func:
     into a single combined value.
     Args:
-        col (str): The name of the column from which to concatenate values.
+        col (str | Column): The name of the column from which to concatenate values.
+            Column can be specified as a string or a `Column` object.
         separator (str, optional): The separator to use between concatenated values.
-                                   Defaults to an empty string.
+            Defaults to an empty string.
     Returns:
         Func: A Func object that represents the CONCAT aggregate function.
@@ -227,14 +247,15 @@ def concat(col: str, separator="") -> Func:
     Example:
         ```py
         dc.group_by(
-            files=func.concat("file.name", separator=", "),
+            files=func.concat("file.path", separator=", "),
+            signals=func.concat(dc.C("signal.name"), separator=" | "),
             partition_by="signal.category",
         )
         ```
     Notes:
         - The `concat` function can be used with string columns.
-        - Result column will have a string type.
+        - The result column will have a string type.
     """
     def inner(arg):
@@ -325,7 +346,7 @@ def dense_rank() -> Func:
     return Func("dense_rank", inner=sa_func.dense_rank, result_type=int, is_window=True)
-def first(col: str) -> Func:
+def first(col: str | Column) -> Func:
     """
     Returns the FIRST_VALUE window function for SQL queries.
@@ -334,7 +355,9 @@ def first(col: str) -> Func:
     and can be useful for retrieving the leading value in a group of rows.
     Args:
-        col (str): The name of the column from which to retrieve the first value.
+        col (str | Column): The name of the column from which to retrieve
+            the first value.
+            Column can be specified as a string or a `Column` object.
     Returns:
         Func: A Func object that represents the FIRST_VALUE window function.
@@ -343,7 +366,8 @@ def first(col: str) -> Func:
         ```py
         window = func.window(partition_by="signal.category", order_by="created_at")
         dc.mutate(
-            first_file=func.first("file.name").over(window),
+            first_file=func.first("file.path").over(window),
+            first_signal=func.first(dc.C("signal.value")).over(window),
         )
         ```

datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl