datachain 0.2.0__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Potentially problematic release: this version of datachain might be problematic.
- datachain/__init__.py +0 -4
- datachain/catalog/catalog.py +17 -2
- datachain/cli.py +8 -1
- datachain/data_storage/db_engine.py +0 -2
- datachain/data_storage/schema.py +15 -26
- datachain/data_storage/sqlite.py +3 -0
- datachain/data_storage/warehouse.py +1 -7
- datachain/lib/arrow.py +7 -13
- datachain/lib/cached_stream.py +3 -85
- datachain/lib/clip.py +151 -0
- datachain/lib/dc.py +41 -59
- datachain/lib/feature.py +5 -1
- datachain/lib/feature_registry.py +3 -2
- datachain/lib/feature_utils.py +1 -2
- datachain/lib/file.py +17 -24
- datachain/lib/image.py +37 -79
- datachain/lib/pytorch.py +4 -2
- datachain/lib/signal_schema.py +3 -4
- datachain/lib/text.py +18 -49
- datachain/lib/udf.py +64 -55
- datachain/lib/udf_signature.py +11 -10
- datachain/lib/utils.py +17 -0
- datachain/lib/webdataset.py +2 -2
- datachain/listing.py +0 -3
- datachain/query/dataset.py +66 -46
- datachain/query/dispatch.py +2 -2
- datachain/query/schema.py +1 -8
- datachain/query/udf.py +16 -18
- datachain/sql/sqlite/base.py +34 -2
- datachain/sql/sqlite/vector.py +13 -5
- datachain/utils.py +28 -0
- {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/METADATA +3 -2
- {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/RECORD +37 -38
- {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/WHEEL +1 -1
- datachain/_version.py +0 -16
- datachain/lib/reader.py +0 -49
- {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/LICENSE +0 -0
- {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.0.dist-info → datachain-0.2.2.dist-info}/top_level.txt +0 -0
datachain/lib/udf.py
CHANGED
@@ -1,15 +1,16 @@
 import inspect
 import sys
 import traceback
-from typing import TYPE_CHECKING, Callable
+from typing import TYPE_CHECKING, Callable

 from datachain.lib.feature import Feature
 from datachain.lib.signal_schema import SignalSchema
-from datachain.lib.
-from datachain.
+from datachain.lib.udf_signature import UdfSignature
+from datachain.lib.utils import AbstractUDF, DataChainError, DataChainParamsError
+from datachain.query import udf

 if TYPE_CHECKING:
-    from
+    from datachain.query.udf import UDFWrapper


 class UdfError(DataChainParamsError):
@@ -17,31 +18,68 @@ class UdfError(DataChainParamsError):
         super().__init__(f"UDF error: {msg}")


-class UDFBase:
+class UDFBase(AbstractUDF):
     is_input_batched = False
     is_output_batched = False
     is_input_grouped = False

-    def __init__(
-        self
-
-
-
-
+    def __init__(self):
+        self.params = None
+        self.output = None
+        self.params_spec = None
+        self.output_spec = None
+        self._contains_stream = None
+        self._catalog = None
+        self._func = None
+
+    def process(self, *args, **kwargs):
+        """Processing function that needs to be defined by user"""
+        if not self._func:
+            raise NotImplementedError("UDF processing is not implemented")
+        return self._func(*args, **kwargs)
+
+    def setup(self):
+        """Initialization process executed on each worker before processing begins.
+        This is needed for tasks like pre-loading ML models prior to scoring.
+        """
+
+    def teardown(self):
+        """Teardown process executed on each process/worker after processing ends.
+        This is needed for tasks like closing connections to end-points.
+        """
+
+    def _init(self, sign: UdfSignature, params: SignalSchema, func: Callable):
         self.params = params
-        self.output =
-        self._func = func
+        self.output = sign.output_schema

-        params_spec = params.to_udf_spec()
+        params_spec = self.params.to_udf_spec()
         self.params_spec = list(params_spec.keys())
-        self.
-        if params.contains_file():
-            self.params_spec.insert(0, Stream())  # type: ignore[arg-type]
-            self._contains_stream = True
+        self.output_spec = self.output.to_udf_spec()

-        self.
+        self._func = func

-
+    @classmethod
+    def _create(
+        cls,
+        target_class: type["UDFBase"],
+        sign: UdfSignature,
+        params: SignalSchema,
+        catalog,
+    ) -> "UDFBase":
+        if isinstance(sign.func, AbstractUDF):
+            if not isinstance(sign.func, target_class):  # type: ignore[unreachable]
+                raise UdfError(
+                    f"cannot create UDF: provided UDF '{sign.func.__name__}'"
+                    f" must be a child of target class '{target_class.__name__}'",
+                )
+            result = sign.func
+            func = None
+        else:
+            result = target_class()
+            func = sign.func
+
+        result._init(sign, params, func)
+        return result

     @property
     def name(self):
@@ -58,25 +96,10 @@ class UDFBase:
         udf_wrapper = udf(self.params_spec, self.output_spec, batch=batch)
         return udf_wrapper(self)

-    def bootstrap(self):
-        """Initialization process executed on each worker before processing begins.
-        This is needed for tasks like pre-loading ML models prior to scoring.
-        """
-
-    def teardown(self):
-        """Teardown process executed on each process/worker after processing ends.
-        This is needed for tasks like closing connections to end-points.
-        """
-
-    def process(self, *args, **kwargs):
-        if not self._func:
-            raise NotImplementedError("UDF processing is not implemented")
-        return self._func(*args, **kwargs)
-
     def validate_results(self, results, *args, **kwargs):
         return results

-    def __call__(self, *rows
+    def __call__(self, *rows):
         if self.is_input_grouped:
             objs = self._parse_grouped_rows(rows)
         else:
@@ -122,18 +145,10 @@ class UDFBase:
             rows = [rows]
         objs = []
         for row in rows:
-            if self._contains_stream:
-                stream, *row = row
-            else:
-                stream = None
-
             obj_row = self.params.row_to_objs(row)
-
-
-
-            if isinstance(obj, Feature):
-                obj._set_stream(self._catalog, stream, True)
-
+            for obj in obj_row:
+                if isinstance(obj, Feature):
+                    obj._set_stream(self._catalog, caching_enabled=True)
             objs.append(obj_row)
         return objs

@@ -150,13 +165,7 @@ class UDFBase:
             output_map[name] = []

         for flat_obj in group:
-
-                position = 1
-                stream = flat_obj[0]
-            else:
-                position = 0
-                stream = None
-
+            position = 0
             for signal, (cls, length) in spec_map.items():
                 slice = flat_obj[position : position + length]
                 position += length
@@ -167,7 +176,7 @@ class UDFBase:
                     obj = slice[0]

                 if isinstance(obj, Feature):
-                    obj._set_stream(self._catalog
+                    obj._set_stream(self._catalog)
                 output_map[signal].append(obj)

         return list(output_map.values())
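Example (a minimal sketch, not code from the package): this release moves the setup/process/teardown lifecycle onto UDFBase itself, so a class-based UDF can be written roughly like this; the LengthUDF class, its attributes, and its parameter are illustrative only.

    from datachain.lib.udf import UDFBase

    class LengthUDF(UDFBase):
        """Illustrative UDF using the new lifecycle hooks."""

        def setup(self):
            # runs on each worker before processing begins
            # (the place to pre-load models, open connections, etc.)
            self.prefix = "len="

        def process(self, file):
            # per-row processing; replaces the old bootstrap/process split
            return self.prefix + str(len(file.read()))

        def teardown(self):
            # runs on each worker after processing ends
            self.prefix = None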
datachain/lib/udf_signature.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Callable, Optional, Union, get_args, get_origin

 from datachain.lib.feature import Feature, FeatureType, FeatureTypeNames
 from datachain.lib.signal_schema import SignalSchema
-from datachain.lib.utils import DataChainParamsError
+from datachain.lib.utils import AbstractUDF, DataChainParamsError


 class UdfSignatureError(DataChainParamsError):
@@ -49,10 +49,13 @@ class UdfSignature:
         else:
             if func is None:
                 raise UdfSignatureError(chain, "user function is not defined")
+
             udf_func = func
             signal_name = None
+
         if not callable(udf_func):
-            raise UdfSignatureError(chain, f"
+            raise UdfSignatureError(chain, f"UDF '{udf_func}' is not callable")
+
         func_params_map_sign, func_outs_sign, is_iterator = (
             UdfSignature._func_signature(chain, udf_func)
         )
@@ -108,13 +111,6 @@ class UdfSignature:
         if isinstance(output, str):
             output = [output]
         if isinstance(output, Sequence):
-            if not func_outs_sign:
-                raise UdfSignatureError(
-                    chain,
-                    "output types are not specified. Specify types in 'output' as"
-                    " a dict or as function return value hint.",
-                )
-
             if len(func_outs_sign) != len(output):
                 raise UdfSignatureError(
                     chain,
@@ -158,8 +154,13 @@ class UdfSignature:

     @staticmethod
     def _func_signature(
-        chain: str,
+        chain: str, udf_func: Callable
     ) -> tuple[dict[str, type], Sequence[type], bool]:
+        if isinstance(udf_func, AbstractUDF):
+            func = udf_func.process  # type: ignore[unreachable]
+        else:
+            func = udf_func
+
         sign = inspect.signature(func)

         input_map = {prm.name: prm.annotation for prm in sign.parameters.values()}
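Example (standalone illustration of the mechanism): for AbstractUDF instances, _func_signature now inspects the bound process method, so self does not appear among the inferred parameters.

    import inspect

    class MyUDF:
        def process(self, name: str, size: int) -> str:
            return f"{name}:{size}"

    # inspecting the bound method, as _func_signature does for class-based UDFs
    sig = inspect.signature(MyUDF().process)
    print(list(sig.parameters))   # ['name', 'size']
    print(sig.return_annotation)  # <class 'str'>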
datachain/lib/utils.py
CHANGED
@@ -1,3 +1,20 @@
+from abc import ABC, abstractmethod
+
+
+class AbstractUDF(ABC):
+    @abstractmethod
+    def process(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def setup(self):
+        pass
+
+    @abstractmethod
+    def teardown(self):
+        pass
+
+
 class DataChainError(Exception):
     def __init__(self, message):
         super().__init__(message)
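Example (standalone illustration): because AbstractUDF is an ABC with three abstract methods, a concrete UDF class must implement process, setup, and teardown before it can be instantiated; the NoopUDF class is illustrative only.

    from datachain.lib.utils import AbstractUDF

    class NoopUDF(AbstractUDF):
        def process(self, *args, **kwargs):
            return args

        def setup(self):
            pass

        def teardown(self):
            pass

    NoopUDF()        # ok: all three abstract methods are implemented
    # AbstractUDF()  # TypeError: can't instantiate abstract class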
datachain/lib/webdataset.py
CHANGED
@@ -2,6 +2,7 @@ import hashlib
 import json
 import tarfile
 from collections.abc import Iterator, Sequence
+from pathlib import Path
 from typing import (
     Any,
     Callable,
@@ -240,10 +241,9 @@ class TarStream(File):
 def get_tar_groups(stream, tar, core_extensions, spec, encoding="utf-8"):
     builder = Builder(stream, core_extensions, spec, tar, encoding)

-    for item in tar.getmembers():
+    for item in sorted(tar.getmembers(), key=lambda m: Path(m.name).stem):
         if not item.isfile():
             continue
-
         try:
             builder.add(item)
         except StopIteration:
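Example (standalone illustration): sorting members by Path(name).stem groups the files that make up one webdataset sample (same stem, different extensions) next to each other regardless of their order in the archive.

    from pathlib import Path

    names = ["0002.jpg", "0001.json", "0002.json", "0001.jpg"]
    print(sorted(names, key=lambda n: Path(n).stem))
    # ['0001.json', '0001.jpg', '0002.jpg', '0002.json']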
datachain/listing.py
CHANGED
datachain/query/dataset.py
CHANGED
@@ -1,3 +1,4 @@
+import ast
 import contextlib
 import datetime
 import inspect
@@ -51,9 +52,10 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.progress import CombinedDownloadCallback
+from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.sql.functions import rand
 from datachain.storage import Storage, StorageURI
-from datachain.utils import batched, determine_processes
+from datachain.utils import batched, determine_processes, inside_notebook

 from .batch import RowBatch
 from .metrics import metrics
@@ -62,7 +64,6 @@ from .session import Session
 from .udf import UDFBase, UDFClassWrapper, UDFFactory, UDFType

 if TYPE_CHECKING:
-    import pandas as pd
     from sqlalchemy.sql.elements import ClauseElement
     from sqlalchemy.sql.schema import Table
     from sqlalchemy.sql.selectable import GenerativeSelect
@@ -547,8 +548,9 @@ class UDF(Step, ABC):
         else:
             udf = self.udf

-        if hasattr(udf.func, "
-            udf.func.
+        if hasattr(udf.func, "setup") and callable(udf.func.setup):
+            udf.func.setup()
+
         warehouse = self.catalog.warehouse

         with contextlib.closing(
@@ -599,12 +601,15 @@ class UDF(Step, ABC):
         # Create a dynamic module with the generated name
         dynamic_module = types.ModuleType(feature_module_name)
         # Get the import lines for the necessary objects from the main module
-        import_lines = [
-            source.getimport(obj, alias=name)
-            for name, obj in inspect.getmembers(sys.modules["__main__"], _imports)
-            if not (name.startswith("__") and name.endswith("__"))
-        ]
         main_module = sys.modules["__main__"]
+        if getattr(main_module, "__file__", None):
+            import_lines = list(get_imports(main_module))
+        else:
+            import_lines = [
+                source.getimport(obj, alias=name)
+                for name, obj in main_module.__dict__.items()
+                if _imports(obj) and not (name.startswith("__") and name.endswith("__"))
+            ]

         # Get the feature classes from the main module
         feature_classes = {
@@ -612,6 +617,10 @@ class UDF(Step, ABC):
             for name, obj in main_module.__dict__.items()
             if _feature_predicate(obj)
         }
+        if not feature_classes:
+            yield None
+            return
+
         # Get the source code of the feature classes
         feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
         # Set the module name for the feature classes to the generated name
@@ -621,7 +630,7 @@ class UDF(Step, ABC):
         # Add the dynamic module to the sys.modules dictionary
         sys.modules[feature_module_name] = dynamic_module
         # Combine the import lines and feature sources
-        feature_file = "".join(import_lines) + "\n".join(feature_sources)
+        feature_file = "\n".join(import_lines) + "\n" + "\n".join(feature_sources)

         # Write the module content to a .py file
         with open(f"{feature_module_name}.py", "w") as module_file:
@@ -1362,33 +1371,11 @@ class DatasetQuery:
         cols = result.columns
         return [dict(zip(cols, row)) for row in result]

-    @classmethod
-    def create_empty_record(
-        cls, name: Optional[str] = None, session: Optional[Session] = None
-    ) -> "DatasetRecord":
-        session = Session.get(session)
-        if name is None:
-            name = session.generate_temp_dataset_name()
-        columns = session.catalog.warehouse.dataset_row_cls.file_columns()
-        return session.catalog.create_dataset(name, columns=columns)
-
-    @classmethod
-    def insert_record(
-        cls,
-        dsr: "DatasetRecord",
-        record: dict[str, Any],
-        session: Optional[Session] = None,
-    ) -> None:
-        session = Session.get(session)
-        dr = session.catalog.warehouse.dataset_rows(dsr)
-        insert_q = dr.get_table().insert().values(**record)
-        session.catalog.warehouse.db.execute(insert_q)
-
     def to_pandas(self) -> "pd.DataFrame":
-        import pandas as pd
-
         records = self.to_records()
-
+        df = pd.DataFrame.from_records(records)
+        df.columns = [c.replace(DEFAULT_DELIMITER, ".") for c in df.columns]
+        return df

     def shuffle(self) -> "Self":
         # ToDo: implement shaffle based on seed and/or generating random column
@@ -1410,8 +1397,17 @@ class DatasetQuery:

     def show(self, limit=20) -> None:
         df = self.limit(limit).to_pandas()
-
-
+
+        options = ["display.max_colwidth", 50, "display.show_dimensions", False]
+        with pd.option_context(*options):
+            if inside_notebook():
+                from IPython.display import display
+
+                display(df)
+
+            else:
+                print(df.to_string())
+
         if len(df) == limit:
             print(f"[limited by {limit} objects]")

@@ -1692,6 +1688,15 @@ class DatasetQuery:
             storage.timestamp_str,
         )

+    def exec(self) -> "Self":
+        """Execute the query."""
+        try:
+            query = self.clone()
+            query.apply_steps()
+        finally:
+            self.cleanup()
+        return query
+
     def save(
         self,
         name: Optional[str] = None,
@@ -1737,22 +1742,16 @@ class DatasetQuery:

         # Exclude the id column and let the db create it to avoid unique
         # constraint violations.
-        cols = [col.name for col in dr.get_table().c if col.name != "id"]
-        assert cols
         q = query.exclude(("id",))
-
         if q._order_by_clauses:
             # ensuring we have id sorted by order by clause if it exists in a query
             q = q.add_columns(
                 f.row_number().over(order_by=q._order_by_clauses).label("id")
             )
-            cols.append("id")
-
-        self.catalog.warehouse.db.execute(
-            sqlalchemy.insert(dr.get_table()).from_select(cols, q),
-            **kwargs,
-        )

+        cols = tuple(c.name for c in q.columns)
+        insert_q = sqlalchemy.insert(dr.get_table()).from_select(cols, q)
+        self.catalog.warehouse.db.execute(insert_q, **kwargs)
         self.catalog.metastore.update_dataset_status(
             dataset, DatasetStatus.COMPLETE, version=version
         )
@@ -1884,3 +1883,24 @@ def _feature_predicate(obj):

 def _imports(obj):
     return not source.isfrommain(obj)
+
+
+def get_imports(m):
+    root = ast.parse(inspect.getsource(m))
+
+    for node in ast.iter_child_nodes(root):
+        if isinstance(node, ast.Import):
+            module = None
+        elif isinstance(node, ast.ImportFrom):
+            module = node.module
+        else:
+            continue
+
+        for n in node.names:
+            import_script = ""
+            if module:
+                import_script += f"from {module} "
+            import_script += f"import {n.name}"
+            if n.asname:
+                import_script += f" as {n.asname}"
+            yield import_script
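Example (standalone illustration of the technique behind get_imports): parse a module's source with ast and re-emit its top-level import statements.

    import ast

    src = "import os\nfrom pathlib import Path as P\n"
    for node in ast.iter_child_nodes(ast.parse(src)):
        if isinstance(node, ast.Import):
            module = None
        elif isinstance(node, ast.ImportFrom):
            module = node.module
        else:
            continue
        for n in node.names:
            line = f"from {module} " if module else ""
            line += f"import {n.name}" + (f" as {n.asname}" if n.asname else "")
            print(line)
    # import os
    # from pathlib import Path as P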
datachain/query/dispatch.py
CHANGED
@@ -370,8 +370,8 @@ class UDFWorker:
         return WorkerCallback(self.done_queue)

     def run(self) -> None:
-        if hasattr(self.udf.func, "
-            self.udf.func.
+        if hasattr(self.udf.func, "setup") and callable(self.udf.func.setup):
+            self.udf.func.setup()
         while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
             n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
             udf_output = self.udf(
datachain/query/schema.py
CHANGED
@@ -3,14 +3,12 @@ import json
 from abc import ABC, abstractmethod
 from datetime import datetime, timezone
 from fnmatch import fnmatch
-from random import getrandbits
 from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union

 import attrs
 import sqlalchemy as sa
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback

-from datachain.data_storage.warehouse import RANDOM_BITS
 from datachain.sql.types import JSON, Boolean, DateTime, Int, Int64, SQLType, String

 if TYPE_CHECKING:
@@ -217,7 +215,7 @@ class DatasetRow:
         "source": String,
         "parent": String,
         "name": String,
-        "size":
+        "size": Int64,
         "location": JSON,
         "vtype": String,
         "dir_type": Int,
@@ -227,8 +225,6 @@ class DatasetRow:
         "last_modified": DateTime,
         "version": String,
         "etag": String,
-        # system column
-        "random": Int64,
     }

     @staticmethod
@@ -267,8 +263,6 @@ class DatasetRow:

         last_modified = last_modified or datetime.now(timezone.utc)

-        random = getrandbits(RANDOM_BITS)
-
         return (  # type: ignore [return-value]
             source,
             parent,
@@ -283,7 +277,6 @@ class DatasetRow:
             last_modified,
             version,
             etag,
-            random,
         )

     @staticmethod
datachain/query/udf.py
CHANGED
@@ -14,6 +14,7 @@ from typing import (
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback

 from datachain.dataset import RowDict
+from datachain.lib.utils import AbstractUDF

 from .batch import Batch, BatchingStrategy, NoBatching, Partition, RowBatch
 from .schema import (
@@ -58,14 +59,6 @@ class UDFProperties:
     def signal_names(self) -> Iterable[str]:
         return self.output.keys()

-    def parameter_parser(self) -> Callable:
-        """Generate a parameter list from a dataset row."""
-
-        def plist(catalog: "Catalog", row: "RowDict", **kwargs) -> list:
-            return [p.get_value(catalog, row, **kwargs) for p in self.params]
-
-        return plist
-

 def udf(
     params: Sequence[UDFParamSpec],
@@ -113,32 +106,37 @@ class UDFBase:
         self.func = func
         self.properties = properties
         self.signal_names = properties.signal_names()
-        self.parameter_parser = properties.parameter_parser()
         self.output = properties.output

     def __call__(
         self,
         catalog: "Catalog",
-
+        arg: "BatchingResult",
         is_generator: bool = False,
         cache: bool = False,
         cb: Callback = DEFAULT_CALLBACK,
     ) -> Iterable[UDFResult]:
-        if isinstance(
+        if isinstance(self.func, AbstractUDF):
+            self.func._catalog = catalog  # type: ignore[unreachable]
+
+        if isinstance(arg, RowBatch):
             udf_inputs = [
-                self.
-                for row in
+                self.bind_parameters(catalog, row, cache=cache, cb=cb)
+                for row in arg.rows
             ]
             udf_outputs = self.func(udf_inputs)
-            return self._process_results(
-        if isinstance(
-            udf_inputs = self.
+            return self._process_results(arg.rows, udf_outputs, is_generator)
+        if isinstance(arg, RowDict):
+            udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
             udf_outputs = self.func(*udf_inputs)
             if not is_generator:
                 # udf_outputs is generator already if is_generator=True
                 udf_outputs = [udf_outputs]
-            return self._process_results([
-        raise ValueError(f"
+            return self._process_results([arg], udf_outputs, is_generator)
+        raise ValueError(f"Unexpected UDF argument: {arg}")
+
+    def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
+        return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]

     def _process_results(
         self,
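Example (a simplified sketch, not the package's classes): __call__ now takes a single argument and dispatches on its type, binding parameters through a bind_parameters method instead of a stored closure; RowBatch and RowDict are stubbed here with stand-ins.

    from dataclasses import dataclass

    @dataclass
    class FakeBatch:  # stand-in for RowBatch
        rows: list

    class Wrapper:
        def bind_parameters(self, row):
            return [row["name"]]

        def __call__(self, arg):
            if isinstance(arg, FakeBatch):
                return [self.bind_parameters(r) for r in arg.rows]
            if isinstance(arg, dict):  # stand-in for RowDict
                return self.bind_parameters(arg)
            raise ValueError(f"Unexpected UDF argument: {arg}")

    w = Wrapper()
    print(w(FakeBatch(rows=[{"name": "a"}, {"name": "b"}])))  # [['a'], ['b']]
    print(w({"name": "c"}))                                   # ['c']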
datachain/sql/sqlite/base.py
CHANGED
@@ -71,8 +71,6 @@ def setup():
     compiles(sql_path.name, "sqlite")(compile_path_name)
     compiles(sql_path.file_stem, "sqlite")(compile_path_file_stem)
     compiles(sql_path.file_ext, "sqlite")(compile_path_file_ext)
-    compiles(array.cosine_distance, "sqlite")(compile_cosine_distance)
-    compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance)
     compiles(array.length, "sqlite")(compile_array_length)
     compiles(string.length, "sqlite")(compile_string_length)
     compiles(string.split, "sqlite")(compile_string_split)
@@ -81,6 +79,13 @@ def setup():
     compiles(Values, "sqlite")(compile_values)
     compiles(random.rand, "sqlite")(compile_rand)

+    if load_usearch_extension(sqlite3.connect(":memory:")):
+        compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
+        compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance_ext)
+    else:
+        compiles(array.cosine_distance, "sqlite")(compile_cosine_distance)
+        compiles(array.euclidean_distance, "sqlite")(compile_euclidean_distance)
+
     register_user_defined_sql_functions()
     setup_is_complete = True

@@ -246,11 +251,23 @@ def compile_path_file_ext(element, compiler, **kwargs):
     return compiler.process(path_file_ext(*element.clauses.clauses), **kwargs)


+def compile_cosine_distance_ext(element, compiler, **kwargs):
+    run_compiler_hook("cosine_distance")
+    return f"distance_cosine_f32({compiler.process(element.clauses, **kwargs)})"
+
+
 def compile_cosine_distance(element, compiler, **kwargs):
     run_compiler_hook("cosine_distance")
     return f"cosine_distance({compiler.process(element.clauses, **kwargs)})"


+def compile_euclidean_distance_ext(element, compiler, **kwargs):
+    run_compiler_hook("euclidean_distance")
+    return (
+        f"sqrt(distance_sqeuclidean_f32({compiler.process(element.clauses, **kwargs)}))"
+    )
+
+
 def compile_euclidean_distance(element, compiler, **kwargs):
     run_compiler_hook("euclidean_distance")
     return f"euclidean_distance({compiler.process(element.clauses, **kwargs)})"
@@ -330,3 +347,18 @@ def compile_values(element, compiler, **kwargs):

 def compile_rand(element, compiler, **kwargs):
     return compiler.process(func.random(), **kwargs)
+
+
+def load_usearch_extension(conn) -> bool:
+    try:
+        # usearch is part of the vector optional dependencies
+        # we use the extension's cosine and euclidean distance functions
+        from usearch import sqlite_path
+
+        conn.enable_load_extension(True)
+        conn.load_extension(sqlite_path())
+        conn.enable_load_extension(False)
+        return True
+
+    except Exception:  # noqa: BLE001
+        return False
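Example (hedged sketch): setup() now probes once for the usearch SQLite extension and registers the extension-backed compilers (distance_cosine_f32 / distance_sqeuclidean_f32) only when loading succeeds, keeping the existing fallbacks otherwise. A standalone probe along the same lines; has_usearch_extension is an illustrative name, not a datachain function.

    import sqlite3

    def has_usearch_extension() -> bool:
        try:
            from usearch import sqlite_path  # optional "vector" dependency
        except ImportError:
            return False
        conn = sqlite3.connect(":memory:")
        try:
            conn.enable_load_extension(True)
            conn.load_extension(sqlite_path())
            return True
        except Exception:
            return False
        finally:
            conn.close()

    print("usearch SQLite extension available:", has_usearch_extension())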