datachain 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/lib/dc.py CHANGED
@@ -11,6 +11,7 @@ from typing import (
     Union,
 )
 
+import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel, create_model
 
@@ -38,9 +39,9 @@ from datachain.query.dataset import (
     detach,
 )
 from datachain.query.schema import Column, DatasetRow
+from datachain.utils import inside_notebook
 
 if TYPE_CHECKING:
-    import pandas as pd
     from typing_extensions import Self
 
 C = Column
@@ -731,6 +732,37 @@ class DataChain(DatasetQuery):
 
         return cls.from_values(name, session, object_name=object_name, **fr_map)
 
+    def to_pandas(self, flatten=False) -> "pd.DataFrame":
+        headers, max_length = self.signals_schema.get_headers_with_length()
+        if flatten or max_length < 2:
+            df = pd.DataFrame.from_records(self.to_records())
+            if headers:
+                df.columns = [".".join(filter(None, header)) for header in headers]
+            return df
+
+        transposed_result = list(map(list, zip(*self.results())))
+        data = {tuple(n): val for n, val in zip(headers, transposed_result)}
+        return pd.DataFrame(data)
+
+    def show(self, limit: int = 20, flatten=False, transpose=False) -> None:
+        dc = self.limit(limit) if limit > 0 else self
+        df = dc.to_pandas(flatten)
+        if transpose:
+            df = df.T
+
+        with pd.option_context(
+            "display.max_columns", None, "display.multi_sparse", False
+        ):
+            if inside_notebook():
+                from IPython.display import display
+
+                display(df)
+            else:
+                print(df)
+
+        if len(df) == limit:
+            print(f"\n[Limited by {len(df)} rows]")
+
     def parse_tabular(
         self,
         output: OutputType = None,
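
For orientation, a minimal usage sketch of the two methods added above; `chain` stands for any already-built DataChain instance and the column names are illustrative, not taken from the package:

    df = chain.to_pandas()                   # nested signals become multi-level columns
    df_flat = chain.to_pandas(flatten=True)  # flattened "file.path"-style column names
    chain.show(limit=5, transpose=True)      # prints the frame, or display()s it inside a notebook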

datachain/lib/signal_schema.py CHANGED
@@ -143,8 +143,8 @@ class SignalSchema:
                 if not fr:
                     raise SignalSchemaError(
                         f"cannot deserialize '{signal}': "
-                        f"unregistered type '{type_name}'."
-                        f" Try to register it with `Registry.add({type_name})`."
+                        f"unknown type '{type_name}'."
+                        f" Try to add it with `ModelStore.add({type_name})`."
                     )
         except TypeError as err:
             raise SignalSchemaError(
@@ -192,10 +192,17 @@ class SignalSchema:
     def slice(
         self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None
     ) -> "SignalSchema":
+        # Make new schema that combines current schema and setup signals
         setup = setup or {}
        setup_no_types = dict.fromkeys(setup.keys(), str)
-        union = self.values | setup_no_types
-        schema = {k: union[k] for k in keys if k in union}
+        union = SignalSchema(self.values | setup_no_types)
+        # Slice combined schema by keys
+        schema = {}
+        for k in keys:
+            try:
+                schema[k] = union._find_in_tree(k.split("."))
+            except SignalResolvingError:
+                pass
         return SignalSchema(schema, setup)
 
     def row_to_features(
@@ -331,6 +338,16 @@ class SignalSchema:
             sub_schema = SignalSchema({"* list of": args[0]})
             sub_schema.print_tree(indent=indent, start_at=total_indent + indent)
 
+    def get_headers_with_length(self):
+        paths = [
+            path for path, _, has_subtree, _ in self.get_flat_tree() if not has_subtree
+        ]
+        max_length = max([len(path) for path in paths], default=0)
+        return [
+            path + [""] * (max_length - len(path)) if len(path) < max_length else path
+            for path in paths
+        ], max_length
+
     def __or__(self, other):
         return self.__class__(self.values | other.values)
 
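
As a rough illustration of what get_headers_with_length() returns (the paths below are made up): leaf signal paths of unequal depth are padded with empty strings so that every header has the same length, and that length is returned alongside the headers:

    paths = [["file", "path"], ["size"]]         # hypothetical leaf paths
    max_length = max(len(p) for p in paths)      # 2
    headers = [p + [""] * (max_length - len(p)) for p in paths]
    # headers == [["file", "path"], ["size", ""]]; the method returns (headers, max_length)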

datachain/lib/webdataset.py CHANGED
@@ -13,8 +13,9 @@ from typing import (
     get_origin,
 )
 
-from pydantic import BaseModel, Field
+from pydantic import Field
 
+from datachain.lib.data_model import DataModel
 from datachain.lib.file import File, TarVFile
 from datachain.lib.utils import DataChainError
 
@@ -45,7 +46,7 @@ class UnknownFileExtensionError(WDSError):
         super().__init__(tar_stream, f"unknown extension '{ext}' for file '{name}'")
 
 
-class WDSBasic(BaseModel):
+class WDSBasic(DataModel):
     file: File
 
 
@@ -74,7 +75,7 @@ class WDSAllFile(WDSBasic):
     cbor: Optional[bytes] = Field(default=None)
 
 
-class WDSReadableSubclass(BaseModel):
+class WDSReadableSubclass(DataModel):
     @staticmethod
     def _reader(builder, item: tarfile.TarInfo) -> "WDSReadableSubclass":
         raise NotImplementedError

datachain/query/dataset.py CHANGED
@@ -1,4 +1,3 @@
-import ast
 import contextlib
 import datetime
 import inspect
@@ -10,7 +9,6 @@ import re
 import string
 import subprocess
 import sys
-import types
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
@@ -26,12 +24,9 @@ from typing import (
 )
 
 import attrs
-import pandas as pd
 import sqlalchemy
 from attrs import frozen
-from dill import dumps, source
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
-from pydantic import BaseModel
 from sqlalchemy import Column
 from sqlalchemy.sql import func as f
 from sqlalchemy.sql.elements import ColumnClause, ColumnElement
@@ -53,10 +48,13 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.progress import CombinedDownloadCallback
-from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.sql.functions import rand
 from datachain.storage import Storage, StorageURI
-from datachain.utils import batched, determine_processes, inside_notebook
+from datachain.utils import (
+    batched,
+    determine_processes,
+    filtered_cloudpickle_dumps,
+)
 
 from .metrics import metrics
 from .schema import C, UDFParamSpec, normalize_param
@@ -492,7 +490,7 @@ class UDF(Step, ABC):
         elif processes:
             # Parallel processing (faster for more CPU-heavy UDFs)
             udf_info = {
-                "udf": self.udf,
+                "udf_data": filtered_cloudpickle_dumps(self.udf),
                 "catalog_init": self.catalog.get_init_params(),
                 "id_generator_clone_params": (
                     self.catalog.id_generator.clone_params()
@@ -513,16 +511,15 @@ class UDF(Step, ABC):
 
             envs = dict(os.environ)
             envs.update({"PYTHONPATH": os.getcwd()})
-            with self.process_feature_module():
-                process_data = dumps(udf_info, recurse=True)
-                result = subprocess.run(  # noqa: S603
-                    [datachain_exec_path, "--internal-run-udf"],
-                    input=process_data,
-                    check=False,
-                    env=envs,
-                )
-                if result.returncode != 0:
-                    raise RuntimeError("UDF Execution Failed!")
+            process_data = filtered_cloudpickle_dumps(udf_info)
+            result = subprocess.run(  # noqa: S603
+                [datachain_exec_path, "--internal-run-udf"],
+                input=process_data,
+                check=False,
+                env=envs,
+            )
+            if result.returncode != 0:
+                raise RuntimeError("UDF Execution Failed!")
 
         else:
             # Otherwise process single-threaded (faster for smaller UDFs)
@@ -571,57 +568,6 @@ class UDF(Step, ABC):
             self.catalog.warehouse.close()
             raise
 
-    @contextlib.contextmanager
-    def process_feature_module(self):
-        # Generate a random name for the feature module
-        feature_module_name = "tmp" + _random_string(10)
-        # Create a dynamic module with the generated name
-        dynamic_module = types.ModuleType(feature_module_name)
-        # Get the import lines for the necessary objects from the main module
-        main_module = sys.modules["__main__"]
-        if getattr(main_module, "__file__", None):
-            import_lines = list(get_imports(main_module))
-        else:
-            import_lines = [
-                source.getimport(obj, alias=name)
-                for name, obj in main_module.__dict__.items()
-                if _imports(obj) and not (name.startswith("__") and name.endswith("__"))
-            ]
-
-        # Get the feature classes from the main module
-        feature_classes = {
-            name: obj
-            for name, obj in main_module.__dict__.items()
-            if _feature_predicate(obj)
-        }
-        if not feature_classes:
-            yield None
-            return
-
-        # Get the source code of the feature classes
-        feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
-        # Set the module name for the feature classes to the generated name
-        for name, cls in feature_classes.items():
-            cls.__module__ = feature_module_name
-            setattr(dynamic_module, name, cls)
-        # Add the dynamic module to the sys.modules dictionary
-        sys.modules[feature_module_name] = dynamic_module
-        # Combine the import lines and feature sources
-        feature_file = "\n".join(import_lines) + "\n" + "\n".join(feature_sources)
-
-        # Write the module content to a .py file
-        with open(f"{feature_module_name}.py", "w") as module_file:
-            module_file.write(feature_file)
-
-        try:
-            yield feature_module_name
-        finally:
-            for cls in feature_classes.values():
-                cls.__module__ = main_module.__name__
-            os.unlink(f"{feature_module_name}.py")
-            # Remove the dynamic module from sys.modules
-            del sys.modules[feature_module_name]
-
     def create_partitions_table(self, query: Select) -> "Table":
         """
         Create temporary table with group by partitions.
@@ -1346,12 +1292,6 @@ class DatasetQuery:
     def to_records(self) -> list[dict[str, Any]]:
         return self.results(lambda cols, row: dict(zip(cols, row)))
 
-    def to_pandas(self) -> "pd.DataFrame":
-        records = self.to_records()
-        df = pd.DataFrame.from_records(records)
-        df.columns = [c.replace(DEFAULT_DELIMITER, ".") for c in df.columns]
-        return df
-
     def shuffle(self) -> "Self":
         # ToDo: implement shaffle based on seed and/or generating random column
         return self.order_by(C.sys__rand)
@@ -1370,22 +1310,6 @@ class DatasetQuery:
 
         return sampled.limit(n)
 
-    def show(self, limit=20) -> None:
-        df = self.limit(limit).to_pandas()
-
-        options = ["display.max_colwidth", 50, "display.show_dimensions", False]
-        with pd.option_context(*options):
-            if inside_notebook():
-                from IPython.display import display
-
-                display(df)
-
-            else:
-                print(df.to_string())
-
-        if len(df) == limit:
-            print(f"[limited by {limit} objects]")
-
     def clone(self, new_table=True) -> "Self":
         obj = copy(self)
         obj.steps = obj.steps.copy()
@@ -1853,34 +1777,3 @@ def _random_string(length: int) -> str:
         random.choice(string.ascii_letters + string.digits)  # noqa: S311
         for i in range(length)
     )
-
-
-def _feature_predicate(obj):
-    return (
-        inspect.isclass(obj) and source.isfrommain(obj) and issubclass(obj, BaseModel)
-    )
-
-
-def _imports(obj):
-    return not source.isfrommain(obj)
-
-
-def get_imports(m):
-    root = ast.parse(inspect.getsource(m))
-
-    for node in ast.iter_child_nodes(root):
-        if isinstance(node, ast.Import):
-            module = None
-        elif isinstance(node, ast.ImportFrom):
-            module = node.module
-        else:
-            continue
-
-        for n in node.names:
-            import_script = ""
-            if module:
-                import_script += f"from {module} "
-            import_script += f"import {n.name}"
-            if n.asname:
-                import_script += f" as {n.asname}"
-            yield import_script
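
The deleted process_feature_module machinery existed so that classes defined in __main__ (a user script or notebook) could be re-imported by the UDF subprocess. cloudpickle serializes such classes by value, which is what allows this release to drop that code; a minimal sketch of the behavior being relied on (Foo is a made-up class, not part of datachain):

    import cloudpickle

    class Foo:  # defined in __main__, so it is not importable from any module file
        def double(self, x):
            return 2 * x

    payload = cloudpickle.dumps(Foo())  # the class definition travels inside the payload
    obj = cloudpickle.loads(payload)    # would also work in a process where __main__.Foo is absent
    assert obj.double(3) == 6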

datachain/query/dispatch.py CHANGED
@@ -10,7 +10,7 @@ from typing import Any, Optional
 
 import attrs
 import multiprocess
-from dill import load
+from cloudpickle import load, loads
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from multiprocess import get_context
 
@@ -84,7 +84,7 @@ def put_into_queue(queue: Queue, item: Any) -> None:
 
 def udf_entrypoint() -> int:
     # Load UDF info from stdin
-    udf_info = load(stdin.buffer)  # noqa: S301
+    udf_info = load(stdin.buffer)
 
     (
         warehouse_class,
@@ -95,7 +95,7 @@ def udf_entrypoint() -> int:
 
     # Parallel processing (faster for more CPU-heavy UDFs)
     dispatch = UDFDispatcher(
-        udf_info["udf"],
+        udf_info["udf_data"],
         udf_info["catalog_init"],
         udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
@@ -108,7 +108,7 @@ def udf_entrypoint() -> int:
     batching = udf_info["batching"]
     table = udf_info["table"]
     n_workers = udf_info["processes"]
-    udf = udf_info["udf"]
+    udf = loads(udf_info["udf_data"])
    if n_workers is True:
         # Use default number of CPUs (cores)
         n_workers = None
@@ -146,7 +146,7 @@ class UDFDispatcher:
 
     def __init__(
         self,
-        udf,
+        udf_data,
         catalog_init_params,
         id_generator_clone_params,
         metastore_clone_params,
@@ -155,14 +155,7 @@ class UDFDispatcher:
         is_generator=False,
         buffer_size=DEFAULT_BATCH_SIZE,
     ):
-        # isinstance cannot be used here, as dill packages the entire class definition,
-        # and so these two types are not considered exactly equal,
-        # even if they have the same import path.
-        if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
-            self.udf = udf
-        else:
-            self.udf = None
-            self.udf_factory = udf
+        self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
         (
             self.id_generator_class,
@@ -214,6 +207,15 @@ class UDFDispatcher:
         self.catalog = Catalog(
             id_generator, metastore, warehouse, **self.catalog_init_params
         )
+        udf = loads(self.udf_data)
+        # isinstance cannot be used here, as cloudpickle packages the entire class
+        # definition, and so these two types are not considered exactly equal,
+        # even if they have the same import path.
+        if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
+            self.udf = udf
+        else:
+            self.udf = None
+            self.udf_factory = udf
         if not self.udf:
             self.udf = self.udf_factory()
 
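
Putting the two sides together, a rough sketch of the new hand-off (my_udf is a stand-in for a real UDF object, not datachain code): dataset.py serializes the UDF with filtered_cloudpickle_dumps, and udf_entrypoint()/UDFDispatcher deserialize it with cloudpickle's loads only when a worker needs it:

    from cloudpickle import loads

    from datachain.utils import filtered_cloudpickle_dumps

    def my_udf(row):  # placeholder for the user's UDF callable
        return row

    udf_data = filtered_cloudpickle_dumps(my_udf)  # parent-process (dataset.py) side
    udf = loads(udf_data)                          # worker (dispatch.py) side
    assert udf("x") == "x"
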
datachain/utils.py CHANGED
@@ -1,5 +1,6 @@
 import glob
 import importlib.util
+import io
 import json
 import os
 import os.path as osp
@@ -13,8 +14,10 @@ from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID
 
+import cloudpickle
 from dateutil import tz
 from dateutil.parser import isoparse
+from pydantic import BaseModel
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -388,3 +391,39 @@ def inside_notebook() -> bool:
             return False
 
     return False
+
+
+def get_all_subclasses(cls):
+    """Return all subclasses of a given class.
+    Can return duplicates due to multiple inheritance."""
+    for subclass in cls.__subclasses__():
+        yield from get_all_subclasses(subclass)
+        yield subclass
+
+
+def filtered_cloudpickle_dumps(obj: Any) -> bytes:
+    """Equivalent to cloudpickle.dumps, but this supports Pydantic models."""
+    model_namespaces = {}
+
+    with io.BytesIO() as f:
+        pickler = cloudpickle.CloudPickler(f)
+
+        for model_class in get_all_subclasses(BaseModel):
+            # This "is not None" check is needed, because due to multiple inheritance,
+            # it is theoretically possible to get the same class twice from
+            # get_all_subclasses.
+            if model_class.__pydantic_parent_namespace__ is not None:
+                # __pydantic_parent_namespace__ can contain many unnecessary and
+                # unpickleable entities, so should be removed for serialization.
+                model_namespaces[model_class] = (
+                    model_class.__pydantic_parent_namespace__
+                )
+                model_class.__pydantic_parent_namespace__ = None
+
+        try:
+            pickler.dump(obj)
+            return f.getvalue()
+        finally:
+            for model_class, namespace in model_namespaces.items():
+                # Restore original __pydantic_parent_namespace__ locally.
+                model_class.__pydantic_parent_namespace__ = namespace
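
A small usage sketch of the helpers added above (Point is an illustrative model, not part of the package): filtered_cloudpickle_dumps behaves like cloudpickle.dumps, except that it temporarily clears __pydantic_parent_namespace__ on every BaseModel subclass so that models defined in scripts or notebooks pickle cleanly, and then restores the namespaces afterwards:

    import cloudpickle
    from pydantic import BaseModel

    from datachain.utils import filtered_cloudpickle_dumps

    class Point(BaseModel):  # hypothetical model defined in __main__
        x: int
        y: int

    payload = filtered_cloudpickle_dumps(Point(x=1, y=2))
    restored = cloudpickle.loads(payload)
    assert (restored.x, restored.y) == (1, 2)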

datachain-0.2.10.dist-info/METADATA → datachain-0.2.11.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.10
+Version: 0.2.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -34,6 +34,7 @@ Requires-Dist: shtab <2,>=1.3.4
 Requires-Dist: sqlalchemy >=2
 Requires-Dist: multiprocess ==0.70.16
 Requires-Dist: dill ==0.3.8
+Requires-Dist: cloudpickle
 Requires-Dist: ujson >=5.9.0
 Requires-Dist: pydantic <3,>=2
 Requires-Dist: jmespath >=1.0

datachain-0.2.10.dist-info/RECORD → datachain-0.2.11.dist-info/RECORD
@@ -14,7 +14,7 @@ datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=12yQAV8tfyCHqp_xJcJBeNnr1L_BO8e2bOPyXdM68gs,10759
+datachain/utils.py,sha256=AWUXRk7yvDpHcqzzPWwzv8HtF1-jDVEBHKxAgT7u02E,12288
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
 datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
@@ -40,8 +40,7 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
 datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
 datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
-datachain/lib/dc.py,sha256=Px7zj1mrAsL3sBLu1pezssBQkvY0YAoGJm4VbT2yRwc,34699
-datachain/lib/feature_registry.py,sha256=LUrBvDom-k1shFuCv46-OdgntbIUQ5008oyIS0iPM6Q,2298
+datachain/lib/dc.py,sha256=rd-7gVcMRZ2M-O8aQhNx85H31w-kRQHpXSwtf26dSk4,35849
 datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
 datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
 datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
@@ -53,14 +52,14 @@ datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU
 datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
 datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
 datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
-datachain/lib/signal_schema.py,sha256=xzVHauGrhGcS5aOE1UMqT5YjJeZIMAZYQq76tZShhnY,13550
+datachain/lib/signal_schema.py,sha256=mRdq5qEGnFQgbSawzDPi2MCZ6PULTMigd51B2RuNxpg,14173
 datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
 datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
 datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
 datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=eqIDSqfBOhEK43JMp-6lYdYy2x3Ge5lwqR-hKGV8aG0,8259
+datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
@@ -70,8 +69,8 @@ datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0Q
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=Pmaz16phEummJpWJD3x-8SMMbCb6xcOtWTyMdsFOdOE,64414
-datachain/query/dispatch.py,sha256=Qv5QpP5-K9JAmZLntifRzS5_AYHbK82Ahreo302Ntq8,13218
+datachain/query/dataset.py,sha256=P1KBv_R0YnKjNDHzOJwAx9qhwI08l0dLgaXfak3ps7k,60578
+datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
@@ -97,9 +96,9 @@ datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
 datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
-datachain-0.2.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.2.10.dist-info/METADATA,sha256=bWvqTD9c2joLmkDGpdcutjjF_s1LpccbSCLbkIaKQYQ,16732
-datachain-0.2.10.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
-datachain-0.2.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.2.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.2.10.dist-info/RECORD,,
+datachain-0.2.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.11.dist-info/METADATA,sha256=OVKgVc-Wc75AAQIY6hGL1CEBmnwksfgOXfiUen_xAOM,16759
+datachain-0.2.11.dist-info/WHEEL,sha256=FZ75kcLy9M91ncbIgG8dnpCncbiKXSRGJ_PFILs6SFg,91
+datachain-0.2.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.11.dist-info/RECORD,,

datachain-0.2.10.dist-info/WHEEL → datachain-0.2.11.dist-info/WHEEL
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (70.3.0)
+Generator: setuptools (71.0.1)
 Root-Is-Purelib: true
 Tag: py3-none-any
 

datachain/lib/feature_registry.py DELETED
@@ -1,77 +0,0 @@
-import logging
-from typing import Any, ClassVar, Optional
-
-from pydantic import BaseModel
-
-logger = logging.getLogger(__name__)
-
-
-class Registry:
-    reg: ClassVar[dict[str, dict[int, Any]]] = {}
-
-    @classmethod
-    def get_version(cls, model: type[BaseModel]) -> int:
-        if not hasattr(model, "_version"):
-            return 0
-        return model._version
-
-    @classmethod
-    def get_name(cls, model) -> str:
-        if (version := cls.get_version(model)) > 0:
-            return f"{model.__name__}@v{version}"
-        return model.__name__
-
-    @classmethod
-    def add(cls, fr: type):
-        if (model := Registry.to_pydantic(fr)) is None:
-            return
-
-        name = model.__name__
-        if name not in cls.reg:
-            cls.reg[name] = {}
-        version = Registry.get_version(model)
-        cls.reg[name][version] = model
-
-        for f_info in model.model_fields.values():
-            if (anno := Registry.to_pydantic(f_info.annotation)) is not None:
-                cls.add(anno)
-
-    @classmethod
-    def get(cls, name: str, version: Optional[int] = None) -> Optional[type]:
-        class_dict = cls.reg.get(name, None)
-        if class_dict is None:
-            return None
-        if version is None:
-            max_ver = max(class_dict.keys(), default=None)
-            if max_ver is None:
-                return None
-            return class_dict[max_ver]
-        return class_dict.get(version, None)
-
-    @classmethod
-    def parse_name_version(cls, fullname: str) -> tuple[str, int]:
-        name = fullname
-        version = 0
-
-        if "@" in fullname:
-            name, version_str = fullname.split("@")
-            if version_str.strip() != "":
-                version = int(version_str[1:])
-
-        return name, version
-
-    @classmethod
-    def remove(cls, fr: type) -> None:
-        version = fr._version  # type: ignore[attr-defined]
-        if fr.__name__ in cls.reg and version in cls.reg[fr.__name__]:
-            del cls.reg[fr.__name__][version]
-
-    @staticmethod
-    def is_pydantic(val):
-        return not hasattr(val, "__origin__") and issubclass(val, BaseModel)
-
-    @staticmethod
-    def to_pydantic(val) -> Optional[type[BaseModel]]:
-        if val is None or not Registry.is_pydantic(val):
-            return None
-        return val