datachain 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/lib/dc.py +33 -1
- datachain/lib/signal_schema.py +21 -4
- datachain/lib/webdataset.py +4 -3
- datachain/query/dataset.py +15 -122
- datachain/query/dispatch.py +15 -13
- datachain/utils.py +39 -0
- {datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/METADATA +2 -1
- {datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/RECORD +12 -13
- {datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/WHEEL +1 -1
- datachain/lib/feature_registry.py +0 -77
- {datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/LICENSE +0 -0
- {datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED

@@ -11,6 +11,7 @@ from typing import (
     Union,
 )

+import pandas as pd
 import sqlalchemy
 from pydantic import BaseModel, create_model

@@ -38,9 +39,9 @@ from datachain.query.dataset import (
     detach,
 )
 from datachain.query.schema import Column, DatasetRow
+from datachain.utils import inside_notebook

 if TYPE_CHECKING:
-    import pandas as pd
     from typing_extensions import Self

 C = Column
@@ -731,6 +732,37 @@ class DataChain(DatasetQuery):

         return cls.from_values(name, session, object_name=object_name, **fr_map)

+    def to_pandas(self, flatten=False) -> "pd.DataFrame":
+        headers, max_length = self.signals_schema.get_headers_with_length()
+        if flatten or max_length < 2:
+            df = pd.DataFrame.from_records(self.to_records())
+            if headers:
+                df.columns = [".".join(filter(None, header)) for header in headers]
+            return df
+
+        transposed_result = list(map(list, zip(*self.results())))
+        data = {tuple(n): val for n, val in zip(headers, transposed_result)}
+        return pd.DataFrame(data)
+
+    def show(self, limit: int = 20, flatten=False, transpose=False) -> None:
+        dc = self.limit(limit) if limit > 0 else self
+        df = dc.to_pandas(flatten)
+        if transpose:
+            df = df.T
+
+        with pd.option_context(
+            "display.max_columns", None, "display.multi_sparse", False
+        ):
+            if inside_notebook():
+                from IPython.display import display
+
+                display(df)
+            else:
+                print(df)
+
+        if len(df) == limit:
+            print(f"\n[Limited by {len(df)} rows]")
+
     def parse_tabular(
         self,
         output: OutputType = None,
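The two methods added above replace the pandas helpers removed from DatasetQuery later in this diff (see datachain/query/dataset.py). A minimal usage sketch, assuming `from_values` accepts keyword signal values with default name and session as shown elsewhere in this file; the `fib` and `odd` signals are made up for illustration:

from datachain.lib.dc import DataChain

chain = DataChain.from_values(fib=[1, 1, 2, 3, 5, 8], odd=[True, True, False, True, True, False])

df = chain.to_pandas()                # nested signals become multi-level columns
flat = chain.to_pandas(flatten=True)  # single level, "parent.child" column names

chain.show(limit=3)                   # prints up to 3 rows; uses IPython display inside notebooks
chain.show(limit=3, transpose=True)   # one column per row, handy for wide schemas
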
datachain/lib/signal_schema.py
CHANGED

@@ -143,8 +143,8 @@ class SignalSchema:
                 if not fr:
                     raise SignalSchemaError(
                         f"cannot deserialize '{signal}': "
-                        f"
-                        f" Try to
+                        f"unknown type '{type_name}'."
+                        f" Try to add it with `ModelStore.add({type_name})`."
                     )
             except TypeError as err:
                 raise SignalSchemaError(
@@ -192,10 +192,17 @@
     def slice(
         self, keys: Sequence[str], setup: Optional[dict[str, Callable]] = None
     ) -> "SignalSchema":
+        # Make new schema that combines current schema and setup signals
         setup = setup or {}
         setup_no_types = dict.fromkeys(setup.keys(), str)
-        union = self.values | setup_no_types
-
+        union = SignalSchema(self.values | setup_no_types)
+        # Slice combined schema by keys
+        schema = {}
+        for k in keys:
+            try:
+                schema[k] = union._find_in_tree(k.split("."))
+            except SignalResolvingError:
+                pass
         return SignalSchema(schema, setup)

     def row_to_features(
@@ -331,6 +338,16 @@
             sub_schema = SignalSchema({"* list of": args[0]})
             sub_schema.print_tree(indent=indent, start_at=total_indent + indent)

+    def get_headers_with_length(self):
+        paths = [
+            path for path, _, has_subtree, _ in self.get_flat_tree() if not has_subtree
+        ]
+        max_length = max([len(path) for path in paths], default=0)
+        return [
+            path + [""] * (max_length - len(path)) if len(path) < max_length else path
+            for path in paths
+        ], max_length
+
     def __or__(self, other):
         return self.__class__(self.values | other.values)

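For reference, the padding logic in get_headers_with_length is what lets DataChain.to_pandas build a uniform column index: shorter signal paths are right-padded with empty strings up to the deepest path. A standalone sketch of just that step, with made-up leaf paths standing in for the output of get_flat_tree():

# Illustrative leaf paths; the real ones come from SignalSchema.get_flat_tree().
paths = [["file", "path"], ["file", "size"], ["score"]]

max_length = max([len(path) for path in paths], default=0)
headers = [
    path + [""] * (max_length - len(path)) if len(path) < max_length else path
    for path in paths
]

print(headers)     # [['file', 'path'], ['file', 'size'], ['score', '']]
print(max_length)  # 2
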
datachain/lib/webdataset.py
CHANGED

@@ -13,8 +13,9 @@ from typing import (
     get_origin,
 )

-from pydantic import
+from pydantic import Field

+from datachain.lib.data_model import DataModel
 from datachain.lib.file import File, TarVFile
 from datachain.lib.utils import DataChainError

@@ -45,7 +46,7 @@ class UnknownFileExtensionError(WDSError):
         super().__init__(tar_stream, f"unknown extension '{ext}' for file '{name}'")


-class WDSBasic(
+class WDSBasic(DataModel):
     file: File


@@ -74,7 +75,7 @@ class WDSAllFile(WDSBasic):
     cbor: Optional[bytes] = Field(default=None)


-class WDSReadableSubclass(
+class WDSReadableSubclass(DataModel):
     @staticmethod
     def _reader(builder, item: tarfile.TarInfo) -> "WDSReadableSubclass":
         raise NotImplementedError
datachain/query/dataset.py
CHANGED

@@ -1,4 +1,3 @@
-import ast
 import contextlib
 import datetime
 import inspect
@@ -10,7 +9,6 @@ import re
 import string
 import subprocess
 import sys
-import types
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
@@ -26,12 +24,9 @@ from typing import (
 )

 import attrs
-import pandas as pd
 import sqlalchemy
 from attrs import frozen
-from dill import dumps, source
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback, TqdmCallback
-from pydantic import BaseModel
 from sqlalchemy import Column
 from sqlalchemy.sql import func as f
 from sqlalchemy.sql.elements import ColumnClause, ColumnElement
@@ -53,10 +48,13 @@ from datachain.data_storage.schema import (
 from datachain.dataset import DatasetStatus, RowDict
 from datachain.error import DatasetNotFoundError, QueryScriptCancelError
 from datachain.progress import CombinedDownloadCallback
-from datachain.query.schema import DEFAULT_DELIMITER
 from datachain.sql.functions import rand
 from datachain.storage import Storage, StorageURI
-from datachain.utils import
+from datachain.utils import (
+    batched,
+    determine_processes,
+    filtered_cloudpickle_dumps,
+)

 from .metrics import metrics
 from .schema import C, UDFParamSpec, normalize_param
@@ -492,7 +490,7 @@ class UDF(Step, ABC):
         elif processes:
             # Parallel processing (faster for more CPU-heavy UDFs)
             udf_info = {
-                "
+                "udf_data": filtered_cloudpickle_dumps(self.udf),
                 "catalog_init": self.catalog.get_init_params(),
                 "id_generator_clone_params": (
                     self.catalog.id_generator.clone_params()
@@ -513,16 +511,15 @@ class UDF(Step, ABC):

             envs = dict(os.environ)
             envs.update({"PYTHONPATH": os.getcwd()})
-
-
-
-
-
-
-
-
-
-                raise RuntimeError("UDF Execution Failed!")
+            process_data = filtered_cloudpickle_dumps(udf_info)
+            result = subprocess.run(  # noqa: S603
+                [datachain_exec_path, "--internal-run-udf"],
+                input=process_data,
+                check=False,
+                env=envs,
+            )
+            if result.returncode != 0:
+                raise RuntimeError("UDF Execution Failed!")

         else:
             # Otherwise process single-threaded (faster for smaller UDFs)
@@ -571,57 +568,6 @@ class UDF(Step, ABC):
             self.catalog.warehouse.close()
             raise

-    @contextlib.contextmanager
-    def process_feature_module(self):
-        # Generate a random name for the feature module
-        feature_module_name = "tmp" + _random_string(10)
-        # Create a dynamic module with the generated name
-        dynamic_module = types.ModuleType(feature_module_name)
-        # Get the import lines for the necessary objects from the main module
-        main_module = sys.modules["__main__"]
-        if getattr(main_module, "__file__", None):
-            import_lines = list(get_imports(main_module))
-        else:
-            import_lines = [
-                source.getimport(obj, alias=name)
-                for name, obj in main_module.__dict__.items()
-                if _imports(obj) and not (name.startswith("__") and name.endswith("__"))
-            ]
-
-        # Get the feature classes from the main module
-        feature_classes = {
-            name: obj
-            for name, obj in main_module.__dict__.items()
-            if _feature_predicate(obj)
-        }
-        if not feature_classes:
-            yield None
-            return
-
-        # Get the source code of the feature classes
-        feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
-        # Set the module name for the feature classes to the generated name
-        for name, cls in feature_classes.items():
-            cls.__module__ = feature_module_name
-            setattr(dynamic_module, name, cls)
-        # Add the dynamic module to the sys.modules dictionary
-        sys.modules[feature_module_name] = dynamic_module
-        # Combine the import lines and feature sources
-        feature_file = "\n".join(import_lines) + "\n" + "\n".join(feature_sources)
-
-        # Write the module content to a .py file
-        with open(f"{feature_module_name}.py", "w") as module_file:
-            module_file.write(feature_file)
-
-        try:
-            yield feature_module_name
-        finally:
-            for cls in feature_classes.values():
-                cls.__module__ = main_module.__name__
-            os.unlink(f"{feature_module_name}.py")
-            # Remove the dynamic module from sys.modules
-            del sys.modules[feature_module_name]
-
     def create_partitions_table(self, query: Select) -> "Table":
         """
         Create temporary table with group by partitions.
@@ -1346,12 +1292,6 @@ class DatasetQuery:
     def to_records(self) -> list[dict[str, Any]]:
         return self.results(lambda cols, row: dict(zip(cols, row)))

-    def to_pandas(self) -> "pd.DataFrame":
-        records = self.to_records()
-        df = pd.DataFrame.from_records(records)
-        df.columns = [c.replace(DEFAULT_DELIMITER, ".") for c in df.columns]
-        return df
-
     def shuffle(self) -> "Self":
         # ToDo: implement shaffle based on seed and/or generating random column
         return self.order_by(C.sys__rand)
@@ -1370,22 +1310,6 @@

         return sampled.limit(n)

-    def show(self, limit=20) -> None:
-        df = self.limit(limit).to_pandas()
-
-        options = ["display.max_colwidth", 50, "display.show_dimensions", False]
-        with pd.option_context(*options):
-            if inside_notebook():
-                from IPython.display import display
-
-                display(df)
-
-            else:
-                print(df.to_string())
-
-        if len(df) == limit:
-            print(f"[limited by {limit} objects]")
-
     def clone(self, new_table=True) -> "Self":
         obj = copy(self)
         obj.steps = obj.steps.copy()
@@ -1853,34 +1777,3 @@ def _random_string(length: int) -> str:
         random.choice(string.ascii_letters + string.digits)  # noqa: S311
         for i in range(length)
     )
-
-
-def _feature_predicate(obj):
-    return (
-        inspect.isclass(obj) and source.isfrommain(obj) and issubclass(obj, BaseModel)
-    )
-
-
-def _imports(obj):
-    return not source.isfrommain(obj)
-
-
-def get_imports(m):
-    root = ast.parse(inspect.getsource(m))
-
-    for node in ast.iter_child_nodes(root):
-        if isinstance(node, ast.Import):
-            module = None
-        elif isinstance(node, ast.ImportFrom):
-            module = node.module
-        else:
-            continue
-
-        for n in node.names:
-            import_script = ""
-            if module:
-                import_script += f"from {module} "
-            import_script += f"import {n.name}"
-            if n.asname:
-                import_script += f" as {n.asname}"
-            yield import_script
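Taken together, the dataset.py changes drop the old dill-based feature-module rewriting and hand the UDF to the worker process as cloudpickle bytes on stdin. A rough sketch of that hand-off pattern in isolation; the inline worker below is a stand-in, not the real `datachain --internal-run-udf` entrypoint:

import subprocess
import sys

import cloudpickle

# Mirrors udf_info above: the UDF itself is pickled separately under "udf_data".
payload = cloudpickle.dumps({"udf_data": cloudpickle.dumps(lambda x: x * 2)})

# Stand-in worker: read the pickled dict from stdin, rebuild the UDF, run it once.
worker_src = (
    "import sys, cloudpickle\n"
    "info = cloudpickle.load(sys.stdin.buffer)\n"
    "udf = cloudpickle.loads(info['udf_data'])\n"
    "print(udf(21))\n"
)

result = subprocess.run(
    [sys.executable, "-c", worker_src],
    input=payload,
    check=False,
    capture_output=True,
)
if result.returncode != 0:
    raise RuntimeError("UDF Execution Failed!")
print(result.stdout.decode().strip())  # 42
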
datachain/query/dispatch.py
CHANGED

@@ -10,7 +10,7 @@ from typing import Any, Optional

 import attrs
 import multiprocess
-from
+from cloudpickle import load, loads
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
 from multiprocess import get_context

@@ -84,7 +84,7 @@ def put_into_queue(queue: Queue, item: Any) -> None:

 def udf_entrypoint() -> int:
     # Load UDF info from stdin
-    udf_info = load(stdin.buffer)
+    udf_info = load(stdin.buffer)

     (
         warehouse_class,
@@ -95,7 +95,7 @@ def udf_entrypoint() -> int:

     # Parallel processing (faster for more CPU-heavy UDFs)
     dispatch = UDFDispatcher(
-        udf_info["
+        udf_info["udf_data"],
         udf_info["catalog_init"],
         udf_info["id_generator_clone_params"],
         udf_info["metastore_clone_params"],
@@ -108,7 +108,7 @@ def udf_entrypoint() -> int:
     batching = udf_info["batching"]
     table = udf_info["table"]
     n_workers = udf_info["processes"]
-    udf = udf_info["
+    udf = loads(udf_info["udf_data"])
     if n_workers is True:
         # Use default number of CPUs (cores)
         n_workers = None
@@ -146,7 +146,7 @@ class UDFDispatcher:

     def __init__(
         self,
-
+        udf_data,
         catalog_init_params,
         id_generator_clone_params,
         metastore_clone_params,
@@ -155,14 +155,7 @@
         is_generator=False,
         buffer_size=DEFAULT_BATCH_SIZE,
     ):
-
-        # and so these two types are not considered exactly equal,
-        # even if they have the same import path.
-        if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
-            self.udf = udf
-        else:
-            self.udf = None
-            self.udf_factory = udf
+        self.udf_data = udf_data
         self.catalog_init_params = catalog_init_params
         (
             self.id_generator_class,
@@ -214,6 +207,15 @@ class UDFDispatcher:
         self.catalog = Catalog(
             id_generator, metastore, warehouse, **self.catalog_init_params
         )
+        udf = loads(self.udf_data)
+        # isinstance cannot be used here, as cloudpickle packages the entire class
+        # definition, and so these two types are not considered exactly equal,
+        # even if they have the same import path.
+        if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
+            self.udf = udf
+        else:
+            self.udf = None
+            self.udf_factory = udf
         if not self.udf:
             self.udf = self.udf_factory()

datachain/utils.py
CHANGED

@@ -1,5 +1,6 @@
 import glob
 import importlib.util
+import io
 import json
 import os
 import os.path as osp
@@ -13,8 +14,10 @@ from itertools import islice
 from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID

+import cloudpickle
 from dateutil import tz
 from dateutil.parser import isoparse
+from pydantic import BaseModel

 if TYPE_CHECKING:
     import pandas as pd
@@ -388,3 +391,39 @@ def inside_notebook() -> bool:
             return False

     return False
+
+
+def get_all_subclasses(cls):
+    """Return all subclasses of a given class.
+    Can return duplicates due to multiple inheritance."""
+    for subclass in cls.__subclasses__():
+        yield from get_all_subclasses(subclass)
+        yield subclass
+
+
+def filtered_cloudpickle_dumps(obj: Any) -> bytes:
+    """Equivalent to cloudpickle.dumps, but this supports Pydantic models."""
+    model_namespaces = {}
+
+    with io.BytesIO() as f:
+        pickler = cloudpickle.CloudPickler(f)
+
+        for model_class in get_all_subclasses(BaseModel):
+            # This "is not None" check is needed, because due to multiple inheritance,
+            # it is theoretically possible to get the same class twice from
+            # get_all_subclasses.
+            if model_class.__pydantic_parent_namespace__ is not None:
+                # __pydantic_parent_namespace__ can contain many unnecessary and
+                # unpickleable entities, so should be removed for serialization.
+                model_namespaces[model_class] = (
+                    model_class.__pydantic_parent_namespace__
+                )
+                model_class.__pydantic_parent_namespace__ = None
+
+        try:
+            pickler.dump(obj)
+            return f.getvalue()
+        finally:
+            for model_class, namespace in model_namespaces.items():
+                # Restore original __pydantic_parent_namespace__ locally.
+                model_class.__pydantic_parent_namespace__ = namespace
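A short usage sketch of the new helper: filtered_cloudpickle_dumps exists because `__pydantic_parent_namespace__` on Pydantic model classes can hold unpicklable objects, so it is blanked for the dump and restored afterwards. The `Point` model here is illustrative only, not part of datachain:

import cloudpickle
from pydantic import BaseModel

from datachain.utils import filtered_cloudpickle_dumps


class Point(BaseModel):  # made-up model for illustration
    x: int
    y: int


payload = filtered_cloudpickle_dumps(Point(x=1, y=2))
restored = cloudpickle.loads(payload)

print(restored)                 # x=1 y=2
print(type(restored).__name__)  # Point
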
{datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.10
+Version: 0.2.11
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -34,6 +34,7 @@ Requires-Dist: shtab <2,>=1.3.4
 Requires-Dist: sqlalchemy >=2
 Requires-Dist: multiprocess ==0.70.16
 Requires-Dist: dill ==0.3.8
+Requires-Dist: cloudpickle
 Requires-Dist: ujson >=5.9.0
 Requires-Dist: pydantic <3,>=2
 Requires-Dist: jmespath >=1.0
{datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/RECORD
CHANGED

@@ -14,7 +14,7 @@ datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
-datachain/utils.py,sha256=
+datachain/utils.py,sha256=AWUXRk7yvDpHcqzzPWwzv8HtF1-jDVEBHKxAgT7u02E,12288
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
 datachain/catalog/catalog.py,sha256=A5W9Ffoz1lZkzl6A3igaMC5jrus8VIYVLJLX8JTVKrk,79603
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
@@ -40,8 +40,7 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=ttSiH8Xr08zxypAa3-BNTxMO2NBuZfYICwmG1qQwvWU,3268
 datachain/lib/clip.py,sha256=YRa15Whnn6C8BMA-OAu0mYjc4h9i_n7pffRGdtfrTBA,5222
 datachain/lib/data_model.py,sha256=DpV_-1JqJptCf0w4cnzPlHm5Yl4FQaveRgVCDZFaHXs,2012
-datachain/lib/dc.py,sha256=
-datachain/lib/feature_registry.py,sha256=LUrBvDom-k1shFuCv46-OdgntbIUQ5008oyIS0iPM6Q,2298
+datachain/lib/dc.py,sha256=rd-7gVcMRZ2M-O8aQhNx85H31w-kRQHpXSwtf26dSk4,35849
 datachain/lib/file.py,sha256=Uik1sq2l-uknpikH4Gdm7ZR0EcQYP2TrNg-urECjbW4,8304
 datachain/lib/gpt4_vision.py,sha256=CZ-a64olZNp9TNmLGngmbN6b02UYImzwK3dPClnjxTI,2716
 datachain/lib/hf_image_to_text.py,sha256=uVl4mnUl8gnHrJ3wfSZlxBevH-cxqOswxLArLAHxRrE,3077
@@ -53,14 +52,14 @@ datachain/lib/meta_formats.py,sha256=SF7UPPe-U-1HL6DBO1NfwZLIChjkHrHasgHf5ztCUoU
 datachain/lib/model_store.py,sha256=JFpI1P0WFpsO6eAU49AdWmff5T8azqLrqOMB08pYJjg,2331
 datachain/lib/pytorch.py,sha256=7fd2g0dI9zrMfRl3IVwIvXRH0v6TwSAyZGAbqKdEjcI,5505
 datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=mRdq5qEGnFQgbSawzDPi2MCZ6PULTMigd51B2RuNxpg,14173
 datachain/lib/text.py,sha256=d2V-52cqzVm5OT68BcLYyHrglvFMVR5DPzsbtRRv3D0,1063
 datachain/lib/udf.py,sha256=RqCiGuNKL5P8eS84s_mmVYjK1gvkuRYdnIKm9qe-i2U,9698
 datachain/lib/udf_signature.py,sha256=R81QqZseG_xeBFzJSgt-wrTQeUU-1RrWkHckLm_HEUU,7135
 datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/webdataset.py,sha256=
+datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=XdAj0f9W32ABjOo8UyYm0y0H_yHDn3qEHERTyXuhJxk,1592
@@ -70,8 +69,8 @@ datachain/lib/convert/values_to_tuples.py,sha256=MWz9pHT-AaPQN8hNMUYfuOHstyuNv0Q
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=
-datachain/query/dispatch.py,sha256=
+datachain/query/dataset.py,sha256=P1KBv_R0YnKjNDHzOJwAx9qhwI08l0dLgaXfak3ps7k,60578
+datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
@@ -97,9 +96,9 @@ datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/text/__init__.py,sha256=-yxHL2gVl3H0Zxam6iWUO6F1Mc4QAFHX6z-5fjHND74,72
 datachain/torch/__init__.py,sha256=9QJW8h0FevIXEykRsxQ7XzMDXvdIkv3kVf_UY95CTyg,600
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
-datachain-0.2.
+datachain-0.2.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.11.dist-info/METADATA,sha256=OVKgVc-Wc75AAQIY6hGL1CEBmnwksfgOXfiUen_xAOM,16759
+datachain-0.2.11.dist-info/WHEEL,sha256=FZ75kcLy9M91ncbIgG8dnpCncbiKXSRGJ_PFILs6SFg,91
+datachain-0.2.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.11.dist-info/RECORD,,
datachain/lib/feature_registry.py
DELETED

@@ -1,77 +0,0 @@
-import logging
-from typing import Any, ClassVar, Optional
-
-from pydantic import BaseModel
-
-logger = logging.getLogger(__name__)
-
-
-class Registry:
-    reg: ClassVar[dict[str, dict[int, Any]]] = {}
-
-    @classmethod
-    def get_version(cls, model: type[BaseModel]) -> int:
-        if not hasattr(model, "_version"):
-            return 0
-        return model._version
-
-    @classmethod
-    def get_name(cls, model) -> str:
-        if (version := cls.get_version(model)) > 0:
-            return f"{model.__name__}@v{version}"
-        return model.__name__
-
-    @classmethod
-    def add(cls, fr: type):
-        if (model := Registry.to_pydantic(fr)) is None:
-            return
-
-        name = model.__name__
-        if name not in cls.reg:
-            cls.reg[name] = {}
-        version = Registry.get_version(model)
-        cls.reg[name][version] = model
-
-        for f_info in model.model_fields.values():
-            if (anno := Registry.to_pydantic(f_info.annotation)) is not None:
-                cls.add(anno)
-
-    @classmethod
-    def get(cls, name: str, version: Optional[int] = None) -> Optional[type]:
-        class_dict = cls.reg.get(name, None)
-        if class_dict is None:
-            return None
-        if version is None:
-            max_ver = max(class_dict.keys(), default=None)
-            if max_ver is None:
-                return None
-            return class_dict[max_ver]
-        return class_dict.get(version, None)
-
-    @classmethod
-    def parse_name_version(cls, fullname: str) -> tuple[str, int]:
-        name = fullname
-        version = 0
-
-        if "@" in fullname:
-            name, version_str = fullname.split("@")
-            if version_str.strip() != "":
-                version = int(version_str[1:])
-
-        return name, version
-
-    @classmethod
-    def remove(cls, fr: type) -> None:
-        version = fr._version  # type: ignore[attr-defined]
-        if fr.__name__ in cls.reg and version in cls.reg[fr.__name__]:
-            del cls.reg[fr.__name__][version]
-
-    @staticmethod
-    def is_pydantic(val):
-        return not hasattr(val, "__origin__") and issubclass(val, BaseModel)
-
-    @staticmethod
-    def to_pydantic(val) -> Optional[type[BaseModel]]:
-        if val is None or not Registry.is_pydantic(val):
-            return None
-        return val
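The deleted Registry above lines up with the ModelStore-based flow referenced in the new signal_schema.py error message, while datachain/lib/model_store.py remains in RECORD. A hedged sketch of the registration call that message suggests; the import path is inferred from the RECORD entry and MyModel is a placeholder:

from pydantic import BaseModel

from datachain.lib.model_store import ModelStore  # path inferred from the RECORD entry above


class MyModel(BaseModel):  # placeholder custom signal type
    name: str


# Register the type so SignalSchema can resolve signals of this type, as the
# new error message ("Try to add it with `ModelStore.add(...)`") suggests.
ModelStore.add(MyModel)
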
{datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/LICENSE
File without changes

{datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/entry_points.txt
File without changes

{datachain-0.2.10.dist-info → datachain-0.2.11.dist-info}/top_level.txt
File without changes