datachain 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +30 -6
- datachain/data_storage/db_engine.py +0 -2
- datachain/data_storage/schema.py +10 -27
- datachain/data_storage/warehouse.py +1 -7
- datachain/lib/arrow.py +7 -13
- datachain/lib/clip.py +151 -0
- datachain/lib/dc.py +35 -57
- datachain/lib/feature_utils.py +1 -2
- datachain/lib/file.py +7 -0
- datachain/lib/image.py +37 -79
- datachain/lib/pytorch.py +4 -2
- datachain/lib/signal_schema.py +2 -47
- datachain/lib/text.py +18 -49
- datachain/lib/udf.py +58 -30
- datachain/lib/udf_signature.py +11 -10
- datachain/lib/utils.py +17 -0
- datachain/lib/webdataset.py +2 -2
- datachain/listing.py +0 -3
- datachain/query/dataset.py +63 -37
- datachain/query/dispatch.py +2 -2
- datachain/query/schema.py +1 -8
- datachain/query/udf.py +16 -18
- datachain/utils.py +28 -0
- {datachain-0.2.1.dist-info → datachain-0.2.3.dist-info}/METADATA +2 -1
- {datachain-0.2.1.dist-info → datachain-0.2.3.dist-info}/RECORD +29 -29
- {datachain-0.2.1.dist-info → datachain-0.2.3.dist-info}/WHEEL +1 -1
- datachain/lib/reader.py +0 -49
- {datachain-0.2.1.dist-info → datachain-0.2.3.dist-info}/LICENSE +0 -0
- {datachain-0.2.1.dist-info → datachain-0.2.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.1.dist-info → datachain-0.2.3.dist-info}/top_level.txt +0 -0
datachain/query/dataset.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import ast
|
|
1
2
|
import contextlib
|
|
2
3
|
import datetime
|
|
3
4
|
import inspect
|
|
@@ -51,9 +52,10 @@ from datachain.data_storage.schema import (
|
|
|
51
52
|
from datachain.dataset import DatasetStatus, RowDict
|
|
52
53
|
from datachain.error import DatasetNotFoundError, QueryScriptCancelError
|
|
53
54
|
from datachain.progress import CombinedDownloadCallback
|
|
55
|
+
from datachain.query.schema import DEFAULT_DELIMITER
|
|
54
56
|
from datachain.sql.functions import rand
|
|
55
57
|
from datachain.storage import Storage, StorageURI
|
|
56
|
-
from datachain.utils import batched, determine_processes
|
|
58
|
+
from datachain.utils import batched, determine_processes, inside_notebook
|
|
57
59
|
|
|
58
60
|
from .batch import RowBatch
|
|
59
61
|
from .metrics import metrics
|
|
@@ -62,7 +64,6 @@ from .session import Session
|
|
|
62
64
|
from .udf import UDFBase, UDFClassWrapper, UDFFactory, UDFType
|
|
63
65
|
|
|
64
66
|
if TYPE_CHECKING:
|
|
65
|
-
import pandas as pd
|
|
66
67
|
from sqlalchemy.sql.elements import ClauseElement
|
|
67
68
|
from sqlalchemy.sql.schema import Table
|
|
68
69
|
from sqlalchemy.sql.selectable import GenerativeSelect
|
|
@@ -547,8 +548,9 @@ class UDF(Step, ABC):
|
|
|
547
548
|
else:
|
|
548
549
|
udf = self.udf
|
|
549
550
|
|
|
550
|
-
if hasattr(udf.func, "
|
|
551
|
-
udf.func.
|
|
551
|
+
if hasattr(udf.func, "setup") and callable(udf.func.setup):
|
|
552
|
+
udf.func.setup()
|
|
553
|
+
|
|
552
554
|
warehouse = self.catalog.warehouse
|
|
553
555
|
|
|
554
556
|
with contextlib.closing(
|
|
@@ -599,12 +601,15 @@ class UDF(Step, ABC):
|
|
|
599
601
|
# Create a dynamic module with the generated name
|
|
600
602
|
dynamic_module = types.ModuleType(feature_module_name)
|
|
601
603
|
# Get the import lines for the necessary objects from the main module
|
|
602
|
-
import_lines = [
|
|
603
|
-
source.getimport(obj, alias=name)
|
|
604
|
-
for name, obj in inspect.getmembers(sys.modules["__main__"], _imports)
|
|
605
|
-
if not (name.startswith("__") and name.endswith("__"))
|
|
606
|
-
]
|
|
607
604
|
main_module = sys.modules["__main__"]
|
|
605
|
+
if getattr(main_module, "__file__", None):
|
|
606
|
+
import_lines = list(get_imports(main_module))
|
|
607
|
+
else:
|
|
608
|
+
import_lines = [
|
|
609
|
+
source.getimport(obj, alias=name)
|
|
610
|
+
for name, obj in main_module.__dict__.items()
|
|
611
|
+
if _imports(obj) and not (name.startswith("__") and name.endswith("__"))
|
|
612
|
+
]
|
|
608
613
|
|
|
609
614
|
# Get the feature classes from the main module
|
|
610
615
|
feature_classes = {
|
|
@@ -612,6 +617,10 @@ class UDF(Step, ABC):
|
|
|
612
617
|
for name, obj in main_module.__dict__.items()
|
|
613
618
|
if _feature_predicate(obj)
|
|
614
619
|
}
|
|
620
|
+
if not feature_classes:
|
|
621
|
+
yield None
|
|
622
|
+
return
|
|
623
|
+
|
|
615
624
|
# Get the source code of the feature classes
|
|
616
625
|
feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
|
|
617
626
|
# Set the module name for the feature classes to the generated name
|
|
@@ -621,7 +630,7 @@ class UDF(Step, ABC):
|
|
|
621
630
|
# Add the dynamic module to the sys.modules dictionary
|
|
622
631
|
sys.modules[feature_module_name] = dynamic_module
|
|
623
632
|
# Combine the import lines and feature sources
|
|
624
|
-
feature_file = "".join(import_lines) + "\n".join(feature_sources)
|
|
633
|
+
feature_file = "\n".join(import_lines) + "\n" + "\n".join(feature_sources)
|
|
625
634
|
|
|
626
635
|
# Write the module content to a .py file
|
|
627
636
|
with open(f"{feature_module_name}.py", "w") as module_file:
|
|
@@ -1362,33 +1371,11 @@ class DatasetQuery:
|
|
|
1362
1371
|
cols = result.columns
|
|
1363
1372
|
return [dict(zip(cols, row)) for row in result]
|
|
1364
1373
|
|
|
1365
|
-
@classmethod
|
|
1366
|
-
def create_empty_record(
|
|
1367
|
-
cls, name: Optional[str] = None, session: Optional[Session] = None
|
|
1368
|
-
) -> "DatasetRecord":
|
|
1369
|
-
session = Session.get(session)
|
|
1370
|
-
if name is None:
|
|
1371
|
-
name = session.generate_temp_dataset_name()
|
|
1372
|
-
columns = session.catalog.warehouse.dataset_row_cls.file_columns()
|
|
1373
|
-
return session.catalog.create_dataset(name, columns=columns)
|
|
1374
|
-
|
|
1375
|
-
@classmethod
|
|
1376
|
-
def insert_record(
|
|
1377
|
-
cls,
|
|
1378
|
-
dsr: "DatasetRecord",
|
|
1379
|
-
record: dict[str, Any],
|
|
1380
|
-
session: Optional[Session] = None,
|
|
1381
|
-
) -> None:
|
|
1382
|
-
session = Session.get(session)
|
|
1383
|
-
dr = session.catalog.warehouse.dataset_rows(dsr)
|
|
1384
|
-
insert_q = dr.get_table().insert().values(**record)
|
|
1385
|
-
session.catalog.warehouse.db.execute(insert_q)
|
|
1386
|
-
|
|
1387
1374
|
def to_pandas(self) -> "pd.DataFrame":
|
|
1388
|
-
import pandas as pd
|
|
1389
|
-
|
|
1390
1375
|
records = self.to_records()
|
|
1391
|
-
|
|
1376
|
+
df = pd.DataFrame.from_records(records)
|
|
1377
|
+
df.columns = [c.replace(DEFAULT_DELIMITER, ".") for c in df.columns]
|
|
1378
|
+
return df
|
|
1392
1379
|
|
|
1393
1380
|
def shuffle(self) -> "Self":
|
|
1394
1381
|
# ToDo: implement shaffle based on seed and/or generating random column
|
|
@@ -1410,8 +1397,17 @@ class DatasetQuery:
|
|
|
1410
1397
|
|
|
1411
1398
|
def show(self, limit=20) -> None:
|
|
1412
1399
|
df = self.limit(limit).to_pandas()
|
|
1413
|
-
|
|
1414
|
-
|
|
1400
|
+
|
|
1401
|
+
options = ["display.max_colwidth", 50, "display.show_dimensions", False]
|
|
1402
|
+
with pd.option_context(*options):
|
|
1403
|
+
if inside_notebook():
|
|
1404
|
+
from IPython.display import display
|
|
1405
|
+
|
|
1406
|
+
display(df)
|
|
1407
|
+
|
|
1408
|
+
else:
|
|
1409
|
+
print(df.to_string())
|
|
1410
|
+
|
|
1415
1411
|
if len(df) == limit:
|
|
1416
1412
|
print(f"[limited by {limit} objects]")
|
|
1417
1413
|
|
|
@@ -1692,6 +1688,15 @@ class DatasetQuery:
|
|
|
1692
1688
|
storage.timestamp_str,
|
|
1693
1689
|
)
|
|
1694
1690
|
|
|
1691
|
+
def exec(self) -> "Self":
|
|
1692
|
+
"""Execute the query."""
|
|
1693
|
+
try:
|
|
1694
|
+
query = self.clone()
|
|
1695
|
+
query.apply_steps()
|
|
1696
|
+
finally:
|
|
1697
|
+
self.cleanup()
|
|
1698
|
+
return query
|
|
1699
|
+
|
|
1695
1700
|
def save(
|
|
1696
1701
|
self,
|
|
1697
1702
|
name: Optional[str] = None,
|
|
@@ -1878,3 +1883,24 @@ def _feature_predicate(obj):
|
|
|
1878
1883
|
|
|
1879
1884
|
def _imports(obj):
|
|
1880
1885
|
return not source.isfrommain(obj)
|
|
1886
|
+
|
|
1887
|
+
|
|
1888
|
+
def get_imports(m):
|
|
1889
|
+
root = ast.parse(inspect.getsource(m))
|
|
1890
|
+
|
|
1891
|
+
for node in ast.iter_child_nodes(root):
|
|
1892
|
+
if isinstance(node, ast.Import):
|
|
1893
|
+
module = None
|
|
1894
|
+
elif isinstance(node, ast.ImportFrom):
|
|
1895
|
+
module = node.module
|
|
1896
|
+
else:
|
|
1897
|
+
continue
|
|
1898
|
+
|
|
1899
|
+
for n in node.names:
|
|
1900
|
+
import_script = ""
|
|
1901
|
+
if module:
|
|
1902
|
+
import_script += f"from {module} "
|
|
1903
|
+
import_script += f"import {n.name}"
|
|
1904
|
+
if n.asname:
|
|
1905
|
+
import_script += f" as {n.asname}"
|
|
1906
|
+
yield import_script
|
datachain/query/dispatch.py
CHANGED
|
@@ -370,8 +370,8 @@ class UDFWorker:
|
|
|
370
370
|
return WorkerCallback(self.done_queue)
|
|
371
371
|
|
|
372
372
|
def run(self) -> None:
|
|
373
|
-
if hasattr(self.udf.func, "
|
|
374
|
-
self.udf.func.
|
|
373
|
+
if hasattr(self.udf.func, "setup") and callable(self.udf.func.setup):
|
|
374
|
+
self.udf.func.setup()
|
|
375
375
|
while (batch := get_from_queue(self.task_queue)) != STOP_SIGNAL:
|
|
376
376
|
n_rows = len(batch.rows) if isinstance(batch, RowBatch) else 1
|
|
377
377
|
udf_output = self.udf(
|
datachain/query/schema.py
CHANGED
|
@@ -3,14 +3,12 @@ import json
|
|
|
3
3
|
from abc import ABC, abstractmethod
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
5
|
from fnmatch import fnmatch
|
|
6
|
-
from random import getrandbits
|
|
7
6
|
from typing import TYPE_CHECKING, Any, Callable, ClassVar, Optional, Union
|
|
8
7
|
|
|
9
8
|
import attrs
|
|
10
9
|
import sqlalchemy as sa
|
|
11
10
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
12
11
|
|
|
13
|
-
from datachain.data_storage.warehouse import RANDOM_BITS
|
|
14
12
|
from datachain.sql.types import JSON, Boolean, DateTime, Int, Int64, SQLType, String
|
|
15
13
|
|
|
16
14
|
if TYPE_CHECKING:
|
|
@@ -217,7 +215,7 @@ class DatasetRow:
|
|
|
217
215
|
"source": String,
|
|
218
216
|
"parent": String,
|
|
219
217
|
"name": String,
|
|
220
|
-
"size":
|
|
218
|
+
"size": Int64,
|
|
221
219
|
"location": JSON,
|
|
222
220
|
"vtype": String,
|
|
223
221
|
"dir_type": Int,
|
|
@@ -227,8 +225,6 @@ class DatasetRow:
|
|
|
227
225
|
"last_modified": DateTime,
|
|
228
226
|
"version": String,
|
|
229
227
|
"etag": String,
|
|
230
|
-
# system column
|
|
231
|
-
"random": Int64,
|
|
232
228
|
}
|
|
233
229
|
|
|
234
230
|
@staticmethod
|
|
@@ -267,8 +263,6 @@ class DatasetRow:
|
|
|
267
263
|
|
|
268
264
|
last_modified = last_modified or datetime.now(timezone.utc)
|
|
269
265
|
|
|
270
|
-
random = getrandbits(RANDOM_BITS)
|
|
271
|
-
|
|
272
266
|
return ( # type: ignore [return-value]
|
|
273
267
|
source,
|
|
274
268
|
parent,
|
|
@@ -283,7 +277,6 @@ class DatasetRow:
|
|
|
283
277
|
last_modified,
|
|
284
278
|
version,
|
|
285
279
|
etag,
|
|
286
|
-
random,
|
|
287
280
|
)
|
|
288
281
|
|
|
289
282
|
@staticmethod
|
datachain/query/udf.py
CHANGED
|
@@ -14,6 +14,7 @@ from typing import (
|
|
|
14
14
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
15
15
|
|
|
16
16
|
from datachain.dataset import RowDict
|
|
17
|
+
from datachain.lib.utils import AbstractUDF
|
|
17
18
|
|
|
18
19
|
from .batch import Batch, BatchingStrategy, NoBatching, Partition, RowBatch
|
|
19
20
|
from .schema import (
|
|
@@ -58,14 +59,6 @@ class UDFProperties:
|
|
|
58
59
|
def signal_names(self) -> Iterable[str]:
|
|
59
60
|
return self.output.keys()
|
|
60
61
|
|
|
61
|
-
def parameter_parser(self) -> Callable:
|
|
62
|
-
"""Generate a parameter list from a dataset row."""
|
|
63
|
-
|
|
64
|
-
def plist(catalog: "Catalog", row: "RowDict", **kwargs) -> list:
|
|
65
|
-
return [p.get_value(catalog, row, **kwargs) for p in self.params]
|
|
66
|
-
|
|
67
|
-
return plist
|
|
68
|
-
|
|
69
62
|
|
|
70
63
|
def udf(
|
|
71
64
|
params: Sequence[UDFParamSpec],
|
|
@@ -113,32 +106,37 @@ class UDFBase:
|
|
|
113
106
|
self.func = func
|
|
114
107
|
self.properties = properties
|
|
115
108
|
self.signal_names = properties.signal_names()
|
|
116
|
-
self.parameter_parser = properties.parameter_parser()
|
|
117
109
|
self.output = properties.output
|
|
118
110
|
|
|
119
111
|
def __call__(
|
|
120
112
|
self,
|
|
121
113
|
catalog: "Catalog",
|
|
122
|
-
|
|
114
|
+
arg: "BatchingResult",
|
|
123
115
|
is_generator: bool = False,
|
|
124
116
|
cache: bool = False,
|
|
125
117
|
cb: Callback = DEFAULT_CALLBACK,
|
|
126
118
|
) -> Iterable[UDFResult]:
|
|
127
|
-
if isinstance(
|
|
119
|
+
if isinstance(self.func, AbstractUDF):
|
|
120
|
+
self.func._catalog = catalog # type: ignore[unreachable]
|
|
121
|
+
|
|
122
|
+
if isinstance(arg, RowBatch):
|
|
128
123
|
udf_inputs = [
|
|
129
|
-
self.
|
|
130
|
-
for row in
|
|
124
|
+
self.bind_parameters(catalog, row, cache=cache, cb=cb)
|
|
125
|
+
for row in arg.rows
|
|
131
126
|
]
|
|
132
127
|
udf_outputs = self.func(udf_inputs)
|
|
133
|
-
return self._process_results(
|
|
134
|
-
if isinstance(
|
|
135
|
-
udf_inputs = self.
|
|
128
|
+
return self._process_results(arg.rows, udf_outputs, is_generator)
|
|
129
|
+
if isinstance(arg, RowDict):
|
|
130
|
+
udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
|
|
136
131
|
udf_outputs = self.func(*udf_inputs)
|
|
137
132
|
if not is_generator:
|
|
138
133
|
# udf_outputs is generator already if is_generator=True
|
|
139
134
|
udf_outputs = [udf_outputs]
|
|
140
|
-
return self._process_results([
|
|
141
|
-
raise ValueError(f"
|
|
135
|
+
return self._process_results([arg], udf_outputs, is_generator)
|
|
136
|
+
raise ValueError(f"Unexpected UDF argument: {arg}")
|
|
137
|
+
|
|
138
|
+
def bind_parameters(self, catalog: "Catalog", row: "RowDict", **kwargs) -> list:
|
|
139
|
+
return [p.get_value(catalog, row, **kwargs) for p in self.properties.params]
|
|
142
140
|
|
|
143
141
|
def _process_results(
|
|
144
142
|
self,
|
datachain/utils.py
CHANGED
|
@@ -360,3 +360,31 @@ class JSONSerialize(json.JSONEncoder):
|
|
|
360
360
|
return str(obj)
|
|
361
361
|
|
|
362
362
|
return super().default(obj)
|
|
363
|
+
|
|
364
|
+
|
|
365
|
+
def inside_colab() -> bool:
|
|
366
|
+
try:
|
|
367
|
+
from google import colab # noqa: F401
|
|
368
|
+
except ImportError:
|
|
369
|
+
return False
|
|
370
|
+
return True
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def inside_notebook() -> bool:
|
|
374
|
+
if inside_colab():
|
|
375
|
+
return True
|
|
376
|
+
|
|
377
|
+
try:
|
|
378
|
+
shell = get_ipython().__class__.__name__ # type: ignore[name-defined]
|
|
379
|
+
except NameError:
|
|
380
|
+
return False
|
|
381
|
+
|
|
382
|
+
if shell == "ZMQInteractiveShell":
|
|
383
|
+
try:
|
|
384
|
+
import IPython
|
|
385
|
+
|
|
386
|
+
return IPython.__version__ >= "6.0.0"
|
|
387
|
+
except ImportError:
|
|
388
|
+
return False
|
|
389
|
+
|
|
390
|
+
return False
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.2.
|
|
3
|
+
Version: 0.2.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
7
|
+
Project-URL: Documentation, https://datachain.dvc.ai
|
|
7
8
|
Project-URL: Issues, https://github.com/iterative/dvcx/issues
|
|
8
9
|
Project-URL: Source, https://github.com/iterative/dvcx
|
|
9
10
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -7,16 +7,16 @@ datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
8
|
datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
|
|
9
9
|
datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
|
|
10
|
-
datachain/listing.py,sha256=
|
|
10
|
+
datachain/listing.py,sha256=sX8vZNzAzoTel1li6VJiYeHUJwseUERVEoW9D5P7tII,8192
|
|
11
11
|
datachain/node.py,sha256=fHe7k5ajI2g2qnzsG-_NQR_T-QdBYctVeEa8c8dsu_Y,5703
|
|
12
12
|
datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
|
|
13
13
|
datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
|
|
14
14
|
datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
|
|
15
15
|
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
17
|
-
datachain/utils.py,sha256=
|
|
17
|
+
datachain/utils.py,sha256=12yQAV8tfyCHqp_xJcJBeNnr1L_BO8e2bOPyXdM68gs,10759
|
|
18
18
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
19
|
-
datachain/catalog/catalog.py,sha256=
|
|
19
|
+
datachain/catalog/catalog.py,sha256=pulKGJgAmxqSmFqBhA-J0wCKdBqGX4vqpV0cAvV6vUw,79578
|
|
20
20
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
21
21
|
datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
|
|
22
22
|
datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
|
|
@@ -28,52 +28,52 @@ datachain/client/gcs.py,sha256=ucX8e6JrqlFY-f80zkv084vxnKdtxpO32QJ-RG8Nv1s,4454
|
|
|
28
28
|
datachain/client/local.py,sha256=NQVkLTJQ-a7Udavqbh_4uT-IejfZQYn10j22owz9sis,5150
|
|
29
29
|
datachain/client/s3.py,sha256=TmW4f7VUM5CMZjSmgyFQFKeMUGrXt2SLoLEbLOUleiU,6296
|
|
30
30
|
datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
|
|
31
|
-
datachain/data_storage/db_engine.py,sha256=
|
|
31
|
+
datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
|
|
32
32
|
datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
|
|
33
33
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
34
34
|
datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
|
|
35
|
-
datachain/data_storage/schema.py,sha256=
|
|
35
|
+
datachain/data_storage/schema.py,sha256=bY3q2OUaUraos0s5BnwWkhgce8YpeNmIl7M1ifshoes,8074
|
|
36
36
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
37
37
|
datachain/data_storage/sqlite.py,sha256=F68Q_AIqNAObZ5kJ0GnBqRC6e2D2sRehkQo8UzrHgtI,25079
|
|
38
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
38
|
+
datachain/data_storage/warehouse.py,sha256=h35JiJoCGtwkMctis_x3NHxkwEejX5sIWvJOluZxrOI,33132
|
|
39
39
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
-
datachain/lib/arrow.py,sha256=
|
|
40
|
+
datachain/lib/arrow.py,sha256=FF3WWUOjB6Prw8ygfiLsrVfrdob0S01lPzEazuGqoO8,2556
|
|
41
41
|
datachain/lib/cached_stream.py,sha256=t2ifK0hZVZiVn0MQ8D3FaFK1-qK84TwJW2Dw1SRsw9g,1066
|
|
42
42
|
datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
|
|
43
|
-
datachain/lib/
|
|
43
|
+
datachain/lib/clip.py,sha256=rDeZlFGs0DXBlpmh5ZQJhR9Sz13bWAZGQjfYm1hsUI4,5388
|
|
44
|
+
datachain/lib/dc.py,sha256=Sf99R0oOqf7tlS2gieaG56z3bF7YVcMjhJOZrFRfFs8,34778
|
|
44
45
|
datachain/lib/feature.py,sha256=QDloA9HE7URf9J_veKrguYBvSg-0cbXZFTswNxrKsB8,12135
|
|
45
46
|
datachain/lib/feature_registry.py,sha256=K3jGQzBp2HZDjR9hdGe1BZaXOAne8RpkCRRQdTVjkTs,1622
|
|
46
|
-
datachain/lib/feature_utils.py,sha256=
|
|
47
|
-
datachain/lib/file.py,sha256=
|
|
47
|
+
datachain/lib/feature_utils.py,sha256=F4ZENO6tTQvd36a-O1AurYjFSUpoyZaT4qgXsKjQDts,4650
|
|
48
|
+
datachain/lib/file.py,sha256=TdhsPYmG0Atkd_QAO997oA8AuM854wNbjjLLT1uiD2M,8346
|
|
48
49
|
datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
|
|
49
50
|
datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
|
|
50
51
|
datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
|
|
51
|
-
datachain/lib/image.py,sha256=
|
|
52
|
+
datachain/lib/image.py,sha256=ZYfDqr9p-RRmWBeWFQwXLS1J3vQS616ykfMUvQVpqBY,2717
|
|
52
53
|
datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
|
|
53
54
|
datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
|
|
54
55
|
datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
|
|
55
|
-
datachain/lib/pytorch.py,sha256=
|
|
56
|
-
datachain/lib/reader.py,sha256=rPXXNoTUdm6PQwkAlaU-nOBreP_q4ett_EjFStrA_W0,1727
|
|
56
|
+
datachain/lib/pytorch.py,sha256=Z7iZCsqJzUT0PynVo23Xu4Fx7qIuuEZyH83R1tR5mfI,5561
|
|
57
57
|
datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
|
|
58
|
-
datachain/lib/signal_schema.py,sha256=
|
|
59
|
-
datachain/lib/text.py,sha256=
|
|
60
|
-
datachain/lib/udf.py,sha256=
|
|
61
|
-
datachain/lib/udf_signature.py,sha256=
|
|
58
|
+
datachain/lib/signal_schema.py,sha256=KTegbx-yMvtaKEoUxLgDx5MxMA8De-nmdtqnV1932N8,10151
|
|
59
|
+
datachain/lib/text.py,sha256=PUT1O0jNJoQGsuhff2LgDpzTWk2eMdwIKqEDBrE448M,1307
|
|
60
|
+
datachain/lib/udf.py,sha256=kMlOsHCVybnnq4AMtYqjylZH7x2tGE62FsDPOu9qhWM,6612
|
|
61
|
+
datachain/lib/udf_signature.py,sha256=CUKgoVpM_N8CgvMncpAw2RYchoiJdAGdDSdluoP0hIk,7161
|
|
62
62
|
datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
|
|
63
|
-
datachain/lib/utils.py,sha256=
|
|
63
|
+
datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
|
|
64
64
|
datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
65
|
-
datachain/lib/webdataset.py,sha256=
|
|
65
|
+
datachain/lib/webdataset.py,sha256=GWB_pocfRZGoU4Lhd7Wh3hx2Rnm_fJWXX4S_zXJIEmk,8286
|
|
66
66
|
datachain/lib/webdataset_laion.py,sha256=HAtSCbVvEQqzKkoRamRxDKaQALSB3QmJRU2yWRFNxwY,2147
|
|
67
67
|
datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
|
|
68
68
|
datachain/query/batch.py,sha256=sOMxXbaNii7lVyFIEZ2noqbhy_S8qtZ-WWxrka72shc,3474
|
|
69
69
|
datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
|
|
70
|
-
datachain/query/dataset.py,sha256=
|
|
71
|
-
datachain/query/dispatch.py,sha256=
|
|
70
|
+
datachain/query/dataset.py,sha256=vpu2wQYC5uWc-LdZrNV-PV7xQapbYCtqyrXiiIa77DI,64982
|
|
71
|
+
datachain/query/dispatch.py,sha256=ZeL5dga5d4cJDBftK7gAQ_mx4C7zq6t3z0Hdt7mcZYY,13094
|
|
72
72
|
datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
|
|
73
73
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
74
|
-
datachain/query/schema.py,sha256=
|
|
74
|
+
datachain/query/schema.py,sha256=n1NBOj6JO2I26mZD4vSURmVC2rk3mjIkJQheeLogoy4,7748
|
|
75
75
|
datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
|
|
76
|
-
datachain/query/udf.py,sha256=
|
|
76
|
+
datachain/query/udf.py,sha256=gnLDM7LKH8_bbdDeVHnlDKaBdbWc_NAbwvYCc4i-OlU,7101
|
|
77
77
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
78
78
|
datachain/remote/studio.py,sha256=bZb85WjtqMNFBoRuPbH-TEGpAyz0afROR7E9UgIef_Y,7438
|
|
79
79
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
@@ -92,9 +92,9 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
|
|
|
92
92
|
datachain/sql/sqlite/base.py,sha256=nPMF6_FF04hclDNZev_YfxMgbJAsWEdF-rU2pUhqBtc,12048
|
|
93
93
|
datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
|
|
94
94
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
95
|
-
datachain-0.2.
|
|
96
|
-
datachain-0.2.
|
|
97
|
-
datachain-0.2.
|
|
98
|
-
datachain-0.2.
|
|
99
|
-
datachain-0.2.
|
|
100
|
-
datachain-0.2.
|
|
95
|
+
datachain-0.2.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
96
|
+
datachain-0.2.3.dist-info/METADATA,sha256=NmviJ7UsETesadrJjeyoYjeNqul6GMd9D4zDZLk23Co,14399
|
|
97
|
+
datachain-0.2.3.dist-info/WHEEL,sha256=Z4pYXqR_rTB7OWNDYFOm1qRk0RX6GFP2o8LgvP453Hk,91
|
|
98
|
+
datachain-0.2.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
99
|
+
datachain-0.2.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
100
|
+
datachain-0.2.3.dist-info/RECORD,,
|
datachain/lib/reader.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
from abc import ABC, abstractmethod
|
|
2
|
-
from typing import TYPE_CHECKING, Any
|
|
3
|
-
|
|
4
|
-
if TYPE_CHECKING:
|
|
5
|
-
from datachain.lib.feature_utils import FeatureLike
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class FeatureReader(ABC):
|
|
9
|
-
def __init__(self, fr_class: "FeatureLike"):
|
|
10
|
-
"""
|
|
11
|
-
Class to call on feature values to perform post-processing. Used when
|
|
12
|
-
iterating over dataset with `ds.to_pytorch()` and `ds.get_values()`.
|
|
13
|
-
|
|
14
|
-
The class must include:
|
|
15
|
-
- `self.fr_class` to define the feature class to read.
|
|
16
|
-
- `self.__call__(self, value)` to call on the feature value returned by
|
|
17
|
-
`self.fr_class.get_value()`.
|
|
18
|
-
|
|
19
|
-
Examples:
|
|
20
|
-
>>> class PrefixReader(FeatureReader):
|
|
21
|
-
>>> def __call__(self, value):
|
|
22
|
-
>>> return "prefix-" + value
|
|
23
|
-
>>> for row in ds.get_values(PrefixReader(MyFeature)):
|
|
24
|
-
>>> print(row)
|
|
25
|
-
|
|
26
|
-
>>> class SuffixReader(FeatureReader):
|
|
27
|
-
>>> def __init__(self, fr_class, suffix):
|
|
28
|
-
>>> self.suffix = suffix
|
|
29
|
-
>>> super().__init__(fr_class)
|
|
30
|
-
>>> def __call__(self, value):
|
|
31
|
-
>>> return value + self.suffix
|
|
32
|
-
>>> for row in ds.get_values(SuffixReader(MyFeature, "-suffix")):
|
|
33
|
-
>>> print(row)
|
|
34
|
-
"""
|
|
35
|
-
self.fr_class = fr_class
|
|
36
|
-
|
|
37
|
-
@abstractmethod
|
|
38
|
-
def __call__(self, value: Any) -> Any:
|
|
39
|
-
pass
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
class LabelReader(FeatureReader):
|
|
43
|
-
def __init__(self, fr_class: "FeatureLike", classes: list):
|
|
44
|
-
"""Get column values as 0-based integer index of classes."""
|
|
45
|
-
self.classes = classes
|
|
46
|
-
super().__init__(fr_class)
|
|
47
|
-
|
|
48
|
-
def __call__(self, value: str) -> int:
|
|
49
|
-
return self.classes.index(value)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|