datachain 0.1.12__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/_version.py +2 -2
- datachain/asyn.py +3 -3
- datachain/catalog/__init__.py +3 -3
- datachain/catalog/catalog.py +6 -6
- datachain/catalog/loader.py +3 -3
- datachain/cli.py +2 -1
- datachain/client/azure.py +37 -1
- datachain/client/fsspec.py +1 -1
- datachain/client/local.py +1 -1
- datachain/data_storage/__init__.py +1 -1
- datachain/data_storage/metastore.py +11 -3
- datachain/data_storage/schema.py +2 -3
- datachain/data_storage/warehouse.py +31 -30
- datachain/dataset.py +1 -3
- datachain/lib/arrow.py +85 -0
- datachain/lib/dc.py +377 -178
- datachain/lib/feature.py +41 -90
- datachain/lib/feature_registry.py +3 -1
- datachain/lib/feature_utils.py +2 -2
- datachain/lib/file.py +20 -20
- datachain/lib/image.py +9 -2
- datachain/lib/meta_formats.py +66 -34
- datachain/lib/settings.py +5 -5
- datachain/lib/signal_schema.py +103 -105
- datachain/lib/udf.py +3 -12
- datachain/lib/udf_signature.py +11 -6
- datachain/lib/webdataset_laion.py +5 -22
- datachain/listing.py +8 -8
- datachain/node.py +1 -1
- datachain/progress.py +1 -1
- datachain/query/builtins.py +1 -1
- datachain/query/dataset.py +39 -110
- datachain/query/dispatch.py +1 -1
- datachain/query/metrics.py +19 -0
- datachain/query/schema.py +13 -3
- datachain/sql/__init__.py +1 -1
- datachain/utils.py +1 -122
- {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
- {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
- {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
- datachain/lib/parquet.py +0 -32
- {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
- {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
datachain/lib/parquet.py
DELETED
|
@@ -1,32 +0,0 @@
|
|
|
1
|
-
from collections.abc import Iterator
|
|
2
|
-
from typing import Callable, Optional
|
|
3
|
-
|
|
4
|
-
import pandas as pd
|
|
5
|
-
from pydantic import Field
|
|
6
|
-
|
|
7
|
-
from datachain.lib.feature import Feature
|
|
8
|
-
from datachain.lib.file import File
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
class BasicParquet(Feature):
    """Base feature describing a single row read from a parquet file.

    Subclasses add one attribute per parquet column they want to expose.
    """

    # File object that identifies the row within its source parquet file.
    file: File
    # Row index taken from the DataFrame; optional, defaults to ``None``.
    index: Optional[int] = Field(default=None)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def process_parquet(spec: type[BasicParquet]) -> Callable:
    """Build a row generator that turns a parquet file into ``spec`` objects.

    Args:
        spec: Feature class instantiated once per parquet row. It receives
            every column of the row as a keyword argument, plus ``index``
            and ``file``.

    Returns:
        A function that takes a ``File`` and yields one ``spec`` per row.
    """

    def process(file: File) -> Iterator[spec]:  # type: ignore[valid-type]
        """Read ``file`` as parquet and yield one ``spec`` instance per row."""
        with file.open() as handle:
            frame = pd.read_parquet(handle)
            # Expose the DataFrame index as a regular column so that it ends
            # up in every row dict (replaces any existing "index" column).
            frame["index"] = frame.index

            records = frame.to_dict("records")
            for row in records:
                # Each row gets its own File: named after the row index and
                # parented to the full name of the parquet file it came from.
                row["file"] = File(
                    name=str(row["index"]),
                    source=file.source,
                    parent=file.get_full_name(),
                    version=file.version,
                    etag=file.etag,
                )
                yield spec(**row)

    return process
|
|
File without changes
|
|
File without changes
|
|
File without changes
|