datachain 0.1.12__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

Files changed (44)
  1. datachain/_version.py +2 -2
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +2 -1
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +2 -3
  13. datachain/data_storage/warehouse.py +31 -30
  14. datachain/dataset.py +1 -3
  15. datachain/lib/arrow.py +85 -0
  16. datachain/lib/dc.py +377 -178
  17. datachain/lib/feature.py +41 -90
  18. datachain/lib/feature_registry.py +3 -1
  19. datachain/lib/feature_utils.py +2 -2
  20. datachain/lib/file.py +20 -20
  21. datachain/lib/image.py +9 -2
  22. datachain/lib/meta_formats.py +66 -34
  23. datachain/lib/settings.py +5 -5
  24. datachain/lib/signal_schema.py +103 -105
  25. datachain/lib/udf.py +3 -12
  26. datachain/lib/udf_signature.py +11 -6
  27. datachain/lib/webdataset_laion.py +5 -22
  28. datachain/listing.py +8 -8
  29. datachain/node.py +1 -1
  30. datachain/progress.py +1 -1
  31. datachain/query/builtins.py +1 -1
  32. datachain/query/dataset.py +39 -110
  33. datachain/query/dispatch.py +1 -1
  34. datachain/query/metrics.py +19 -0
  35. datachain/query/schema.py +13 -3
  36. datachain/sql/__init__.py +1 -1
  37. datachain/utils.py +1 -122
  38. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
  39. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
  40. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
  41. datachain/lib/parquet.py +0 -32
  42. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
  43. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.1.12.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
datachain/lib/parquet.py DELETED
@@ -1,32 +0,0 @@
1
- from collections.abc import Iterator
2
- from typing import Callable, Optional
3
-
4
- import pandas as pd
5
- from pydantic import Field
6
-
7
- from datachain.lib.feature import Feature
8
- from datachain.lib.file import File
9
-
10
-
11
- class BasicParquet(Feature):
12
- file: File
13
- index: Optional[int] = Field(default=None)
14
-
15
-
16
- def process_parquet(spec: type[BasicParquet]) -> Callable:
17
- def process(file: File) -> Iterator[spec]: # type: ignore[valid-type]
18
- with file.open() as fd:
19
- df = pd.read_parquet(fd)
20
- df["index"] = df.index
21
-
22
- for pq_dict in df.to_dict("records"):
23
- pq_dict["file"] = File(
24
- name=str(pq_dict["index"]),
25
- source=file.source,
26
- parent=file.get_full_name(),
27
- version=file.version,
28
- etag=file.etag,
29
- )
30
- yield spec(**pq_dict)
31
-
32
- return process