datachain 0.7.10__py3-none-any.whl → 0.7.11__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/client/__init__.py CHANGED
@@ -1,4 +1,3 @@
1
1
  from .fsspec import Client
2
- from .s3 import ClientS3
3
2
 
4
- __all__ = ["Client", "ClientS3"]
3
+ __all__ = ["Client"]
datachain/lib/dc.py CHANGED
@@ -19,7 +19,6 @@ from typing import (
19
19
  )
20
20
 
21
21
  import orjson
22
- import pandas as pd
23
22
  import sqlalchemy
24
23
  from pydantic import BaseModel
25
24
  from sqlalchemy.sql.functions import GenericFunction
@@ -57,6 +56,7 @@ from datachain.telemetry import telemetry
57
56
  from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
58
57
 
59
58
  if TYPE_CHECKING:
59
+ import pandas as pd
60
60
  from pyarrow import DataType as ArrowDataType
61
61
  from typing_extensions import Concatenate, ParamSpec, Self
62
62
 
@@ -1701,6 +1701,8 @@ class DataChain:
1701
1701
  Parameters:
1702
1702
  flatten : Whether to use a multiindex or flatten column names.
1703
1703
  """
1704
+ import pandas as pd
1705
+
1704
1706
  headers, max_length = self._effective_signals_schema.get_headers_with_length()
1705
1707
  if flatten or max_length < 2:
1706
1708
  columns = [".".join(filter(None, header)) for header in headers]
@@ -1724,6 +1726,8 @@ class DataChain:
1724
1726
  transpose : Whether to transpose rows and columns.
1725
1727
  truncate : Whether or not to truncate the contents of columns.
1726
1728
  """
1729
+ import pandas as pd
1730
+
1727
1731
  dc = self.limit(limit) if limit > 0 else self # type: ignore[misc]
1728
1732
  df = dc.to_pandas(flatten)
1729
1733
 
datachain/lib/file.py CHANGED
@@ -17,7 +17,6 @@ from urllib.request import url2pathname
17
17
 
18
18
  from fsspec.callbacks import DEFAULT_CALLBACK, Callback
19
19
  from PIL import Image
20
- from pyarrow.dataset import dataset
21
20
  from pydantic import Field, field_validator
22
21
 
23
22
  from datachain.client.fileslice import FileSlice
@@ -452,6 +451,8 @@ class ArrowRow(DataModel):
452
451
  @contextmanager
453
452
  def open(self):
454
453
  """Stream row contents from indexed file."""
454
+ from pyarrow.dataset import dataset
455
+
455
456
  if self.file._caching_enabled:
456
457
  self.file.ensure_cached()
457
458
  path = self.file.get_local_path()
datachain/lib/meta_formats.py CHANGED
@@ -6,7 +6,6 @@ from collections.abc import Iterator
6
6
  from pathlib import Path
7
7
  from typing import Callable
8
8
 
9
- import datamodel_code_generator
10
9
  import jmespath as jsp
11
10
  from pydantic import BaseModel, ConfigDict, Field, ValidationError # noqa: F401
12
11
 
@@ -67,6 +66,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
67
66
  data_type = "json" # treat json line as plain JSON in auto-schema
68
67
  data_string = json.dumps(json_object)
69
68
 
69
+ import datamodel_code_generator
70
+
70
71
  input_file_types = {i.value: i for i in datamodel_code_generator.InputFileType}
71
72
  input_file_type = input_file_types[data_type]
72
73
  with tempfile.TemporaryDirectory() as tmpdir:
datachain/lib/pytorch.py CHANGED
@@ -7,7 +7,6 @@ from torch import float32
7
7
  from torch.distributed import get_rank, get_world_size
8
8
  from torch.utils.data import IterableDataset, get_worker_info
9
9
  from torchvision.transforms import v2
10
- from tqdm import tqdm
11
10
 
12
11
  from datachain import Session
13
12
  from datachain.asyn import AsyncMapper
@@ -112,10 +111,7 @@ class PytorchDataset(IterableDataset):
112
111
  from datachain.lib.udf import _prefetch_input
113
112
 
114
113
  rows = AsyncMapper(_prefetch_input, rows, workers=self.prefetch).iterate()
115
-
116
- desc = f"Parsed PyTorch dataset for rank={total_rank} worker"
117
- with tqdm(rows, desc=desc, unit=" rows", position=total_rank) as rows_it:
118
- yield from map(self._process_row, rows_it)
114
+ yield from map(self._process_row, rows)
119
115
 
120
116
  def _process_row(self, row_features):
121
117
  row = []
datachain/lib/signal_schema.py CHANGED
@@ -402,9 +402,20 @@ class SignalSchema:
402
402
  if ModelStore.is_pydantic(finfo.annotation):
403
403
  SignalSchema._set_file_stream(getattr(obj, field), catalog, cache)
404
404
 
405
- def get_column_type(self, col_name: str) -> DataType:
405
+ def get_column_type(self, col_name: str, with_subtree: bool = False) -> DataType:
406
+ """
407
+ Returns column type by column name.
408
+
409
+ If `with_subtree` is True, then it will return the type of the column
410
+ even if it has a subtree (e.g. model with nested fields), otherwise it will
411
+ return the type of the column (standard type field, not the model).
412
+
413
+ If column is not found, raises `SignalResolvingError`.
414
+ """
406
415
  for path, _type, has_subtree, _ in self.get_flat_tree():
407
- if not has_subtree and DEFAULT_DELIMITER.join(path) == col_name:
416
+ if (with_subtree or not has_subtree) and DEFAULT_DELIMITER.join(
417
+ path
418
+ ) == col_name:
408
419
  return _type
409
420
  raise SignalResolvingError([col_name], "is not found")
410
421
 
@@ -492,14 +503,25 @@ class SignalSchema:
492
503
  # renaming existing signal
493
504
  del new_values[value.name]
494
505
  new_values[name] = self.values[value.name]
495
- elif isinstance(value, Func):
506
+ continue
507
+ if isinstance(value, Column):
508
+ # adding new signal from existing signal field
509
+ try:
510
+ new_values[name] = self.get_column_type(
511
+ value.name, with_subtree=True
512
+ )
513
+ continue
514
+ except SignalResolvingError:
515
+ pass
516
+ if isinstance(value, Func):
496
517
  # adding new signal with function
497
518
  new_values[name] = value.get_result_type(self)
498
- elif isinstance(value, ColumnElement):
519
+ continue
520
+ if isinstance(value, ColumnElement):
499
521
  # adding new signal
500
522
  new_values[name] = sql_to_python(value)
501
- else:
502
- new_values[name] = value
523
+ continue
524
+ new_values[name] = value
503
525
 
504
526
  return SignalSchema(new_values)
505
527
 
datachain/query/dataset.py CHANGED
@@ -35,7 +35,6 @@ from sqlalchemy.sql.schema import TableClause
35
35
  from sqlalchemy.sql.selectable import Select
36
36
 
37
37
  from datachain.asyn import ASYNC_WORKERS, AsyncMapper, OrderedMapper
38
- from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE, get_catalog
39
38
  from datachain.data_storage.schema import (
40
39
  PARTITION_COLUMN_ID,
41
40
  partition_col_names,
@@ -394,6 +393,8 @@ class UDFStep(Step, ABC):
394
393
  """
395
394
 
396
395
  def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
396
+ from datachain.catalog import QUERY_SCRIPT_CANCELED_EXIT_CODE
397
+
397
398
  use_partitioning = self.partition_by is not None
398
399
  batching = self.udf.get_batching(use_partitioning)
399
400
  workers = self.workers
@@ -1087,6 +1088,8 @@ class DatasetQuery:
1087
1088
  def delete(
1088
1089
  name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
1089
1090
  ) -> None:
1091
+ from datachain.catalog import get_catalog
1092
+
1090
1093
  catalog = catalog or get_catalog()
1091
1094
  version = version or catalog.get_dataset(name).latest_version
1092
1095
  catalog.remove_dataset(name, version)
datachain/toolkit/split.py CHANGED
@@ -1,7 +1,16 @@
1
+ import random
2
+ from typing import Optional
3
+
1
4
  from datachain import C, DataChain
2
5
 
6
+ RESOLUTION = 2**31 - 1 # Maximum positive value for a 32-bit signed integer.
7
+
3
8
 
4
- def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
9
+ def train_test_split(
10
+ dc: DataChain,
11
+ weights: list[float],
12
+ seed: Optional[int] = None,
13
+ ) -> list[DataChain]:
5
14
  """
6
15
  Splits a DataChain into multiple subsets based on the provided weights.
7
16
 
@@ -18,6 +27,8 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
18
27
  For example:
19
28
  - `[0.7, 0.3]` corresponds to a 70/30 split;
20
29
  - `[2, 1, 1]` corresponds to a 50/25/25 split.
30
+ seed (int, optional):
31
+ The seed for the random number generator. Defaults to None.
21
32
 
22
33
  Returns:
23
34
  list[DataChain]:
@@ -58,14 +69,16 @@ def train_test_split(dc: DataChain, weights: list[float]) -> list[DataChain]:
58
69
 
59
70
  weights_normalized = [weight / sum(weights) for weight in weights]
60
71
 
61
- resolution = 2**31 - 1 # Maximum positive value for a 32-bit signed integer.
72
+ rand_col = C("sys.rand")
73
+ if seed is not None:
74
+ uniform_seed = random.Random(seed).randrange(1, RESOLUTION) # noqa: S311
75
+ rand_col = (rand_col % RESOLUTION) * uniform_seed # type: ignore[assignment]
76
+ rand_col = rand_col % RESOLUTION # type: ignore[assignment]
62
77
 
63
78
  return [
64
79
  dc.filter(
65
- C("sys__rand") % resolution
66
- >= round(sum(weights_normalized[:index]) * resolution),
67
- C("sys__rand") % resolution
68
- < round(sum(weights_normalized[: index + 1]) * resolution),
80
+ rand_col >= round(sum(weights_normalized[:index]) * (RESOLUTION - 1)),
81
+ rand_col < round(sum(weights_normalized[: index + 1]) * (RESOLUTION - 1)),
69
82
  )
70
83
  for index, _ in enumerate(weights_normalized)
71
84
  ]
datachain-0.7.11.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.7.10
3
+ Version: 0.7.11
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -91,14 +91,14 @@ Requires-Dist: types-requests; extra == "dev"
91
91
  Requires-Dist: types-tabulate; extra == "dev"
92
92
  Provides-Extra: examples
93
93
  Requires-Dist: datachain[tests]; extra == "examples"
94
- Requires-Dist: numpy<2,>=1; extra == "examples"
95
94
  Requires-Dist: defusedxml; extra == "examples"
96
95
  Requires-Dist: accelerate; extra == "examples"
97
- Requires-Dist: unstructured[embed-huggingface,pdf]<0.16.0; extra == "examples"
96
+ Requires-Dist: unstructured_ingest[embed-huggingface]; extra == "examples"
97
+ Requires-Dist: unstructured[pdf]; extra == "examples"
98
98
  Requires-Dist: pdfplumber==0.11.4; extra == "examples"
99
99
  Requires-Dist: huggingface_hub[hf_transfer]; extra == "examples"
100
100
  Requires-Dist: onnx==1.16.1; extra == "examples"
101
- Requires-Dist: ultralytics==8.3.37; extra == "examples"
101
+ Requires-Dist: ultralytics==8.3.48; extra == "examples"
102
102
 
103
103
  ================
104
104
  |logo| DataChain
@@ -138,6 +138,11 @@ Use Cases
138
138
  3. **Versioning.** DataChain doesn't store, require moving or copying data (unlike DVC).
139
139
  Perfect use case is a bucket with thousands or millions of images, videos, audio, PDFs.
140
140
 
141
+ Getting Started
142
+ ===============
143
+
144
+ Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ and `Docs <https://docs.datachain.ai/>`_
145
+ to get started with `DataChain` and learn more.
141
146
 
142
147
  Key Features
143
148
  ============
@@ -161,12 +166,6 @@ Key Features
161
166
  - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
162
167
 
163
168
 
164
- Getting Started
165
- ===============
166
-
167
- Visit `Quick Start <https://docs.datachain.ai/quick-start>`_ to get started with `DataChain` and learn more.
168
-
169
-
170
169
  Contributing
171
170
  ============
172
171
 
datachain-0.7.11.dist-info/RECORD CHANGED
@@ -21,7 +21,7 @@ datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4
21
21
  datachain/catalog/catalog.py,sha256=s4fat0jjP3JPq0RGQ9zfzRkX1JavxxCrcB1tJKMgsks,57686
22
22
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
23
23
  datachain/catalog/loader.py,sha256=HA_mBC7q_My8j2WnSvIjUGuJpl6SIdg5vvy_lagxJlA,5733
24
- datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
24
+ datachain/client/__init__.py,sha256=1kDpCPoibMXi1gExR4lTLc5pi-k6M5TANiwtXkPoLhU,49
25
25
  datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
26
26
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
27
27
  datachain/client/fsspec.py,sha256=kf1blSGNcEXJ0tra3y5i35jc1aAy-67wMHXkqjlRMXg,12736
@@ -53,17 +53,17 @@ datachain/lib/arrow.py,sha256=b5efxAUaNNYVwtXVJqj07D3zf5KC-BPlLCxKEZbEG6w,9429
53
53
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
54
54
  datachain/lib/data_model.py,sha256=zS4lmXHVBXc9ntcyea2a1CRLXGSAN_0glXcF88CohgY,2685
55
55
  datachain/lib/dataset_info.py,sha256=IjdF1E0TQNOq9YyynfWiCFTeZpbyGfyJvxgJY4YN810,2493
56
- datachain/lib/dc.py,sha256=xqLR4IH_mbuet0FsxBHDsRUg-zR6tO8UZdLQQTLG8EE,89533
57
- datachain/lib/file.py,sha256=-XMkL6ED1sE7TMhWoMRTEuOXswZJw8X6AEmJDONFP74,15019
56
+ datachain/lib/dc.py,sha256=qMhpVPdWeuXBDhmKKoq3fkq12Cx_ZPxDdsl_juu482o,89595
57
+ datachain/lib/file.py,sha256=4dDWXVCHHP2uELDPHP_LheyTyyr01jwp5wp3HaOIeFI,15028
58
58
  datachain/lib/hf.py,sha256=a-zFpDmZIR4r8dlNNTjfpAKSnuJ9xyRXlgcdENiXt3E,5864
59
59
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
60
60
  datachain/lib/listing.py,sha256=cVkCp7TRVpcZKSx-Bbk9t51bQI9Mw0o86W6ZPhAsuzM,3667
61
61
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
62
- datachain/lib/meta_formats.py,sha256=anK2bDVbaeCCh0yvKUBaW2MVos3zRgdaSV8uSduzPcU,6680
62
+ datachain/lib/meta_formats.py,sha256=6_gB23fWlvd-edOO3UvDHvj6dBXVL61T7x8RX51FW84,6685
63
63
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
64
- datachain/lib/pytorch.py,sha256=QMJO_OGEMvBi2x71vGcG25agLzNwyLmF4Qx5iILlwaM,6350
64
+ datachain/lib/pytorch.py,sha256=dA3r1JY0wqV_907a1D0lFaEN-7v3fMRpc1ePFE9CnvA,6168
65
65
  datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
66
- datachain/lib/signal_schema.py,sha256=_uh19nCKhiD9ua8oIN1Q8R9iYv1BZAuqTJCLYVmyW8k,24557
66
+ datachain/lib/signal_schema.py,sha256=ziRTctom0-wAqURZfkfG6dc_3P2FcYxKjYsKC49NQ1Q,25415
67
67
  datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
68
68
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
69
69
  datachain/lib/udf.py,sha256=-j0krjNAELTqRI0dB1N65AmawtcIY5vN---AuUcW8Us,13637
@@ -88,7 +88,7 @@ datachain/model/ultralytics/pose.py,sha256=71KBTcoST2wcEtsyGXqLVpvUtqbp9gwZGA15p
88
88
  datachain/model/ultralytics/segment.py,sha256=Z1ab0tZRJubSYNH4KkFlzhYeGNTfAyC71KmkQcToHDQ,2760
89
89
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
90
90
  datachain/query/batch.py,sha256=5fEhORFe7li12SdYddaSK3LyqksMfCHhwN1_A6TfsA4,3485
91
- datachain/query/dataset.py,sha256=eXr9fJz2grX2evmkmsiH0Xeqajd8gFnujmt_USMxy0c,54563
91
+ datachain/query/dataset.py,sha256=JrImhguXj2ZDwJpfuyhcgxSIlqSPy5NmLDLc3muFQJs,54610
92
92
  datachain/query/dispatch.py,sha256=fZ0TgGFRcsrYh1iXQoZVjkUl4Xetom9PSHoeDes3IRs,11606
93
93
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
94
94
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -116,11 +116,11 @@ datachain/sql/sqlite/base.py,sha256=E2PK3hoGlHey1eEjcReXRrI-c_ASr3AmAXaNYKDY_o8,
116
116
  datachain/sql/sqlite/types.py,sha256=lPXS1XbkmUtlkkiRxy_A_UzsgpPv2VSkXYOD4zIHM4w,1734
117
117
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
118
118
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
119
- datachain/toolkit/split.py,sha256=ZgDcrNiKiPXZmKD591_1z9qRIXitu5zwAsoVPB7ykiU,2508
119
+ datachain/toolkit/split.py,sha256=z3zRJNzjWrpPuRw-zgFbCOBKInyYxJew8ygrYQRQLNc,2930
120
120
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
121
- datachain-0.7.10.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
122
- datachain-0.7.10.dist-info/METADATA,sha256=qtw_rToRdmR9-CO6MFCAGv6NWJJ87C95iQaDEnDE4H8,8371
123
- datachain-0.7.10.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
124
- datachain-0.7.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
125
- datachain-0.7.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
126
- datachain-0.7.10.dist-info/RECORD,,
121
+ datachain-0.7.11.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
122
+ datachain-0.7.11.dist-info/METADATA,sha256=ADTTf0_eJImM-tIPR-jQydM3N9Iis-ECRxWgkwLM8lU,8412
123
+ datachain-0.7.11.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
124
+ datachain-0.7.11.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
125
+ datachain-0.7.11.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
126
+ datachain-0.7.11.dist-info/RECORD,,