datachain 0.6.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

datachain/listing.py CHANGED
@@ -4,12 +4,10 @@ from collections.abc import Iterable, Iterator
 from itertools import zip_longest
 from typing import TYPE_CHECKING, Optional
 
-from fsspec.asyn import get_loop, sync
 from sqlalchemy import Column
 from sqlalchemy.sql import func
 from tqdm import tqdm
 
-from datachain.lib.file import File
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.sql.functions import path as pathfunc
 from datachain.utils import suffix_to_number
@@ -17,33 +15,29 @@ from datachain.utils import suffix_to_number
 if TYPE_CHECKING:
     from datachain.catalog.datasource import DataSource
     from datachain.client import Client
-    from datachain.data_storage import AbstractMetastore, AbstractWarehouse
+    from datachain.data_storage import AbstractWarehouse
     from datachain.dataset import DatasetRecord
-    from datachain.storage import Storage
 
 
 class Listing:
     def __init__(
         self,
-        storage: Optional["Storage"],
-        metastore: "AbstractMetastore",
         warehouse: "AbstractWarehouse",
         client: "Client",
         dataset: Optional["DatasetRecord"],
+        object_name: str = "file",
     ):
-        self.storage = storage
-        self.metastore = metastore
        self.warehouse = warehouse
        self.client = client
        self.dataset = dataset  # dataset representing bucket listing
+        self.object_name = object_name
 
     def clone(self) -> "Listing":
         return self.__class__(
-            self.storage,
-            self.metastore.clone(),
             self.warehouse.clone(),
             self.client,
             self.dataset,
+            self.object_name,
         )
 
     def __enter__(self) -> "Listing":
@@ -53,46 +47,20 @@ class Listing:
         self.close()
 
     def close(self) -> None:
-        self.metastore.close()
         self.warehouse.close()
 
     @property
-    def id(self):
-        return self.storage.id
+    def uri(self):
+        from datachain.lib.listing import listing_uri_from_name
+
+        return listing_uri_from_name(self.dataset.name)
 
     @property
     def dataset_rows(self):
-        return self.warehouse.dataset_rows(self.dataset, self.dataset.latest_version)
-
-    def fetch(self, start_prefix="", method: str = "default") -> None:
-        sync(get_loop(), self._fetch, start_prefix, method)
-
-    async def _fetch(self, start_prefix: str, method: str) -> None:
-        with self.clone() as fetch_listing:
-            if start_prefix:
-                start_prefix = start_prefix.rstrip("/")
-            try:
-                async for entries in fetch_listing.client.scandir(
-                    start_prefix, method=method
-                ):
-                    fetch_listing.insert_entries(entries)
-                    if len(entries) > 1:
-                        fetch_listing.metastore.update_last_inserted_at()
-            finally:
-                fetch_listing.insert_entries_done()
-
-    def insert_entry(self, entry: File) -> None:
-        self.insert_entries([entry])
-
-    def insert_entries(self, entries: Iterable[File]) -> None:
-        self.warehouse.insert_rows(
-            self.dataset_rows.get_table(),
-            self.warehouse.prepare_entries(entries),
+        return self.warehouse.dataset_rows(
+            self.dataset, self.dataset.latest_version, object_name=self.object_name
         )
 
-    def insert_entries_done(self) -> None:
-        self.warehouse.insert_rows_done(self.dataset_rows.get_table())
-
     def expand_path(self, path, use_glob=True) -> list[Node]:
         if use_glob and glob.has_magic(path):
             return self.warehouse.expand_path(self.dataset_rows, path)
@@ -200,25 +168,31 @@ class Listing:
         conds = []
         if names:
             for name in names:
-                conds.append(pathfunc.name(Column("path")).op("GLOB")(name))
+                conds.append(
+                    pathfunc.name(Column(dr.col_name("path"))).op("GLOB")(name)
+                )
         if inames:
             for iname in inames:
                 conds.append(
-                    func.lower(pathfunc.name(Column("path"))).op("GLOB")(iname.lower())
+                    func.lower(pathfunc.name(Column(dr.col_name("path")))).op("GLOB")(
+                        iname.lower()
+                    )
                 )
         if paths:
             for path in paths:
-                conds.append(Column("path").op("GLOB")(path))
+                conds.append(Column(dr.col_name("path")).op("GLOB")(path))
         if ipaths:
             for ipath in ipaths:
-                conds.append(func.lower(Column("path")).op("GLOB")(ipath.lower()))
+                conds.append(
+                    func.lower(Column(dr.col_name("path"))).op("GLOB")(ipath.lower())
+                )
 
         if size is not None:
             size_limit = suffix_to_number(size)
             if size_limit >= 0:
-                conds.append(Column("size") >= size_limit)
+                conds.append(Column(dr.col_name("size")) >= size_limit)
             else:
-                conds.append(Column("size") <= -size_limit)
+                conds.append(Column(dr.col_name("size")) <= -size_limit)
 
         return self.warehouse.find(
             dr,
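
The filter conditions above no longer hard-code column names such as "path" and "size"; they go through dr.col_name(...), so the same query works when the listing's columns are stored under the configurable object_name (default "file"). A minimal sketch of the mapping this presumably performs — the DatasetRowsSketch helper and the "file__path" naming scheme are illustrative assumptions, not code from this release:

from sqlalchemy import Column

# Hypothetical stand-in for the object returned by warehouse.dataset_rows();
# assumes col_name() prefixes plain names with the object name.
class DatasetRowsSketch:
    def __init__(self, object_name: str = "file"):
        self.object_name = object_name

    def col_name(self, name: str) -> str:
        return f"{self.object_name}__{name}"  # assumed separator

dr = DatasetRowsSketch()
cond = Column(dr.col_name("size")) >= 1024
print(cond)  # renders a comparison against "file__size"
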

datachain/query/dataset.py CHANGED
@@ -10,6 +10,7 @@ from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from copy import copy
 from functools import wraps
+from secrets import token_hex
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -173,10 +174,10 @@ class QueryStep(StartingStep):
             return sqlalchemy.select(*columns)
 
         dataset = self.catalog.get_dataset(self.dataset_name)
-        table = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
+        dr = self.catalog.warehouse.dataset_rows(dataset, self.dataset_version)
 
         return step_result(
-            q, table.c, dependencies=[(self.dataset_name, self.dataset_version)]
+            q, dr.columns, dependencies=[(self.dataset_name, self.dataset_version)]
         )
 
 
@@ -720,10 +721,17 @@ class SQLMutate(SQLClause):
 
     def apply_sql_clause(self, query: Select) -> Select:
         original_subquery = query.subquery()
+        to_mutate = {c.name for c in self.args}
+
+        prefix = f"mutate{token_hex(8)}_"
+        cols = [
+            c.label(prefix + c.name) if c.name in to_mutate else c
+            for c in original_subquery.c
+        ]
         # this is needed for new column to be used in clauses
         # like ORDER BY, otherwise new column is not recognized
         subquery = (
-            sqlalchemy.select(*original_subquery.c, *self.args)
+            sqlalchemy.select(*cols, *self.args)
             .select_from(original_subquery)
             .subquery()
         )
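
This SQLMutate change fixes mutate() calls that redefine an existing column: previously the original column and its replacement were both selected under the same name, producing a duplicate-column subquery. Columns being overwritten are now relabeled with a random mutate<hex>_ prefix first, so the new expressions own the original names. A self-contained sketch of the trick in plain SQLAlchemy Core (table and column names are illustrative):

from secrets import token_hex

import sqlalchemy

t = sqlalchemy.table("rows", sqlalchemy.column("size"))
inner = sqlalchemy.select(t.c.size).subquery()

# the mutation redefines "size" in terms of the old column
new_args = [(inner.c.size * 2).label("size")]

to_mutate = {c.name for c in new_args}
prefix = f"mutate{token_hex(8)}_"  # random prefix avoids name collisions
cols = [c.label(prefix + c.name) if c.name in to_mutate else c for c in inner.c]

# the old "size" is renamed out of the way; the new expression takes the name
outer = sqlalchemy.select(*cols, *new_args).select_from(inner)
print(outer)
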

datachain/remote/studio.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import logging
+import os
 from collections.abc import Iterable, Iterator
 from datetime import datetime, timedelta, timezone
 from struct import unpack
@@ -10,8 +11,10 @@ from typing import (
     TypeVar,
 )
 
+from datachain.config import Config
 from datachain.dataset import DatasetStats
-from datachain.utils import retry_with_backoff
+from datachain.error import DataChainError
+from datachain.utils import STUDIO_URL, retry_with_backoff
 
 T = TypeVar("T")
 LsData = Optional[list[dict[str, Any]]]
@@ -54,14 +57,54 @@ class Response(Generic[T]):
 
 
 class StudioClient:
-    def __init__(
-        self, url: str, username: str, token: str, timeout: float = 3600.0
-    ) -> None:
+    def __init__(self, timeout: float = 3600.0, team: Optional[str] = None) -> None:
         self._check_dependencies()
-        self.url = url.rstrip("/")
-        self.username = username
-        self.token = token
         self.timeout = timeout
+        self._config = None
+        self._team = team
+
+    @property
+    def token(self) -> str:
+        token = os.environ.get("DVC_STUDIO_TOKEN") or self.config.get("token")
+
+        if not token:
+            raise DataChainError(
+                "Studio token is not set. Use `datachain studio login` "
+                "or environment variable `DVC_STUDIO_TOKEN` to set it."
+            )
+
+        return token
+
+    @property
+    def url(self) -> str:
+        return (
+            os.environ.get("DVC_STUDIO_URL") or self.config.get("url") or STUDIO_URL
+        ) + "/api"
+
+    @property
+    def config(self) -> dict:
+        if self._config is None:
+            self._config = Config().read().get("studio", {})
+        return self._config  # type: ignore [return-value]
+
+    @property
+    def team(self) -> str:
+        if self._team is None:
+            self._team = self._get_team()
+        return self._team
+
+    def _get_team(self) -> str:
+        team = os.environ.get("DVC_STUDIO_TEAM") or self.config.get("team")
+
+        if not team:
+            raise DataChainError(
+                "Studio team is not set. "
+                "Use `datachain studio team <team_name>` "
+                "or environment variable `DVC_STUDIO_TEAM` to set it."
+                "You can also set it in the config file as team under studio."
+            )
+
+        return team
 
     def _check_dependencies(self) -> None:
         try:
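
The URL, token, and team are no longer constructor arguments; each resolves lazily with the same precedence: environment variable first, then the studio section of the DataChain config, then (for the URL) the built-in STUDIO_URL default. A sketch of pointing the client at a self-hosted instance purely through environment variables; the hostname, token, and team values are placeholders:

import os

os.environ["DVC_STUDIO_URL"] = "https://studio.example.com"
os.environ["DVC_STUDIO_TOKEN"] = "placeholder-token"
os.environ["DVC_STUDIO_TEAM"] = "my-team"

from datachain.remote.studio import StudioClient

client = StudioClient()
print(client.url)   # https://studio.example.com/api
print(client.team)  # my-team
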
@@ -80,7 +123,7 @@ class StudioClient:
 
         response = requests.post(
             f"{self.url}/{route}",
-            json={**data, "team_name": self.username},
+            json={**data, "team_name": self.team},
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
@@ -108,7 +151,7 @@
 
         response = requests.post(
             f"{self.url}/{route}",
-            json={**data, "team_name": self.username},
+            json={**data, "team_name": self.team},
             headers={
                 "Content-Type": "application/json",
                 "Authorization": f"token {self.token}",
@@ -174,6 +217,9 @@
         response = self._send_request_msgpack("ls", {"source": path})
         yield path, response
 
+    def ls_datasets(self) -> Response[LsData]:
+        return self._send_request("datachain/ls-datasets", {})
+
     def dataset_info(self, name: str) -> Response[DatasetInfoData]:
         def _parse_dataset_info(dataset_info):
             _parse_dates(dataset_info, ["created_at", "finished_at"])
@@ -182,7 +228,7 @@
 
             return dataset_info
 
-        response = self._send_request("dataset-info", {"dataset_name": name})
+        response = self._send_request("datachain/dataset-info", {"dataset_name": name})
         if response.ok:
             response.data = _parse_dataset_info(response.data)
         return response
@@ -192,13 +238,14 @@
     ) -> Response[DatasetRowsData]:
         req_data = {"dataset_name": name, "dataset_version": version}
         return self._send_request_msgpack(
-            "dataset-rows",
+            "datachain/dataset-rows",
             {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
         )
 
     def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
         response = self._send_request(
-            "dataset-stats", {"dataset_name": name, "dataset_version": version}
+            "datachain/dataset-stats",
+            {"dataset_name": name, "dataset_version": version},
         )
         if response.ok:
             response.data = DatasetStats(**response.data)
@@ -208,12 +255,14 @@
         self, name: str, version: int
     ) -> Response[DatasetExportSignedUrls]:
         return self._send_request(
-            "dataset-export", {"dataset_name": name, "dataset_version": version}
+            "datachain/dataset-export",
+            {"dataset_name": name, "dataset_version": version},
         )
 
     def dataset_export_status(
         self, name: str, version: int
     ) -> Response[DatasetExportStatus]:
         return self._send_request(
-            "dataset-export-status", {"dataset_name": name, "dataset_version": version}
+            "datachain/dataset-export-status",
+            {"dataset_name": name, "dataset_version": version},
         )
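
All dataset endpoints now sit under a datachain/ route prefix, and ls_datasets is new. A usage sketch, assuming a token and team are already configured; this mirrors what the new studio.py module does for the CLI:

from datachain.remote.studio import StudioClient

client = StudioClient(team="my-team")  # team is optional if configured globally
response = client.ls_datasets()        # POST <url>/api/datachain/ls-datasets
if response.ok and response.data:
    for d in response.data:
        for v in d.get("versions", []):
            print(f"{d.get('name')} (v{v.get('version')})")
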
datachain/studio.py ADDED
@@ -0,0 +1,129 @@
+import os
+from typing import TYPE_CHECKING
+
+from datachain.catalog.catalog import raise_remote_error
+from datachain.config import Config, ConfigLevel
+from datachain.error import DataChainError
+from datachain.remote.studio import StudioClient
+from datachain.utils import STUDIO_URL
+
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+POST_LOGIN_MESSAGE = (
+    "Once you've logged in, return here "
+    "and you'll be ready to start using DataChain with Studio."
+)
+
+
+def process_studio_cli_args(args: "Namespace"):
+    if args.cmd == "login":
+        return login(args)
+    if args.cmd == "logout":
+        return logout()
+    if args.cmd == "token":
+        return token()
+    if args.cmd == "datasets":
+        return list_datasets(args)
+    if args.cmd == "team":
+        return set_team(args)
+    raise DataChainError(f"Unknown command '{args.cmd}'.")
+
+
+def set_team(args: "Namespace"):
+    level = ConfigLevel.GLOBAL if args.__dict__.get("global") else ConfigLevel.LOCAL
+    config = Config(level)
+    with config.edit() as conf:
+        studio_conf = conf.get("studio", {})
+        studio_conf["team"] = args.team_name
+        conf["studio"] = studio_conf
+
+    print(f"Set default team to '{args.team_name}' in {config.config_file()}")
+
+
+def login(args: "Namespace"):
+    from dvc_studio_client.auth import StudioAuthError, get_access_token
+
+    config = Config().read().get("studio", {})
+    name = args.name
+    hostname = (
+        args.hostname
+        or os.environ.get("DVC_STUDIO_URL")
+        or config.get("url")
+        or STUDIO_URL
+    )
+    scopes = args.scopes
+
+    if config.get("url", hostname) == hostname and "token" in config:
+        raise DataChainError(
+            "Token already exists. "
+            "To login with a different token, "
+            "logout using `datachain studio logout`."
+        )
+
+    open_browser = not args.no_open
+    try:
+        _, access_token = get_access_token(
+            token_name=name,
+            hostname=hostname,
+            scopes=scopes,
+            open_browser=open_browser,
+            client_name="DataChain",
+            post_login_message=POST_LOGIN_MESSAGE,
+        )
+    except StudioAuthError as exc:
+        raise DataChainError(f"Failed to authenticate with Studio: {exc}") from exc
+
+    config_path = save_config(hostname, access_token)
+    print(f"Authentication complete. Saved token to {config_path}.")
+    return 0
+
+
+def logout():
+    with Config(ConfigLevel.GLOBAL).edit() as conf:
+        token = conf.get("studio", {}).get("token")
+        if not token:
+            raise DataChainError(
+                "Not logged in to Studio. Log in with 'datachain studio login'."
+            )
+
+        del conf["studio"]["token"]
+
+    print("Logged out from Studio. (you can log back in with 'datachain studio login')")
+
+
+def token():
+    config = Config().read().get("studio", {})
+    token = config.get("token")
+    if not token:
+        raise DataChainError(
+            "Not logged in to Studio. Log in with 'datachain studio login'."
+        )
+
+    print(token)
+
+
+def list_datasets(args: "Namespace"):
+    client = StudioClient(team=args.team)
+    response = client.ls_datasets()
+    if not response.ok:
+        raise_remote_error(response.message)
+    if not response.data:
+        print("No datasets found.")
+        return
+    for d in response.data:
+        name = d.get("name")
+        for v in d.get("versions", []):
+            version = v.get("version")
+            print(f"{name} (v{version})")
+
+
+def save_config(hostname, token):
+    config = Config(ConfigLevel.GLOBAL)
+    with config.edit() as conf:
+        studio_conf = conf.get("studio", {})
+        studio_conf["url"] = hostname
+        studio_conf["token"] = token
+        conf["studio"] = studio_conf
+
+    return config.config_file()
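
This module backs a new `datachain studio` command group (login, logout, token, team, datasets). A sketch of driving it programmatically with an argparse Namespace; the CLI flag spellings are assumptions, but the attribute names match what the functions above read:

from argparse import Namespace

from datachain.studio import process_studio_cli_args

# roughly `datachain studio team my-team --global` (flag spelling assumed);
# "global" is a Python keyword, so it has to be passed via a dict
process_studio_cli_args(Namespace(cmd="team", team_name="my-team", **{"global": True}))

# roughly `datachain studio datasets --team my-team` (flag spelling assumed)
process_studio_cli_args(Namespace(cmd="datasets", team="my-team"))
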
datachain/utils.py CHANGED
@@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
 from uuid import UUID
 
 import cloudpickle
+import platformdirs
 from dateutil import tz
 from dateutil.parser import isoparse
 from pydantic import BaseModel
@@ -25,6 +26,13 @@ if TYPE_CHECKING:
 NUL = b"\0"
 TIME_ZERO = datetime.fromtimestamp(0, tz=timezone.utc)
 
+APPNAME = "datachain"
+APPAUTHOR = "iterative"
+ENV_DATACHAIN_SYSTEM_CONFIG_DIR = "DATACHAIN_SYSTEM_CONFIG_DIR"
+ENV_DATACHAIN_GLOBAL_CONFIG_DIR = "DATACHAIN_GLOBAL_CONFIG_DIR"
+STUDIO_URL = "https://studio.dvc.ai"
+
+
 T = TypeVar("T", bound="DataChainDir")
 
 
@@ -33,6 +41,7 @@ class DataChainDir:
     CACHE = "cache"
     TMP = "tmp"
     DB = "db"
+    CONFIG = "config"
     ENV_VAR = "DATACHAIN_DIR"
     ENV_VAR_DATACHAIN_ROOT = "DATACHAIN_ROOT_DIR"
 
@@ -42,6 +51,7 @@ class DataChainDir:
         cache: Optional[str] = None,
         tmp: Optional[str] = None,
         db: Optional[str] = None,
+        config: Optional[str] = None,
     ) -> None:
         self.root = osp.abspath(root) if root is not None else self.default_root()
         self.cache = (
@@ -51,12 +61,24 @@ class DataChainDir:
             osp.abspath(tmp) if tmp is not None else osp.join(self.root, self.TMP)
         )
         self.db = osp.abspath(db) if db is not None else osp.join(self.root, self.DB)
+        self.config = (
+            osp.abspath(config)
+            if config is not None
+            else osp.join(self.root, self.CONFIG)
+        )
+        self.config = (
+            osp.abspath(config)
+            if config is not None
+            else osp.join(self.root, self.CONFIG)
+        )
 
     def init(self):
         os.makedirs(self.root, exist_ok=True)
         os.makedirs(self.cache, exist_ok=True)
         os.makedirs(self.tmp, exist_ok=True)
         os.makedirs(osp.split(self.db)[0], exist_ok=True)
+        os.makedirs(osp.split(self.config)[0], exist_ok=True)
+        os.makedirs(osp.split(self.config)[0], exist_ok=True)
 
     @classmethod
     def default_root(cls) -> str:
@@ -82,6 +104,18 @@ class DataChainDir:
         return instance
 
 
+def system_config_dir():
+    return os.getenv(ENV_DATACHAIN_SYSTEM_CONFIG_DIR) or platformdirs.site_config_dir(
+        APPNAME, APPAUTHOR
+    )
+
+
+def global_config_dir():
+    return os.getenv(ENV_DATACHAIN_GLOBAL_CONFIG_DIR) or platformdirs.user_config_dir(
+        APPNAME, APPAUTHOR
+    )
+
+
 def human_time_to_int(time: str) -> Optional[int]:
     if not time:
         return None
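
The new helpers resolve per-platform config locations via platformdirs, with the two DATACHAIN_*_CONFIG_DIR environment variables as overrides. For example (the printed paths are typical Linux defaults; other platforms differ):

import os

from datachain.utils import global_config_dir, system_config_dir

print(system_config_dir())  # e.g. /etc/xdg/datachain on Linux
print(global_config_dir())  # e.g. ~/.config/datachain on Linux

# when set, the environment variable wins over platformdirs
os.environ["DATACHAIN_GLOBAL_CONFIG_DIR"] = "/tmp/datachain-config"
print(global_config_dir())  # /tmp/datachain-config
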
@@ -421,3 +455,27 @@ def env2bool(var, undefined=False):
     if var is None:
         return undefined
     return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
+
+
+def nested_dict_path_set(
+    data: dict[str, Any], path: Sequence[str], value: Any
+) -> dict[str, Any]:
+    """Sets a value inside a nested dict based on the list of dict keys as a path,
+    and will create sub-dicts as needed to set the value."""
+    sub_data = data
+    for element in path[:-1]:
+        if element not in sub_data:
+            sub_data[element] = {}
+        sub_data = sub_data[element]
+    sub_data[path[len(path) - 1]] = value
+    return data
+
+
+def row_to_nested_dict(
+    headers: Iterable[Sequence[str]], row: Iterable[Any]
+) -> dict[str, Any]:
+    """Converts a row to a nested dict based on the provided headers."""
+    result: dict[str, Any] = {}
+    for h, v in zip(headers, row):
+        nested_dict_path_set(result, h, v)
+    return result
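
Together these helpers turn a flat result row back into a nested dict, using each header as a key path. A worked example with illustrative headers and values:

from datachain.utils import row_to_nested_dict

headers = [("file", "path"), ("file", "size"), ("name",)]
row = ["images/cat.jpg", 2048, "cat"]

print(row_to_nested_dict(headers, row))
# {'file': {'path': 'images/cat.jpg', 'size': 2048}, 'name': 'cat'}
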

{datachain-0.6.1.dist-info → datachain-0.6.2.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.6.1
+Version: 0.6.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -19,7 +19,7 @@ License-File: LICENSE
 Requires-Dist: pyyaml
 Requires-Dist: tomlkit
 Requires-Dist: tqdm
-Requires-Dist: numpy
+Requires-Dist: numpy <3,>=1
 Requires-Dist: pandas >=2.0.0
 Requires-Dist: pyarrow
 Requires-Dist: typing-extensions
@@ -38,15 +38,16 @@ Requires-Dist: orjson >=3.10.5
 Requires-Dist: pydantic <3,>=2
 Requires-Dist: jmespath >=1.0
 Requires-Dist: datamodel-code-generator >=0.25
-Requires-Dist: Pillow <11,>=10.0.0
+Requires-Dist: Pillow <12,>=10.0.0
 Requires-Dist: msgpack <2,>=1.0.4
 Requires-Dist: psutil
 Requires-Dist: huggingface-hub
 Requires-Dist: iterative-telemetry >=0.0.9
-Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
+Requires-Dist: platformdirs
+Requires-Dist: dvc-studio-client <1,>=0.21
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
-Requires-Dist: mypy ==1.12.0 ; extra == 'dev'
+Requires-Dist: mypy ==1.12.1 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -63,7 +64,7 @@ Requires-Dist: datachain[tests] ; extra == 'examples'
 Requires-Dist: numpy <2,>=1 ; extra == 'examples'
 Requires-Dist: defusedxml ; extra == 'examples'
 Requires-Dist: accelerate ; extra == 'examples'
-Requires-Dist: unstructured[embed-huggingface,pdf] ; extra == 'examples'
+Requires-Dist: unstructured[embed-huggingface,pdf] <0.16.0 ; extra == 'examples'
 Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
 Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
 Requires-Dist: onnx ==1.16.1 ; extra == 'examples'