datachain 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic; review the changes listed below for details.

Files changed (38)
  1. datachain/__init__.py +2 -0
  2. datachain/catalog/catalog.py +62 -228
  3. datachain/cli.py +136 -22
  4. datachain/client/fsspec.py +9 -0
  5. datachain/client/local.py +11 -32
  6. datachain/config.py +126 -51
  7. datachain/data_storage/schema.py +66 -33
  8. datachain/data_storage/sqlite.py +12 -4
  9. datachain/data_storage/warehouse.py +101 -129
  10. datachain/lib/convert/sql_to_python.py +8 -12
  11. datachain/lib/dc.py +275 -80
  12. datachain/lib/func/__init__.py +32 -0
  13. datachain/lib/func/aggregate.py +353 -0
  14. datachain/lib/func/func.py +152 -0
  15. datachain/lib/listing.py +6 -21
  16. datachain/lib/listing_info.py +4 -0
  17. datachain/lib/signal_schema.py +17 -8
  18. datachain/lib/udf.py +3 -3
  19. datachain/lib/utils.py +5 -0
  20. datachain/listing.py +22 -48
  21. datachain/query/__init__.py +1 -2
  22. datachain/query/batch.py +0 -1
  23. datachain/query/dataset.py +33 -46
  24. datachain/query/schema.py +1 -61
  25. datachain/query/session.py +33 -25
  26. datachain/remote/studio.py +63 -14
  27. datachain/sql/functions/__init__.py +1 -1
  28. datachain/sql/functions/aggregate.py +47 -0
  29. datachain/sql/functions/array.py +0 -8
  30. datachain/sql/sqlite/base.py +20 -2
  31. datachain/studio.py +129 -0
  32. datachain/utils.py +58 -0
  33. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/METADATA +7 -6
  34. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/RECORD +38 -33
  35. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/WHEEL +1 -1
  36. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/LICENSE +0 -0
  37. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/entry_points.txt +0 -0
  38. {datachain-0.6.0.dist-info → datachain-0.6.2.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  import json
2
2
  import logging
3
+ import os
3
4
  from collections.abc import Iterable, Iterator
4
5
  from datetime import datetime, timedelta, timezone
5
6
  from struct import unpack
@@ -10,8 +11,10 @@ from typing import (
10
11
  TypeVar,
11
12
  )
12
13
 
14
+ from datachain.config import Config
13
15
  from datachain.dataset import DatasetStats
14
- from datachain.utils import retry_with_backoff
16
+ from datachain.error import DataChainError
17
+ from datachain.utils import STUDIO_URL, retry_with_backoff
15
18
 
16
19
  T = TypeVar("T")
17
20
  LsData = Optional[list[dict[str, Any]]]
@@ -54,14 +57,54 @@ class Response(Generic[T]):
54
57
 
55
58
 
56
59
  class StudioClient:
57
- def __init__(
58
- self, url: str, username: str, token: str, timeout: float = 3600.0
59
- ) -> None:
60
+ def __init__(self, timeout: float = 3600.0, team: Optional[str] = None) -> None:
60
61
  self._check_dependencies()
61
- self.url = url.rstrip("/")
62
- self.username = username
63
- self.token = token
64
62
  self.timeout = timeout
63
+ self._config = None
64
+ self._team = team
65
+
66
+ @property
67
+ def token(self) -> str:
68
+ token = os.environ.get("DVC_STUDIO_TOKEN") or self.config.get("token")
69
+
70
+ if not token:
71
+ raise DataChainError(
72
+ "Studio token is not set. Use `datachain studio login` "
73
+ "or environment variable `DVC_STUDIO_TOKEN` to set it."
74
+ )
75
+
76
+ return token
77
+
78
+ @property
79
+ def url(self) -> str:
80
+ return (
81
+ os.environ.get("DVC_STUDIO_URL") or self.config.get("url") or STUDIO_URL
82
+ ) + "/api"
83
+
84
+ @property
85
+ def config(self) -> dict:
86
+ if self._config is None:
87
+ self._config = Config().read().get("studio", {})
88
+ return self._config # type: ignore [return-value]
89
+
90
+ @property
91
+ def team(self) -> str:
92
+ if self._team is None:
93
+ self._team = self._get_team()
94
+ return self._team
95
+
96
+ def _get_team(self) -> str:
97
+ team = os.environ.get("DVC_STUDIO_TEAM") or self.config.get("team")
98
+
99
+ if not team:
100
+ raise DataChainError(
101
+ "Studio team is not set. "
102
+ "Use `datachain studio team <team_name>` "
103
+ "or environment variable `DVC_STUDIO_TEAM` to set it."
104
+ "You can also set it in the config file as team under studio."
105
+ )
106
+
107
+ return team
65
108
 
66
109
  def _check_dependencies(self) -> None:
67
110
  try:
@@ -80,7 +123,7 @@ class StudioClient:
80
123
 
81
124
  response = requests.post(
82
125
  f"{self.url}/{route}",
83
- json={**data, "team_name": self.username},
126
+ json={**data, "team_name": self.team},
84
127
  headers={
85
128
  "Content-Type": "application/json",
86
129
  "Authorization": f"token {self.token}",
@@ -108,7 +151,7 @@ class StudioClient:
108
151
 
109
152
  response = requests.post(
110
153
  f"{self.url}/{route}",
111
- json={**data, "team_name": self.username},
154
+ json={**data, "team_name": self.team},
112
155
  headers={
113
156
  "Content-Type": "application/json",
114
157
  "Authorization": f"token {self.token}",
@@ -174,6 +217,9 @@ class StudioClient:
174
217
  response = self._send_request_msgpack("ls", {"source": path})
175
218
  yield path, response
176
219
 
220
+ def ls_datasets(self) -> Response[LsData]:
221
+ return self._send_request("datachain/ls-datasets", {})
222
+
177
223
  def dataset_info(self, name: str) -> Response[DatasetInfoData]:
178
224
  def _parse_dataset_info(dataset_info):
179
225
  _parse_dates(dataset_info, ["created_at", "finished_at"])
@@ -182,7 +228,7 @@ class StudioClient:
182
228
 
183
229
  return dataset_info
184
230
 
185
- response = self._send_request("dataset-info", {"dataset_name": name})
231
+ response = self._send_request("datachain/dataset-info", {"dataset_name": name})
186
232
  if response.ok:
187
233
  response.data = _parse_dataset_info(response.data)
188
234
  return response
@@ -192,13 +238,14 @@ class StudioClient:
192
238
  ) -> Response[DatasetRowsData]:
193
239
  req_data = {"dataset_name": name, "dataset_version": version}
194
240
  return self._send_request_msgpack(
195
- "dataset-rows",
241
+ "datachain/dataset-rows",
196
242
  {**req_data, "offset": offset, "limit": DATASET_ROWS_CHUNK_SIZE},
197
243
  )
198
244
 
199
245
  def dataset_stats(self, name: str, version: int) -> Response[DatasetStatsData]:
200
246
  response = self._send_request(
201
- "dataset-stats", {"dataset_name": name, "dataset_version": version}
247
+ "datachain/dataset-stats",
248
+ {"dataset_name": name, "dataset_version": version},
202
249
  )
203
250
  if response.ok:
204
251
  response.data = DatasetStats(**response.data)
@@ -208,12 +255,14 @@ class StudioClient:
208
255
  self, name: str, version: int
209
256
  ) -> Response[DatasetExportSignedUrls]:
210
257
  return self._send_request(
211
- "dataset-export", {"dataset_name": name, "dataset_version": version}
258
+ "datachain/dataset-export",
259
+ {"dataset_name": name, "dataset_version": version},
212
260
  )
213
261
 
214
262
  def dataset_export_status(
215
263
  self, name: str, version: int
216
264
  ) -> Response[DatasetExportStatus]:
217
265
  return self._send_request(
218
- "dataset-export-status", {"dataset_name": name, "dataset_version": version}
266
+ "datachain/dataset-export-status",
267
+ {"dataset_name": name, "dataset_version": version},
219
268
  )
@@ -1,7 +1,7 @@
1
1
  from sqlalchemy.sql.expression import func
2
2
 
3
3
  from . import array, path, string
4
- from .array import avg
4
+ from .aggregate import avg
5
5
  from .conditional import greatest, least
6
6
  from .random import rand
7
7
 
@@ -0,0 +1,47 @@
# datachain/sql/functions/aggregate.py — SQL aggregate function definitions
# (new module added in 0.6.2; reconstructed from the diff)
from sqlalchemy.sql.functions import GenericFunction, ReturnTypeFromArgs

from datachain.sql.types import Float, String
from datachain.sql.utils import compiler_not_implemented


class avg(GenericFunction):  # noqa: N801
    """
    Returns the average of the column.
    """

    type = Float()
    package = "array"
    name = "avg"
    inherit_cache = True


class group_concat(GenericFunction):  # noqa: N801
    """
    Returns the concatenated string of the column.
    """

    type = String()
    package = "array"
    name = "group_concat"
    inherit_cache = True


class any_value(ReturnTypeFromArgs):  # noqa: N801
    """
    Returns first value of the column.
    """

    inherit_cache = True


class collect(ReturnTypeFromArgs):  # noqa: N801
    """
    Returns an array of the column.
    """

    inherit_cache = True


# Register "not implemented" fallback compilers so that dialects without an
# explicit compiler (registered e.g. in sqlite/base.py) fail loudly instead of
# emitting an unsupported bare function call.
compiler_not_implemented(avg)
compiler_not_implemented(group_concat)
compiler_not_implemented(any_value)
# BUG FIX: `collect` was the only aggregate missing this registration; without
# it, dialects lacking a specific compiler would silently render COLLECT(...)
# rather than raising a clear "not implemented" error like its siblings.
compiler_not_implemented(collect)
@@ -44,15 +44,7 @@ class sip_hash_64(GenericFunction): # noqa: N801
44
44
  inherit_cache = True
45
45
 
46
46
 
47
- class avg(GenericFunction): # noqa: N801
48
- type = Float()
49
- package = "array"
50
- name = "avg"
51
- inherit_cache = True
52
-
53
-
54
47
  compiler_not_implemented(cosine_distance)
55
48
  compiler_not_implemented(euclidean_distance)
56
49
  compiler_not_implemented(length)
57
50
  compiler_not_implemented(sip_hash_64)
58
- compiler_not_implemented(avg)
@@ -14,7 +14,7 @@ from sqlalchemy.sql.elements import literal
14
14
  from sqlalchemy.sql.expression import case
15
15
  from sqlalchemy.sql.functions import func
16
16
 
17
- from datachain.sql.functions import array, conditional, random, string
17
+ from datachain.sql.functions import aggregate, array, conditional, random, string
18
18
  from datachain.sql.functions import path as sql_path
19
19
  from datachain.sql.selectable import Values, base_values_compiler
20
20
  from datachain.sql.sqlite.types import (
@@ -84,7 +84,10 @@ def setup():
84
84
  compiles(conditional.least, "sqlite")(compile_least)
85
85
  compiles(Values, "sqlite")(compile_values)
86
86
  compiles(random.rand, "sqlite")(compile_rand)
87
- compiles(array.avg, "sqlite")(compile_avg)
87
+ compiles(aggregate.avg, "sqlite")(compile_avg)
88
+ compiles(aggregate.group_concat, "sqlite")(compile_group_concat)
89
+ compiles(aggregate.any_value, "sqlite")(compile_any_value)
90
+ compiles(aggregate.collect, "sqlite")(compile_collect)
88
91
 
89
92
  if load_usearch_extension(sqlite3.connect(":memory:")):
90
93
  compiles(array.cosine_distance, "sqlite")(compile_cosine_distance_ext)
@@ -400,6 +403,21 @@ def compile_avg(element, compiler, **kwargs):
400
403
  return compiler.process(func.avg(*element.clauses.clauses), **kwargs)
401
404
 
402
405
 
406
def compile_group_concat(element, compiler, **kwargs):
    """Render group_concat using SQLite's aggregate_strings()."""
    clauses = element.clauses.clauses
    return compiler.process(func.aggregate_strings(*clauses), **kwargs)


def compile_any_value(element, compiler, **kwargs):
    """Render any_value as the bare column.

    SQLite documents that a bare column in an aggregate query yields a value
    taken from an arbitrary row of the group, see
    https://www.sqlite.org/lang_select.html#bare_columns_in_an_aggregate_query
    """
    clauses = element.clauses.clauses
    return compiler.process(*clauses, **kwargs)


def compile_collect(element, compiler, **kwargs):
    """Render collect as a JSON array via SQLite's json_group_array()."""
    clauses = element.clauses.clauses
    return compiler.process(func.json_group_array(*clauses), **kwargs)
420
+
403
421
  def load_usearch_extension(conn) -> bool:
404
422
  try:
405
423
  # usearch is part of the vector optional dependencies
datachain/studio.py ADDED
@@ -0,0 +1,129 @@
1
+ import os
2
+ from typing import TYPE_CHECKING
3
+
4
+ from datachain.catalog.catalog import raise_remote_error
5
+ from datachain.config import Config, ConfigLevel
6
+ from datachain.error import DataChainError
7
+ from datachain.remote.studio import StudioClient
8
+ from datachain.utils import STUDIO_URL
9
+
10
+ if TYPE_CHECKING:
11
+ from argparse import Namespace
12
+
13
+ POST_LOGIN_MESSAGE = (
14
+ "Once you've logged in, return here "
15
+ "and you'll be ready to start using DataChain with Studio."
16
+ )
17
+
18
+
19
+ def process_studio_cli_args(args: "Namespace"):
20
+ if args.cmd == "login":
21
+ return login(args)
22
+ if args.cmd == "logout":
23
+ return logout()
24
+ if args.cmd == "token":
25
+ return token()
26
+ if args.cmd == "datasets":
27
+ return list_datasets(args)
28
+ if args.cmd == "team":
29
+ return set_team(args)
30
+ raise DataChainError(f"Unknown command '{args.cmd}'.")
31
+
32
+
33
+ def set_team(args: "Namespace"):
34
+ level = ConfigLevel.GLOBAL if args.__dict__.get("global") else ConfigLevel.LOCAL
35
+ config = Config(level)
36
+ with config.edit() as conf:
37
+ studio_conf = conf.get("studio", {})
38
+ studio_conf["team"] = args.team_name
39
+ conf["studio"] = studio_conf
40
+
41
+ print(f"Set default team to '{args.team_name}' in {config.config_file()}")
42
+
43
+
44
+ def login(args: "Namespace"):
45
+ from dvc_studio_client.auth import StudioAuthError, get_access_token
46
+
47
+ config = Config().read().get("studio", {})
48
+ name = args.name
49
+ hostname = (
50
+ args.hostname
51
+ or os.environ.get("DVC_STUDIO_URL")
52
+ or config.get("url")
53
+ or STUDIO_URL
54
+ )
55
+ scopes = args.scopes
56
+
57
+ if config.get("url", hostname) == hostname and "token" in config:
58
+ raise DataChainError(
59
+ "Token already exists. "
60
+ "To login with a different token, "
61
+ "logout using `datachain studio logout`."
62
+ )
63
+
64
+ open_browser = not args.no_open
65
+ try:
66
+ _, access_token = get_access_token(
67
+ token_name=name,
68
+ hostname=hostname,
69
+ scopes=scopes,
70
+ open_browser=open_browser,
71
+ client_name="DataChain",
72
+ post_login_message=POST_LOGIN_MESSAGE,
73
+ )
74
+ except StudioAuthError as exc:
75
+ raise DataChainError(f"Failed to authenticate with Studio: {exc}") from exc
76
+
77
+ config_path = save_config(hostname, access_token)
78
+ print(f"Authentication complete. Saved token to {config_path}.")
79
+ return 0
80
+
81
+
82
+ def logout():
83
+ with Config(ConfigLevel.GLOBAL).edit() as conf:
84
+ token = conf.get("studio", {}).get("token")
85
+ if not token:
86
+ raise DataChainError(
87
+ "Not logged in to Studio. Log in with 'datachain studio login'."
88
+ )
89
+
90
+ del conf["studio"]["token"]
91
+
92
+ print("Logged out from Studio. (you can log back in with 'datachain studio login')")
93
+
94
+
95
+ def token():
96
+ config = Config().read().get("studio", {})
97
+ token = config.get("token")
98
+ if not token:
99
+ raise DataChainError(
100
+ "Not logged in to Studio. Log in with 'datachain studio login'."
101
+ )
102
+
103
+ print(token)
104
+
105
+
106
+ def list_datasets(args: "Namespace"):
107
+ client = StudioClient(team=args.team)
108
+ response = client.ls_datasets()
109
+ if not response.ok:
110
+ raise_remote_error(response.message)
111
+ if not response.data:
112
+ print("No datasets found.")
113
+ return
114
+ for d in response.data:
115
+ name = d.get("name")
116
+ for v in d.get("versions", []):
117
+ version = v.get("version")
118
+ print(f"{name} (v{version})")
119
+
120
+
121
+ def save_config(hostname, token):
122
+ config = Config(ConfigLevel.GLOBAL)
123
+ with config.edit() as conf:
124
+ studio_conf = conf.get("studio", {})
125
+ studio_conf["url"] = hostname
126
+ studio_conf["token"] = token
127
+ conf["studio"] = studio_conf
128
+
129
+ return config.config_file()
datachain/utils.py CHANGED
@@ -15,6 +15,7 @@ from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union
15
15
  from uuid import UUID
16
16
 
17
17
  import cloudpickle
18
+ import platformdirs
18
19
  from dateutil import tz
19
20
  from dateutil.parser import isoparse
20
21
  from pydantic import BaseModel
@@ -25,6 +26,13 @@ if TYPE_CHECKING:
25
26
  NUL = b"\0"
26
27
  TIME_ZERO = datetime.fromtimestamp(0, tz=timezone.utc)
27
28
 
29
+ APPNAME = "datachain"
30
+ APPAUTHOR = "iterative"
31
+ ENV_DATACHAIN_SYSTEM_CONFIG_DIR = "DATACHAIN_SYSTEM_CONFIG_DIR"
32
+ ENV_DATACHAIN_GLOBAL_CONFIG_DIR = "DATACHAIN_GLOBAL_CONFIG_DIR"
33
+ STUDIO_URL = "https://studio.dvc.ai"
34
+
35
+
28
36
  T = TypeVar("T", bound="DataChainDir")
29
37
 
30
38
 
@@ -33,6 +41,7 @@ class DataChainDir:
33
41
  CACHE = "cache"
34
42
  TMP = "tmp"
35
43
  DB = "db"
44
+ CONFIG = "config"
36
45
  ENV_VAR = "DATACHAIN_DIR"
37
46
  ENV_VAR_DATACHAIN_ROOT = "DATACHAIN_ROOT_DIR"
38
47
 
@@ -42,6 +51,7 @@ class DataChainDir:
42
51
  cache: Optional[str] = None,
43
52
  tmp: Optional[str] = None,
44
53
  db: Optional[str] = None,
54
+ config: Optional[str] = None,
45
55
  ) -> None:
46
56
  self.root = osp.abspath(root) if root is not None else self.default_root()
47
57
  self.cache = (
@@ -51,12 +61,24 @@ class DataChainDir:
51
61
  osp.abspath(tmp) if tmp is not None else osp.join(self.root, self.TMP)
52
62
  )
53
63
  self.db = osp.abspath(db) if db is not None else osp.join(self.root, self.DB)
64
+ self.config = (
65
+ osp.abspath(config)
66
+ if config is not None
67
+ else osp.join(self.root, self.CONFIG)
68
+ )
69
+ self.config = (
70
+ osp.abspath(config)
71
+ if config is not None
72
+ else osp.join(self.root, self.CONFIG)
73
+ )
54
74
 
55
75
  def init(self):
56
76
  os.makedirs(self.root, exist_ok=True)
57
77
  os.makedirs(self.cache, exist_ok=True)
58
78
  os.makedirs(self.tmp, exist_ok=True)
59
79
  os.makedirs(osp.split(self.db)[0], exist_ok=True)
80
+ os.makedirs(osp.split(self.config)[0], exist_ok=True)
81
+ os.makedirs(osp.split(self.config)[0], exist_ok=True)
60
82
 
61
83
  @classmethod
62
84
  def default_root(cls) -> str:
@@ -82,6 +104,18 @@ class DataChainDir:
82
104
  return instance
83
105
 
84
106
 
107
def system_config_dir():
    """System-wide config dir; the env var override wins over the platform default."""
    override = os.getenv(ENV_DATACHAIN_SYSTEM_CONFIG_DIR)
    return override or platformdirs.site_config_dir(APPNAME, APPAUTHOR)
111
+
112
+
113
def global_config_dir():
    """Per-user config dir; the env var override wins over the platform default."""
    override = os.getenv(ENV_DATACHAIN_GLOBAL_CONFIG_DIR)
    return override or platformdirs.user_config_dir(APPNAME, APPAUTHOR)
117
+
118
+
85
119
  def human_time_to_int(time: str) -> Optional[int]:
86
120
  if not time:
87
121
  return None
@@ -421,3 +455,27 @@ def env2bool(var, undefined=False):
421
455
  if var is None:
422
456
  return undefined
423
457
  return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
458
+
459
+
460
def nested_dict_path_set(
    data: dict[str, Any], path: Sequence[str], value: Any
) -> dict[str, Any]:
    """Set ``value`` inside a nested dict at the key sequence ``path``.

    Intermediate dicts are created as needed. ``data`` is mutated in place.

    Args:
        data: Dict to modify.
        path: Non-empty sequence of keys; the last key receives ``value``.
        value: Value to store.

    Returns:
        The same ``data`` dict, for convenient chaining.
    """
    sub_data = data
    for element in path[:-1]:
        # setdefault creates the intermediate dict and descends in one step
        sub_data = sub_data.setdefault(element, {})
    sub_data[path[-1]] = value  # idiomatic negative index (was path[len(path) - 1])
    return data
472
+
473
+
474
def row_to_nested_dict(
    headers: Iterable[Sequence[str]], row: Iterable[Any]
) -> dict[str, Any]:
    """Build a nested dict by pairing each header key-path with its row value."""
    result: dict[str, Any] = {}
    for header_path, value in zip(headers, row):
        nested_dict_path_set(result, header_path, value)
    return result
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.0
3
+ Version: 0.6.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -19,7 +19,7 @@ License-File: LICENSE
19
19
  Requires-Dist: pyyaml
20
20
  Requires-Dist: tomlkit
21
21
  Requires-Dist: tqdm
22
- Requires-Dist: numpy
22
+ Requires-Dist: numpy <3,>=1
23
23
  Requires-Dist: pandas >=2.0.0
24
24
  Requires-Dist: pyarrow
25
25
  Requires-Dist: typing-extensions
@@ -38,15 +38,16 @@ Requires-Dist: orjson >=3.10.5
38
38
  Requires-Dist: pydantic <3,>=2
39
39
  Requires-Dist: jmespath >=1.0
40
40
  Requires-Dist: datamodel-code-generator >=0.25
41
- Requires-Dist: Pillow <11,>=10.0.0
41
+ Requires-Dist: Pillow <12,>=10.0.0
42
42
  Requires-Dist: msgpack <2,>=1.0.4
43
43
  Requires-Dist: psutil
44
44
  Requires-Dist: huggingface-hub
45
45
  Requires-Dist: iterative-telemetry >=0.0.9
46
- Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
46
+ Requires-Dist: platformdirs
47
+ Requires-Dist: dvc-studio-client <1,>=0.21
47
48
  Provides-Extra: dev
48
49
  Requires-Dist: datachain[docs,tests] ; extra == 'dev'
49
- Requires-Dist: mypy ==1.11.2 ; extra == 'dev'
50
+ Requires-Dist: mypy ==1.12.1 ; extra == 'dev'
50
51
  Requires-Dist: types-python-dateutil ; extra == 'dev'
51
52
  Requires-Dist: types-pytz ; extra == 'dev'
52
53
  Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -63,7 +64,7 @@ Requires-Dist: datachain[tests] ; extra == 'examples'
63
64
  Requires-Dist: numpy <2,>=1 ; extra == 'examples'
64
65
  Requires-Dist: defusedxml ; extra == 'examples'
65
66
  Requires-Dist: accelerate ; extra == 'examples'
66
- Requires-Dist: unstructured[embed-huggingface,pdf] ; extra == 'examples'
67
+ Requires-Dist: unstructured[embed-huggingface,pdf] <0.16.0 ; extra == 'examples'
67
68
  Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
68
69
  Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
69
70
  Requires-Dist: onnx ==1.16.1 ; extra == 'examples'