datachain 0.34.6__py3-none-any.whl → 0.35.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (105)
  1. datachain/asyn.py +11 -12
  2. datachain/cache.py +5 -5
  3. datachain/catalog/catalog.py +75 -83
  4. datachain/catalog/loader.py +3 -3
  5. datachain/checkpoint.py +1 -2
  6. datachain/cli/__init__.py +2 -4
  7. datachain/cli/commands/datasets.py +13 -13
  8. datachain/cli/commands/ls.py +4 -4
  9. datachain/cli/commands/query.py +3 -3
  10. datachain/cli/commands/show.py +2 -2
  11. datachain/cli/parser/job.py +1 -1
  12. datachain/cli/parser/utils.py +1 -2
  13. datachain/cli/utils.py +1 -2
  14. datachain/client/azure.py +2 -2
  15. datachain/client/fsspec.py +11 -21
  16. datachain/client/gcs.py +3 -3
  17. datachain/client/http.py +4 -4
  18. datachain/client/local.py +4 -4
  19. datachain/client/s3.py +3 -3
  20. datachain/config.py +4 -8
  21. datachain/data_storage/db_engine.py +5 -5
  22. datachain/data_storage/metastore.py +107 -107
  23. datachain/data_storage/schema.py +18 -24
  24. datachain/data_storage/sqlite.py +21 -28
  25. datachain/data_storage/warehouse.py +13 -13
  26. datachain/dataset.py +64 -70
  27. datachain/delta.py +21 -18
  28. datachain/diff/__init__.py +13 -13
  29. datachain/func/aggregate.py +9 -11
  30. datachain/func/array.py +12 -12
  31. datachain/func/base.py +7 -4
  32. datachain/func/conditional.py +9 -13
  33. datachain/func/func.py +45 -42
  34. datachain/func/numeric.py +5 -7
  35. datachain/func/string.py +2 -2
  36. datachain/hash_utils.py +54 -81
  37. datachain/job.py +8 -8
  38. datachain/lib/arrow.py +17 -14
  39. datachain/lib/audio.py +6 -6
  40. datachain/lib/clip.py +5 -4
  41. datachain/lib/convert/python_to_sql.py +4 -22
  42. datachain/lib/convert/values_to_tuples.py +4 -9
  43. datachain/lib/data_model.py +20 -19
  44. datachain/lib/dataset_info.py +6 -6
  45. datachain/lib/dc/csv.py +10 -10
  46. datachain/lib/dc/database.py +28 -29
  47. datachain/lib/dc/datachain.py +98 -97
  48. datachain/lib/dc/datasets.py +22 -22
  49. datachain/lib/dc/hf.py +4 -4
  50. datachain/lib/dc/json.py +9 -10
  51. datachain/lib/dc/listings.py +5 -8
  52. datachain/lib/dc/pandas.py +3 -6
  53. datachain/lib/dc/parquet.py +5 -5
  54. datachain/lib/dc/records.py +5 -5
  55. datachain/lib/dc/storage.py +12 -12
  56. datachain/lib/dc/storage_pattern.py +2 -2
  57. datachain/lib/dc/utils.py +11 -14
  58. datachain/lib/dc/values.py +3 -6
  59. datachain/lib/file.py +32 -28
  60. datachain/lib/hf.py +7 -5
  61. datachain/lib/image.py +13 -13
  62. datachain/lib/listing.py +5 -5
  63. datachain/lib/listing_info.py +1 -2
  64. datachain/lib/meta_formats.py +1 -2
  65. datachain/lib/model_store.py +3 -3
  66. datachain/lib/namespaces.py +4 -6
  67. datachain/lib/projects.py +5 -9
  68. datachain/lib/pytorch.py +10 -10
  69. datachain/lib/settings.py +23 -23
  70. datachain/lib/signal_schema.py +52 -44
  71. datachain/lib/text.py +8 -7
  72. datachain/lib/udf.py +25 -17
  73. datachain/lib/udf_signature.py +11 -11
  74. datachain/lib/video.py +3 -4
  75. datachain/lib/webdataset.py +30 -35
  76. datachain/lib/webdataset_laion.py +15 -16
  77. datachain/listing.py +4 -4
  78. datachain/model/bbox.py +3 -1
  79. datachain/namespace.py +4 -4
  80. datachain/node.py +6 -6
  81. datachain/nodes_thread_pool.py +0 -1
  82. datachain/plugins.py +1 -7
  83. datachain/project.py +4 -4
  84. datachain/query/batch.py +7 -8
  85. datachain/query/dataset.py +80 -87
  86. datachain/query/dispatch.py +7 -7
  87. datachain/query/metrics.py +3 -4
  88. datachain/query/params.py +2 -3
  89. datachain/query/schema.py +7 -6
  90. datachain/query/session.py +7 -7
  91. datachain/query/udf.py +8 -7
  92. datachain/query/utils.py +3 -5
  93. datachain/remote/studio.py +33 -39
  94. datachain/script_meta.py +12 -12
  95. datachain/sql/sqlite/base.py +6 -9
  96. datachain/studio.py +30 -30
  97. datachain/toolkit/split.py +1 -2
  98. datachain/utils.py +21 -21
  99. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/METADATA +2 -3
  100. datachain-0.35.0.dist-info/RECORD +173 -0
  101. datachain-0.34.6.dist-info/RECORD +0 -173
  102. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/WHEEL +0 -0
  103. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/entry_points.txt +0 -0
  104. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/licenses/LICENSE +0 -0
  105. {datachain-0.34.6.dist-info → datachain-0.35.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/csv.py CHANGED
@@ -1,6 +1,6 @@
  import os
- from collections.abc import Sequence
- from typing import TYPE_CHECKING, Callable, Optional, Union
+ from collections.abc import Callable, Sequence
+ from typing import TYPE_CHECKING

  from datachain.lib.dc.utils import DatasetPrepareError, OutputType
  from datachain.lib.model_store import ModelStore
@@ -13,18 +13,18 @@ if TYPE_CHECKING:


  def read_csv(
-     path: Union[str, os.PathLike[str], list[str], list[os.PathLike[str]]],
-     delimiter: Optional[str] = None,
+     path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
+     delimiter: str | None = None,
      header: bool = True,
      output: OutputType = None,
      column: str = "",
      model_name: str = "",
      source: bool = True,
-     nrows: Optional[int] = None,
-     session: Optional[Session] = None,
-     settings: Optional[dict] = None,
-     column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
-     parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
+     nrows: int | None = None,
+     session: Session | None = None,
+     settings: dict | None = None,
+     column_types: dict[str, "str | ArrowDataType"] | None = None,
+     parse_options: dict[str, str | bool | Callable] | None = None,
      **kwargs,
  ) -> "DataChain":
      """Generate chain from csv files.
@@ -63,7 +63,7 @@ def read_csv(
          chain = dc.read_csv("s3://mybucket/dir")
          ```
      """
-     from pandas.io.parsers.readers import STR_NA_VALUES
+     from pandas._libs.parsers import STR_NA_VALUES
      from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
      from pyarrow.dataset import CsvFileFormat
      from pyarrow.lib import type_for_alias
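The read_csv hunks above only modernize the signature from Optional/Union to PEP 604 unions and swap the STR_NA_VALUES import path; the parameters themselves are unchanged. A minimal usage sketch against that signature (the bucket path and column name are made up for illustration):

```py
import datachain as dc

# Read all CSV files under a prefix; delimiter, nrows and column_types are
# optional keyword arguments per the signature shown in the diff above.
chain = dc.read_csv(
    "s3://mybucket/dir",                # hypothetical bucket/prefix
    delimiter=",",
    nrows=1_000,
    column_types={"price": "float64"},  # string aliases resolve via pyarrow's type_for_alias
)
```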
datachain/lib/dc/database.py CHANGED
@@ -2,7 +2,8 @@ import contextlib
  import itertools
  import os
  import sqlite3
- from typing import TYPE_CHECKING, Any, Optional, Union
+ from collections.abc import Iterator, Mapping, Sequence
+ from typing import TYPE_CHECKING, Any

  import sqlalchemy

@@ -12,8 +13,6 @@ from datachain.utils import batched
  DEFAULT_DATABASE_BATCH_SIZE = 10_000

  if TYPE_CHECKING:
-     from collections.abc import Iterator, Mapping, Sequence
-
      import sqlalchemy.orm  # noqa: TC004

      from datachain.lib.data_model import DataType
@@ -21,21 +20,21 @@ if TYPE_CHECKING:

      from .datachain import DataChain

-     ConnectionType = Union[
-         str,
-         sqlalchemy.engine.URL,
-         sqlalchemy.engine.interfaces.Connectable,
-         sqlalchemy.engine.Engine,
-         sqlalchemy.engine.Connection,
-         sqlalchemy.orm.Session,
-         sqlite3.Connection,
-     ]
+     ConnectionType = (
+         str
+         | sqlalchemy.engine.URL
+         | sqlalchemy.engine.interfaces.Connectable
+         | sqlalchemy.engine.Engine
+         | sqlalchemy.engine.Connection
+         | sqlalchemy.orm.Session
+         | sqlite3.Connection
+     )


  @contextlib.contextmanager
  def _connect(
      connection: "ConnectionType",
- ) -> "Iterator[sqlalchemy.engine.Connection]":
+ ) -> Iterator[sqlalchemy.engine.Connection]:
      import sqlalchemy.orm

      with contextlib.ExitStack() as stack:
@@ -74,9 +73,9 @@ def to_database(
      connection: "ConnectionType",
      *,
      batch_size: int = DEFAULT_DATABASE_BATCH_SIZE,
-     on_conflict: Optional[str] = None,
-     conflict_columns: Optional[list[str]] = None,
-     column_mapping: Optional[dict[str, Optional[str]]] = None,
+     on_conflict: str | None = None,
+     conflict_columns: list[str] | None = None,
+     column_mapping: dict[str, str | None] | None = None,
  ) -> int:
      """
      Implementation function for exporting DataChain to database tables.
@@ -150,8 +149,8 @@ def to_database(


  def _normalize_column_mapping(
-     column_mapping: dict[str, Optional[str]],
- ) -> dict[str, Optional[str]]:
+     column_mapping: dict[str, str | None],
+ ) -> dict[str, str | None]:
      """
      Convert column mapping keys from DataChain format (dots) to database format
      (double underscores).
@@ -163,7 +162,7 @@ def _normalize_column_mapping(
      if not column_mapping:
          return {}

-     normalized_mapping: dict[str, Optional[str]] = {}
+     normalized_mapping: dict[str, str | None] = {}
      original_keys: dict[str, str] = {}
      for key, value in column_mapping.items():
          db_key = ColumnMeta.to_db_name(key)
@@ -181,7 +180,7 @@ def _normalize_column_mapping(
          from collections import defaultdict

          default_factory = column_mapping.default_factory
-         result: dict[str, Optional[str]] = defaultdict(default_factory)
+         result: dict[str, str | None] = defaultdict(default_factory)
          result.update(normalized_mapping)
          return result

@@ -189,8 +188,8 @@ def _normalize_column_mapping(


  def _normalize_conflict_columns(
-     conflict_columns: Optional[list[str]], column_mapping: dict[str, Optional[str]]
- ) -> Optional[list[str]]:
+     conflict_columns: list[str] | None, column_mapping: dict[str, str | None]
+ ) -> list[str] | None:
      """
      Normalize conflict_columns by converting DataChain format to database format
      and applying column mapping.
@@ -297,15 +296,15 @@ def _process_batch(


  def read_database(
-     query: Union[str, "sqlalchemy.sql.expression.Executable"],
+     query: "str | sqlalchemy.sql.expression.Executable",
      connection: "ConnectionType",
-     params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
+     params: Sequence[Mapping[str, Any]] | Mapping[str, Any] | None = None,
      *,
-     output: Optional["dict[str, DataType]"] = None,
-     session: Optional["Session"] = None,
-     settings: Optional[dict] = None,
+     output: dict[str, "DataType"] | None = None,
+     session: "Session | None" = None,
+     settings: dict | None = None,
      in_memory: bool = False,
-     infer_schema_length: Optional[int] = 100,
+     infer_schema_length: int | None = 100,
  ) -> "DataChain":
      """
      Read the results of a SQL query into a DataChain, using a given database connection.
@@ -382,7 +381,7 @@ def read_database(
  def _infer_schema(
      result: "sqlalchemy.engine.Result",
      to_infer: list[str],
-     infer_schema_length: Optional[int] = 100,
+     infer_schema_length: int | None = 100,
  ) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
      from datachain.lib.convert.values_to_tuples import values_to_tuples
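For context on the read_database signature above, a minimal sketch using an in-memory SQLite connection, which is one of the ConnectionType options shown in the diff. The table name and rows are illustrative, and it assumes read_database is re-exported at the package level like the other readers:

```py
import sqlite3

import datachain as dc

# sqlite3.Connection is the simplest ConnectionType variant to demonstrate;
# a SQLAlchemy Engine, Connection, or Session would work the same way.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE items (id INTEGER, name TEXT)")
conn.executemany("INSERT INTO items VALUES (?, ?)", [(1, "a"), (2, "b")])
conn.commit()

# Column types are inferred from the first rows (infer_schema_length=100 by
# default) unless an explicit `output` mapping is passed.
chain = dc.read_database("SELECT id, name FROM items", conn)
```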