datachain 0.14.2__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. datachain/__init__.py +20 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +7 -7
  4. datachain/catalog/__init__.py +2 -2
  5. datachain/catalog/catalog.py +621 -507
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +28 -18
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +24 -33
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +83 -52
  12. datachain/cli/commands/ls.py +17 -17
  13. datachain/cli/commands/show.py +4 -4
  14. datachain/cli/parser/__init__.py +8 -74
  15. datachain/cli/parser/job.py +95 -3
  16. datachain/cli/parser/studio.py +11 -4
  17. datachain/cli/parser/utils.py +1 -2
  18. datachain/cli/utils.py +2 -15
  19. datachain/client/azure.py +4 -4
  20. datachain/client/fsspec.py +45 -28
  21. datachain/client/gcs.py +6 -6
  22. datachain/client/hf.py +29 -2
  23. datachain/client/http.py +157 -0
  24. datachain/client/local.py +15 -11
  25. datachain/client/s3.py +17 -9
  26. datachain/config.py +4 -8
  27. datachain/data_storage/db_engine.py +12 -6
  28. datachain/data_storage/job.py +5 -1
  29. datachain/data_storage/metastore.py +1252 -186
  30. datachain/data_storage/schema.py +58 -45
  31. datachain/data_storage/serializer.py +105 -15
  32. datachain/data_storage/sqlite.py +286 -127
  33. datachain/data_storage/warehouse.py +250 -113
  34. datachain/dataset.py +353 -148
  35. datachain/delta.py +391 -0
  36. datachain/diff/__init__.py +27 -29
  37. datachain/error.py +60 -0
  38. datachain/func/__init__.py +2 -1
  39. datachain/func/aggregate.py +66 -42
  40. datachain/func/array.py +242 -38
  41. datachain/func/base.py +7 -4
  42. datachain/func/conditional.py +110 -60
  43. datachain/func/func.py +96 -45
  44. datachain/func/numeric.py +55 -38
  45. datachain/func/path.py +32 -20
  46. datachain/func/random.py +2 -2
  47. datachain/func/string.py +67 -37
  48. datachain/func/window.py +7 -8
  49. datachain/hash_utils.py +123 -0
  50. datachain/job.py +11 -7
  51. datachain/json.py +138 -0
  52. datachain/lib/arrow.py +58 -22
  53. datachain/lib/audio.py +245 -0
  54. datachain/lib/clip.py +14 -13
  55. datachain/lib/convert/flatten.py +5 -3
  56. datachain/lib/convert/python_to_sql.py +6 -10
  57. datachain/lib/convert/sql_to_python.py +8 -0
  58. datachain/lib/convert/values_to_tuples.py +156 -51
  59. datachain/lib/data_model.py +42 -20
  60. datachain/lib/dataset_info.py +36 -8
  61. datachain/lib/dc/__init__.py +8 -2
  62. datachain/lib/dc/csv.py +25 -28
  63. datachain/lib/dc/database.py +398 -0
  64. datachain/lib/dc/datachain.py +1289 -425
  65. datachain/lib/dc/datasets.py +320 -38
  66. datachain/lib/dc/hf.py +38 -24
  67. datachain/lib/dc/json.py +29 -32
  68. datachain/lib/dc/listings.py +112 -8
  69. datachain/lib/dc/pandas.py +16 -12
  70. datachain/lib/dc/parquet.py +35 -23
  71. datachain/lib/dc/records.py +31 -23
  72. datachain/lib/dc/storage.py +154 -64
  73. datachain/lib/dc/storage_pattern.py +251 -0
  74. datachain/lib/dc/utils.py +24 -16
  75. datachain/lib/dc/values.py +8 -9
  76. datachain/lib/file.py +622 -89
  77. datachain/lib/hf.py +69 -39
  78. datachain/lib/image.py +14 -14
  79. datachain/lib/listing.py +14 -11
  80. datachain/lib/listing_info.py +1 -2
  81. datachain/lib/meta_formats.py +3 -4
  82. datachain/lib/model_store.py +39 -7
  83. datachain/lib/namespaces.py +125 -0
  84. datachain/lib/projects.py +130 -0
  85. datachain/lib/pytorch.py +32 -21
  86. datachain/lib/settings.py +192 -56
  87. datachain/lib/signal_schema.py +427 -104
  88. datachain/lib/tar.py +1 -2
  89. datachain/lib/text.py +8 -7
  90. datachain/lib/udf.py +164 -76
  91. datachain/lib/udf_signature.py +60 -35
  92. datachain/lib/utils.py +118 -4
  93. datachain/lib/video.py +17 -9
  94. datachain/lib/webdataset.py +61 -56
  95. datachain/lib/webdataset_laion.py +15 -16
  96. datachain/listing.py +22 -10
  97. datachain/model/bbox.py +3 -1
  98. datachain/model/ultralytics/bbox.py +16 -12
  99. datachain/model/ultralytics/pose.py +16 -12
  100. datachain/model/ultralytics/segment.py +16 -12
  101. datachain/namespace.py +84 -0
  102. datachain/node.py +6 -6
  103. datachain/nodes_thread_pool.py +0 -1
  104. datachain/plugins.py +24 -0
  105. datachain/project.py +78 -0
  106. datachain/query/batch.py +40 -41
  107. datachain/query/dataset.py +604 -322
  108. datachain/query/dispatch.py +261 -154
  109. datachain/query/metrics.py +4 -6
  110. datachain/query/params.py +2 -3
  111. datachain/query/queue.py +3 -12
  112. datachain/query/schema.py +11 -6
  113. datachain/query/session.py +200 -33
  114. datachain/query/udf.py +34 -2
  115. datachain/remote/studio.py +171 -69
  116. datachain/script_meta.py +12 -12
  117. datachain/semver.py +68 -0
  118. datachain/sql/__init__.py +2 -0
  119. datachain/sql/functions/array.py +33 -1
  120. datachain/sql/postgresql_dialect.py +9 -0
  121. datachain/sql/postgresql_types.py +21 -0
  122. datachain/sql/sqlite/__init__.py +5 -1
  123. datachain/sql/sqlite/base.py +102 -29
  124. datachain/sql/sqlite/types.py +8 -13
  125. datachain/sql/types.py +70 -15
  126. datachain/studio.py +223 -46
  127. datachain/toolkit/split.py +31 -10
  128. datachain/utils.py +101 -59
  129. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/METADATA +77 -22
  130. datachain-0.39.0.dist-info/RECORD +173 -0
  131. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/WHEEL +1 -1
  132. datachain/cli/commands/query.py +0 -53
  133. datachain/query/utils.py +0 -42
  134. datachain-0.14.2.dist-info/RECORD +0 -158
  135. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  136. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  137. {datachain-0.14.2.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc/listings.py

@@ -1,25 +1,61 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING

+from datachain.lib.listing import LISTING_PREFIX, ls
 from datachain.lib.listing_info import ListingInfo
+from datachain.lib.settings import Settings
+from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
+from datachain.query.dataset import DatasetQuery, QueryStep, step_result

 from .values import read_values

 if TYPE_CHECKING:
     from typing_extensions import ParamSpec

+    from datachain.dataset import DatasetVersion
+    from datachain.query.dataset import StepResult
+
     from .datachain import DataChain

     P = ParamSpec("P")


+class ReadOnlyQueryStep(QueryStep):
+    """
+    This step is used to read the dataset in read-only mode.
+    It is used to avoid the need to read the table metadata from the warehouse.
+    This is useful when we want to list the files in the dataset.
+    """
+
+    def apply(self) -> "StepResult":
+        import sqlalchemy as sa
+
+        def q(*columns):
+            return sa.select(*columns)
+
+        table_name = self.catalog.warehouse.dataset_table_name(
+            self.dataset, self.dataset_version
+        )
+        dataset_row_cls = self.catalog.warehouse.schema.dataset_row_cls
+        table = dataset_row_cls.new_table(
+            table_name,
+            columns=(
+                [
+                    *dataset_row_cls.sys_columns(),
+                    *dataset_row_cls.listing_columns(),
+                ]
+            ),
+        )
+
+        return step_result(
+            q, table.columns, dependencies=[(self.dataset, self.dataset_version)]
+        )
+
+
 def listings(
-    session: Optional[Session] = None,
+    session: Session | None = None,
     in_memory: bool = False,
-    object_name: str = "listing",
+    column: str = "listing",
     **kwargs,
 ) -> "DataChain":
     """Generate chain with list of cached listings.
@@ -38,6 +74,74 @@ def listings(
     return read_values(
         session=session,
         in_memory=in_memory,
-        output={object_name: ListingInfo},
-        **{object_name: catalog.listings()},  # type: ignore[arg-type]
+        output={column: ListingInfo},
+        **{column: catalog.listings()},  # type: ignore[arg-type]
     )
+
+
+def read_listing_dataset(
+    name: str,
+    version: str | None = None,
+    path: str = "",
+    session: Session | None = None,
+    settings: dict | None = None,
+) -> tuple["DataChain", "DatasetVersion"]:
+    """Read a listing dataset and return a DataChain and listing version.
+
+    Args:
+        name: Name of the dataset
+        version: Version of the dataset
+        path: Path within the listing to read. Path can have globs.
+        session: Optional Session object to use for reading
+        settings: Optional settings dictionary to use for reading
+
+    Returns:
+        tuple[DataChain, DatasetVersion]: A tuple containing:
+            - DataChain configured for listing files
+            - DatasetVersion object for the specified listing version
+
+    Example:
+        ```py
+        import datachain as dc
+        chain, listing_version = dc.read_listing_dataset(
+            "lst__s3://my-bucket/my-path", version="1.0.0", path="my-path"
+        )
+        chain.show()
+        ```
+    """
+    # Configure and return a DataChain for reading listing dataset files
+    # Uses ReadOnlyQueryStep to avoid warehouse metadata lookups
+    from datachain.lib.dc import Sys
+    from datachain.lib.file import File
+
+    from .datachain import DataChain
+
+    if not name.startswith(LISTING_PREFIX):
+        name = LISTING_PREFIX + name
+
+    session = Session.get(session)
+    dataset = session.catalog.get_dataset(name)
+    if version is None:
+        version = dataset.latest_version
+
+    query = DatasetQuery(name=name, session=session)
+
+    if settings:
+        cfg = {**settings}
+        if "prefetch" not in cfg:
+            cfg["prefetch"] = 0
+        _settings = Settings(**cfg)
+    else:
+        _settings = Settings(prefetch=0)
+    signal_schema = SignalSchema({"sys": Sys, "file": File})
+
+    query.starting_step = ReadOnlyQueryStep(query.catalog, dataset, version)
+    query.version = version
+    # We already know that this is a listing dataset,
+    # so we can set the listing function to True
+    query.set_listing_fn(lambda: True)
+
+    chain = DataChain(query, _settings, signal_schema)
+    chain = ls(chain, path, recursive=True, column="file")
+
+    return chain, dataset.get_version(version)
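
The `object_name` → `column` rename seen here recurs across all the `read_*` constructors in this release. A minimal sketch of the updated `listings()` call, assuming the function is re-exported at the package root like the other constructors shown in the docstring examples:

```py
import datachain as dc

# Enumerate cached listings; the keyword is now `column` (was `object_name`).
chain = dc.listings(column="listing")
chain.show()
```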
datachain/lib/dc/pandas.py

@@ -1,7 +1,4 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-)
+from typing import TYPE_CHECKING

 from datachain.query import Session

@@ -19,10 +16,10 @@ if TYPE_CHECKING:
 def read_pandas(  # type: ignore[override]
     df: "pd.DataFrame",
     name: str = "",
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    object_name: str = "",
+    column: str = "",
 ) -> "DataChain":
     """Generate chain from pandas data-frame.

@@ -37,20 +34,27 @@ def read_pandas(  # type: ignore[override]
     """
     from .utils import DatasetPrepareError

-    fr_map = {col.lower(): df[col].tolist() for col in df.columns}
+    def get_col_name(col):
+        if isinstance(col, tuple):
+            # Join tuple elements with underscore for MultiIndex columns
+            return "_".join(map(str, col)).lower()
+        # Handle regular string column names
+        return str(col).lower()

-    for column in fr_map:
-        if not column.isidentifier():
+    fr_map = {get_col_name(col): df[col].tolist() for col in df.columns}
+
+    for c in fr_map:
+        if not c.isidentifier():
             raise DatasetPrepareError(
                 name,
-                f"import from pandas error - '{column}' cannot be a column name",
+                f"import from pandas error - '{c}' cannot be a column name",
             )

     return read_values(
         name,
         session,
         settings=settings,
-        object_name=object_name,
+        column=column,
         in_memory=in_memory,
         **fr_map,
     )
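
The new `get_col_name` helper means `read_pandas` no longer chokes on pandas MultiIndex columns: tuple labels are joined with underscores and lowercased before the identifier check. A sketch of the resulting signal names, using an illustrative DataFrame and assuming `read_pandas` is exported at the package root like the other `read_*` constructors:

```py
import pandas as pd
import datachain as dc

# MultiIndex labels ("price", "usd") and ("price", "eur") are flattened
# to the signal names "price_usd" and "price_eur".
df = pd.DataFrame(
    [[1.0, 0.9], [2.0, 1.8]],
    columns=pd.MultiIndex.from_tuples([("price", "usd"), ("price", "eur")]),
)
dc.read_pandas(df).show()
```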
datachain/lib/dc/parquet.py

@@ -1,8 +1,5 @@
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Optional,
-)
+import os
+from typing import TYPE_CHECKING, Any

 from datachain.lib.data_model import DataType
 from datachain.query import Session
@@ -16,28 +13,34 @@ if TYPE_CHECKING:


 def read_parquet(
-    path,
+    path: str | os.PathLike[str] | list[str] | list[os.PathLike[str]],
     partitioning: Any = "hive",
-    output: Optional[dict[str, DataType]] = None,
-    object_name: str = "",
+    output: dict[str, DataType] | None = None,
+    column: str = "",
     model_name: str = "",
     source: bool = True,
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    session: Session | None = None,
+    settings: dict | None = None,
     **kwargs,
 ) -> "DataChain":
     """Generate chain from parquet files.

     Parameters:
-        path : Storage URI with directory. URI must start with storage prefix such
-            as `s3://`, `gs://`, `az://` or "file:///".
-        partitioning : Any pyarrow partitioning schema.
-        output : Dictionary defining column names and their corresponding types.
-        object_name : Created object column name.
-        model_name : Generated model name.
-        source : Whether to include info about the source file.
-        session : Session to use for the chain.
-        settings : Settings to use for the chain.
+        path: Storage path(s) or URI(s). Can be a local path or start with a
+            storage prefix like `s3://`, `gs://`, `az://`, `hf://` or "file:///".
+            Supports glob patterns:
+                - `*` : wildcard
+                - `**` : recursive wildcard
+                - `?` : single character
+                - `{a,b}` : brace expansion list
+                - `{1..9}` : brace numeric or alphabetic range
+        partitioning: Any pyarrow partitioning schema.
+        output: Dictionary defining column names and their corresponding types.
+        column: Created column name.
+        model_name: Generated model name.
+        source: Whether to include info about the source file.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.

     Example:
         Reading a single file:
@@ -46,10 +49,19 @@ def read_parquet(
         dc.read_parquet("s3://mybucket/file.parquet")
         ```

-        Reading a partitioned dataset from a directory:
+        All files from a directory:
         ```py
-        import datachain as dc
-        dc.read_parquet("s3://mybucket/dir")
+        dc.read_parquet("s3://mybucket/dir/")
+        ```
+
+        Only parquet files from a directory and all its subdirectories:
+        ```py
+        dc.read_parquet("s3://mybucket/dir/**/*.parquet")
+        ```
+
+        Using filename patterns - numeric, list, starting with zeros:
+        ```py
+        dc.read_parquet("s3://mybucket/202{1..4}/{yellow,green}-{01..12}.parquet")
         ```
     """
     from .storage import read_storage
@@ -57,7 +69,7 @@ def read_parquet(
     chain = read_storage(path, session=session, settings=settings, **kwargs)
     return chain.parse_tabular(
         output=output,
-        object_name=object_name,
+        column=column,
         model_name=model_name,
         source=source,
         format="parquet",
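
`read_parquet` now takes a typed `path` (including lists of paths) with glob and brace-expansion support, and hands `column` instead of `object_name` to `parse_tabular`. A sketch combining the new pattern support with an explicit output schema; the bucket, column, and field names are illustrative:

```py
import datachain as dc

# Recursive glob over a bucket, with parsed columns nested under "data"
# and explicit types instead of inferred ones.
chain = dc.read_parquet(
    "s3://mybucket/dir/**/*.parquet",
    column="data",
    output={"id": int, "label": str},
)
```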
datachain/lib/dc/records.py

@@ -1,15 +1,10 @@
-from typing import (
-    TYPE_CHECKING,
-    Optional,
-    Union,
-)
+from collections.abc import Iterable
+from typing import TYPE_CHECKING

 import sqlalchemy

 from datachain.lib.data_model import DataType
-from datachain.lib.file import (
-    File,
-)
+from datachain.lib.file import File
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session

@@ -20,29 +15,37 @@ if TYPE_CHECKING:

 P = ParamSpec("P")

+READ_RECORDS_BATCH_SIZE = 10000
+

 def read_records(
-    to_insert: Optional[Union[dict, list[dict]]],
-    session: Optional[Session] = None,
-    settings: Optional[dict] = None,
+    to_insert: dict | Iterable[dict] | None,
+    session: Session | None = None,
+    settings: dict | None = None,
     in_memory: bool = False,
-    schema: Optional[dict[str, DataType]] = None,
+    schema: dict[str, DataType] | None = None,
 ) -> "DataChain":
     """Create a DataChain from the provided records. This method can be used for
     programmatically generating a chain in contrast to reading data from storages
     or other sources.

     Parameters:
-        to_insert : records (or a single record) to insert. Each record is
-            a dictionary of signals and theirs values.
-        schema : describes chain signals and their corresponding types
+        to_insert: records (or a single record) to insert. Each record is
+            a dictionary of signals and their values.
+        schema: describes chain signals and their corresponding types

     Example:
         ```py
         import datachain as dc
         single_record = dc.read_records(dc.DEFAULT_FILE_RECORD)
         ```
+
+    Notes:
+        This call blocks until all records are inserted.
     """
+    from datachain.query.dataset import adjust_outputs, get_col_types
+    from datachain.sql.types import SQLType
+
     from .datasets import read_dataset

     session = Session.get(session, in_memory=in_memory)
@@ -56,7 +59,7 @@
         signal_schema = SignalSchema(schema)
         columns = [
             sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
-            for c in signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
+            for c in signal_schema.db_signals(as_columns=True)
         ]
     else:
         columns = [
@@ -66,6 +69,7 @@

     dsr = catalog.create_dataset(
         name,
+        catalog.metastore.default_project,
         columns=columns,
         feature_schema=(
             signal_schema.clone_without_sys_signals().serialize()
@@ -74,8 +78,6 @@
         ),
     )

-    session.add_dataset_version(dsr, dsr.latest_version)
-
     if isinstance(to_insert, dict):
         to_insert = [to_insert]
     elif not to_insert:
@@ -83,8 +85,14 @@

     warehouse = catalog.warehouse
     dr = warehouse.dataset_rows(dsr)
-    db = warehouse.db
-    insert_q = dr.get_table().insert()
-    for record in to_insert:
-        db.execute(insert_q.values(**record))
-    return read_dataset(name=dsr.name, session=session, settings=settings)
+    table = dr.get_table()
+
+    # Optimization: Compute row types once, rather than for every row.
+    col_types = get_col_types(
+        warehouse,
+        {c.name: c.type for c in columns if isinstance(c.type, SQLType)},
+    )
+    records = (adjust_outputs(warehouse, record, col_types) for record in to_insert)
+    warehouse.insert_rows(table, records, batch_size=READ_RECORDS_BATCH_SIZE)
+    warehouse.insert_rows_done(table)
+    return read_dataset(name=dsr.full_name, session=session, settings=settings)
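
The rewritten tail of `read_records` replaces the per-record `INSERT` loop with a single batched path: column types are computed once via `get_col_types`, each record is normalized with `adjust_outputs`, and rows are written through `warehouse.insert_rows` in batches of `READ_RECORDS_BATCH_SIZE` (10,000). A minimal usage sketch; the record fields and schema are illustrative:

```py
import datachain as dc

# A generator works now that `to_insert` accepts any Iterable[dict];
# per the Notes above, the call blocks until every record is inserted.
records = ({"name": f"img_{i}.jpg", "size": i * 1024} for i in range(100_000))
chain = dc.read_records(records, schema={"name": str, "size": int})
```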