datachain 0.1.13__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
Potentially problematic release.
This version of datachain might be problematic.
- datachain/__init__.py +0 -4
- datachain/asyn.py +3 -3
- datachain/catalog/__init__.py +3 -3
- datachain/catalog/catalog.py +6 -6
- datachain/catalog/loader.py +3 -3
- datachain/cli.py +10 -2
- datachain/client/azure.py +37 -1
- datachain/client/fsspec.py +1 -1
- datachain/client/local.py +1 -1
- datachain/data_storage/__init__.py +1 -1
- datachain/data_storage/metastore.py +11 -3
- datachain/data_storage/schema.py +12 -7
- datachain/data_storage/sqlite.py +3 -0
- datachain/data_storage/warehouse.py +31 -30
- datachain/dataset.py +1 -3
- datachain/lib/arrow.py +85 -0
- datachain/lib/cached_stream.py +3 -85
- datachain/lib/dc.py +382 -179
- datachain/lib/feature.py +46 -91
- datachain/lib/feature_registry.py +4 -1
- datachain/lib/feature_utils.py +2 -2
- datachain/lib/file.py +30 -44
- datachain/lib/image.py +9 -2
- datachain/lib/meta_formats.py +66 -34
- datachain/lib/settings.py +5 -5
- datachain/lib/signal_schema.py +103 -105
- datachain/lib/udf.py +10 -38
- datachain/lib/udf_signature.py +11 -6
- datachain/lib/webdataset_laion.py +5 -22
- datachain/listing.py +8 -8
- datachain/node.py +1 -1
- datachain/progress.py +1 -1
- datachain/query/builtins.py +1 -1
- datachain/query/dataset.py +42 -119
- datachain/query/dispatch.py +1 -1
- datachain/query/metrics.py +19 -0
- datachain/query/schema.py +13 -3
- datachain/sql/__init__.py +1 -1
- datachain/sql/sqlite/base.py +34 -2
- datachain/sql/sqlite/vector.py +13 -5
- datachain/utils.py +1 -122
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/METADATA +11 -4
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/RECORD +47 -47
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/WHEEL +1 -1
- datachain/_version.py +0 -16
- datachain/lib/parquet.py +0 -32
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/LICENSE +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
@@ -1,12 +1,21 @@
+import re
 from collections.abc import Iterator, Sequence
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ClassVar,
+    Literal,
+    Optional,
+    Union,
+)
 
 import sqlalchemy
 
 from datachain.lib.feature import Feature, FeatureType
 from datachain.lib.feature_utils import features_to_tuples
 from datachain.lib.file import File, get_file
-from datachain.lib.meta_formats import read_meta
+from datachain.lib.meta_formats import read_meta, read_schema
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import (
@@ -27,8 +36,11 @@ from datachain.query.dataset import (
 from datachain.query.schema import Column, DatasetRow
 
 if TYPE_CHECKING:
+    import pandas as pd
     from typing_extensions import Self
 
+    from datachain.catalog import Catalog
+
 C = Column
 
 
@@ -68,44 +80,43 @@ class DataChain(DatasetQuery):
     The supported set of field types include: majority of the type supported by the
     underlyind library `Pydantic`.
 
-    See Also
-    ...
-    >>> print(chain)
+    See Also:
+        `DataChain.from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+            data files from storages such as S3, gs or Azure ADLS.
+
+        `DataChain.save("name")` - saving to a dataset.
+
+        `DataChain.from_dataset("name")` - reading from a dataset.
+
+        `DataChain.from_features(fib=[1, 2, 3, 5, 8])` - generating from a values.
+
+
+    Example:
+        ```py
+        from datachain import DataChain, Feature
+        from datachain.lib.claude import claude_processor
+
+        class Rating(Feature):
+            status: str = ""
+            explanation: str = ""
+
+        PROMPT = "A 'user' is a human trying to find the best mobile plan.... "
+        MODEL = "claude-3-opus-20240229"
+
+        chain = (
+            DataChain.from_storage("s3://my-bucket/my")
+            .filter(C.name.glob("*.txt"))
+            .limit(5)
+            .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
+            .map(
+                rating=lambda claude: Rating(
+                    **(json.loads(claude.content[0].text) if claude.content else {})
+                ),
+                output=Rating,
+            )
+        chain.save("ratings")
+        print(chain)
+        ```
     """
 
     DEFAULT_FILE_RECORD: ClassVar[dict] = {
@@ -119,8 +130,7 @@ class DataChain(DatasetQuery):
 
     def __init__(self, *args, **kwargs):
         """This method needs to be redefined as a part of Dataset and DacaChin
-        decoupling
-        """
+        decoupling."""
         super().__init__(
             *args,
             **kwargs,
@@ -133,6 +143,16 @@ class DataChain(DatasetQuery):
         else:
             self.signals_schema = SignalSchema.from_column_types(self.column_types)
 
+    @property
+    def schema(self):
+        return self.signals_schema.values if self.signals_schema else None
+
+    def print_schema(self):
+        self.signals_schema.print_tree()
+
+    def create_model(self, name: str) -> type[Feature]:
+        return self.signals_schema.create_model(name)
+
     def settings(
         self, cache=None, batch=None, parallel=None, workers=None, min_task_size=None
     ) -> "Self":
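The new `schema` property, `print_schema()`, and `create_model()` helpers expose a chain's signal schema directly. A minimal usage sketch, assuming a previously saved dataset; the dataset name "ratings" is illustrative:

```py
from datachain.lib.dc import DataChain

chain = DataChain.from_dataset("ratings")  # hypothetical dataset name
print(chain.schema)           # mapping of signal names to types, or None
chain.print_schema()          # pretty-print the signal tree
Rating = chain.create_model("Rating")  # dynamically generated Feature subclass
```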
@@ -141,29 +161,28 @@ class DataChain(DatasetQuery):
         This function changes specified settings without changing not specified ones.
         It returns chain, so, it can be chained later with next operation.
 
-        Parameters
-        ...
-        >>> )
+        Parameters:
+            cache : data caching (default=False)
+            batch : size of the batch (default=1000)
+            parallel : number of thread for processors. True is a special value to
+                enable all available CPUs (default=1)
+            workers : number of distributed workers. Only for Studio mode. (default=1)
+            min_task_size : minimum number of tasks (default=1)
+
+        Example:
+            ```py
+            chain = (
+                chain
+                .settings(cache=True, parallel=8)
+                .map(laion=process_webdataset(spec=WDSLaion), params="file")
+            )
+            ```
         """
         self._settings.add(Settings(cache, batch, parallel, workers, min_task_size))
         return self
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
-        """Reset all settings to default values"""
+        """Reset all settings to default values."""
         self._settings = settings if settings else Settings()
         return self
 
@@ -183,40 +202,40 @@ class DataChain(DatasetQuery):
     def from_storage(
         cls,
         path,
+        *,
         type: Literal["binary", "text", "image"] = "binary",
+        catalog: Optional["Catalog"] = None,
+        recursive: Optional[bool] = True,
         anon: bool = False,
-    ) -> "
-        """Get data from a storage as a list of file with all file attributes.
-
-
-        Parameters
-        ----------
-        path : storage URI with directory. URI must start with storage prefix such
-            as `s3://`, `gs://`, `az://` or "file:///"
-        type : read file as "binary", "text", or "image" data. Default is "binary".
-        anon : use anonymous mode to access the storage.
+    ) -> "Self":
+        """Get data from a storage as a list of file with all file attributes. It
+        returns the chain itself as usual.
 
-        ...
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            recursive : search recursively for the given path.
+            anon : use anonymous mode to access the storage.
 
-
+        Example:
+            ```py
+            chain = DataChain.from_storage("s3://my-bucket/my-dir")
+            ```
         """
         func = get_file(type)
-        return
+        return cls(path, catalog=catalog, recursive=recursive, anon=anon).map(file=func)
 
     @classmethod
     def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
         """Get data from dataset. It returns the chain itself.
 
-        Parameters
-        ...
-        version : dataset version
-
-        Examples
-        --------
+        Parameters:
+            name : dataset name
+            version : dataset version
 
-
+        Examples:
+            >>> chain = DataChain.from_dataset("my_cats")
         """
         return DataChain(name=name, version=version)
 
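With this change everything after `path` in `from_storage()` is keyword-only, and listing can be limited with `recursive` or routed through an explicit catalog. A short sketch of the new call shape; the bucket path is illustrative:

```py
from datachain.lib.dc import DataChain

# list only the top level of the bucket, read files as text, access anonymously
chain = DataChain.from_storage(
    "s3://my-bucket/my-dir/",  # hypothetical bucket
    type="text",
    recursive=False,
    anon=True,
)
```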
@@ -228,37 +247,44 @@ class DataChain(DatasetQuery):
         anon: bool = False,
         spec: Optional[FeatureType] = None,
         schema_from: Optional[str] = "auto",
+        object_name: Optional[str] = "csv",
+        model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
     ) -> "DataChain":
         """Get data from CSV. It returns the chain itself.
 
-        Parameters
-        ...
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "text".
+            anon : use anonymous mode to access the storage.
+            spec : Data Model for CSV file
+            object_name : generated object column name
+            model_name : generated model name
+            schema_from : path to sample to infer spec from
+            show_schema : print auto-generated schema
 
-        Examples
-
+        Examples:
+            infer model from the first two lines (header + data)
+            >>> chain = DataChain.from_csv("gs://csv")
 
-
+            use a particular data model
+            >>> chain = DataChain.from_csv("gs://csv"i, spec=MyModel)
         """
         if schema_from == "auto":
             schema_from = path
 
         chain = DataChain.from_storage(path=path, type=type, anon=anon)
-        ...
+        signal_dict = {
+            object_name: read_meta(
                 schema_from=schema_from,
                 meta_type="csv",
                 spec=spec,
+                model_name=model_name,
                 show_schema=show_schema,
             )
-
+        }
+        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
 
     @classmethod
     def from_json(
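`from_csv()` now accepts `object_name` and `model_name`, so the generated signal column and data model no longer have to be called `csv`. A hedged sketch; the GCS path and the chosen names are illustrative:

```py
from datachain.lib.dc import DataChain

chain = DataChain.from_csv(
    "gs://my-bucket/table.csv",  # hypothetical path
    object_name="row",           # signal column that will hold the parsed records
    model_name="TableRow",       # name given to the auto-generated data model
    show_schema=True,            # print the inferred schema while building the chain
)
```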
@@ -269,50 +295,104 @@ class DataChain(DatasetQuery):
         spec: Optional[FeatureType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
+        object_name: Optional[str] = None,
+        model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
+        meta_type: Optional[str] = "json",
     ) -> "DataChain":
-        """Get data from
+        """Get data from JSON. It returns the chain itself.
+
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            anon : use anonymous mode to access the storage.
+            spec : optional Data Model
+            schema_from : path to sample to infer spec from
+            object_name : generated object column name
+            model_name : generated model name
+            show_schema : print auto-generated schema
+            jmespath : JMESPATH expression to reduce JSON
+
+        Examples:
+            infer JSON schema from data, reduce using JMESPATH, print schema
+            >>> chain = DataChain.from_json("gs://json", jmespath="key1.key2")
 
-        ...
-        path : storage URI with directory. URI must start with storage prefix such
-            as `s3://`, `gs://`, `az://` or "file:///"
-        type : read file as "binary", "text", or "image" data. Default is "binary".
-        anon : use anonymous mode to access the storage.
-        spec : optional Data Model
-        schema_from : path to sample to infer spec from
-        show_schema : print auto-generated schema
-        jmespath : JMESPATH expression to reduce JSON
-        name : return object name
-        Examples
-        --------
-
-        >>> chain = DataChain.from_json("gs://json")
+            infer JSON schema from a particular path, print data model
+            >>> chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
         """
         if schema_from == "auto":
             schema_from = path
 
+        def jmespath_to_name(s: str):
+            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+            return s[:name_end]
+
+        if (not object_name) and jmespath:
+            object_name = jmespath_to_name(jmespath)
+        if not object_name:
+            object_name = "json"
         chain = DataChain.from_storage(path=path, type=type, anon=anon)
-        ...
+        signal_dict = {
+            object_name: read_meta(
                 schema_from=schema_from,
-                meta_type=
+                meta_type=meta_type,
                 spec=spec,
+                model_name=model_name,
                 show_schema=show_schema,
                 jmespath=jmespath,
             )
+        }
+        return chain.gen(**signal_dict)  # type: ignore[arg-type]
+
+    def show_json_schema(  # type: ignore[override]
+        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+    ) -> "DataChain":
+        """Print JSON data model and save it. It returns the chain itself.
+
+        Parameters:
+            jmespath : JMESPATH expression to reduce JSON
+            model_name : generated model name
+
+        Examples:
+            print JSON schema and save to column "meta_from":
+            >>> uri = "gs://datachain-demo/coco2017/annotations_captions/"
+            >>> chain = DataChain.from_storage(uri)
+            >>> chain = chain.show_json_schema()
+            >>> chain.save()
+        """
+        return self.map(
+            meta_schema=lambda file: read_schema(
+                file, data_type="json", expr=jmespath, model_name=model_name
+            ),
+            output=str,
+        )
+
+    def show_jsonl_schema(  # type: ignore[override]
+        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+    ) -> "DataChain":
+        """Print JSON data model and save it. It returns the chain itself.
+
+        Parameters:
+            jmespath : JMESPATH expression to reduce JSON
+            model_name : generated model name
+        """
+        return self.map(
+            meta_schema=lambda file: read_schema(
+                file, data_type="jsonl", expr=jmespath, model_name=model_name
+            ),
+            output=str,
         )
 
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None
     ) -> "DataChain":
-        """Save to a Dataset. It returns the chain itself
+        """Save to a Dataset. It returns the chain itself.
 
-        Parameters
-        ...
-        version : version of a dataset. Default - the last version that exist.
+        Parameters:
+            name : dataset name. Empty name saves to a temporary dataset that will be
+                removed after process ends. Temp dataset are useful for optimization.
+            version : version of a dataset. Default - the last version that exist.
         """
         schema = self.signals_schema.serialize()
         return super().save(name=name, version=version, feature_schema=schema)
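When `object_name` is omitted, `from_json()` now derives it from the leading identifier of the `jmespath` expression (falling back to `"json"`), and the new `show_json_schema()` helper prints and stores the generated data model. A sketch; the bucket paths are illustrative:

```py
from datachain.lib.dc import DataChain

# the object column will be named "annotations", taken from the JMESPATH expression
chain = DataChain.from_json("gs://my-bucket/meta/", jmespath="annotations[*]")

# inspect the auto-generated model for the same files, then persist the chain
schema_chain = DataChain.from_storage("gs://my-bucket/meta/").show_json_schema()
schema_chain.save("json_schema")
```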
@@ -333,29 +413,26 @@ class DataChain(DatasetQuery):
         Input-output relationship: 1:1
 
         Parameters:
+            func : Function applied to each row.
+            params : List of column names used as input for the function. Default
+                is taken from function signature.
+            output : Dictionary defining new signals and their corresponding types.
+                Default type is taken from function signature. Default can be also
+                taken from kwargs - **signal_map (see below).
+                If signal name is defined using signal_map (see below) only a single
+                type value can be used.
+            **signal_map : kwargs can be used to define `func` together with it's return
+                signal name in format of `map(my_sign=my_func)`. This helps define
+                signal names and function in a nicer way.
+
+        Examples:
+            Using signal_map and single type in output:
+            >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
+            >>> chain.save("new_dataset")
 
-        ...
-        output : Dictionary defining new signals and their corresponding types. Default
-            type is taken from function signature. Default can be also taken from
-            kwargs - **signal_map (see below).
-            If signal name is defined using signal_map (see below) only a single
-            type value can be used.
-        **signal_map : kwargs can be used to define `func` together with it's return
-            signal name in format of `map(my_sign=my_func)`. This helps define
-            signal names and function in a nicer way.
-
-        Examples
-        --------
-
-        Using signal_map and single type in output:
-        >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
-        >>> chain.save("new_dataset")
-
-        Using func and output as a map:
-        >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
-        >>> chain.save("new_dataset")
+            Using func and output as a map:
+            >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
+            >>> chain.save("new_dataset")
         """
 
         udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
@@ -375,9 +452,8 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
         **signal_map,
     ) -> "Self":
-        """
-
-        The function needs to return a new objects for each of the new rows.
+        """Apply a function to each row to create new rows (with potentially new
+        signals). The function needs to return a new objects for each of the new rows.
         It returns a chain itself with new signals.
 
         Input-output relationship: 1:N
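Taken together, the docstrings above describe a simple round trip: derive a signal with `map()`, persist it with `save()`, and reload it with `from_dataset()`. A condensed sketch built only from the documented calls; the bucket and dataset names are illustrative:

```py
from datachain.lib.dc import C, DataChain

chain = (
    DataChain.from_storage("s3://my-bucket/my-dir/")           # hypothetical bucket
    .filter(C.name.glob("*.txt"))
    .map(value=lambda name: name[:-4] + ".json", output=str)   # new "value" signal
)
chain.save("new_dataset")

reloaded = DataChain.from_dataset("new_dataset")
reloaded.print_schema()
```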
@@ -435,7 +511,9 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
         **signal_map,
     ) -> "Self":
-        """This is a batch version of map().
+        """This is a batch version of map().
+
+        It accepts the same parameters plus an
         additional parameter:
         """
         udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
@@ -455,7 +533,7 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]],
         signal_map,
     ) -> UDFBase:
-        is_generator =
+        is_generator = target_class.is_output_batched
         name = self.name or "Unknown"
         sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
 
@@ -476,7 +554,7 @@ class DataChain(DatasetQuery):
 
     @detach
     def select(self, *args: str) -> "Self":
-        """Select only a specified set of signals"""
+        """Select only a specified set of signals."""
         new_schema = self.signals_schema.resolve(*args)
         columns = new_schema.db_signals()
         chain = super().select(*columns)
@@ -485,7 +563,7 @@ class DataChain(DatasetQuery):
 
     @detach
     def select_except(self, *args: str) -> "Self":
-        """Select all the signals expect the specified signals"""
+        """Select all the signals expect the specified signals."""
         new_schema = self.signals_schema.select_except_signals(*args)
         columns = new_schema.db_signals()
         chain = super().select(*columns)
@@ -494,6 +572,7 @@ class DataChain(DatasetQuery):
 
     def get_values(self, *cols: str) -> Iterator[list]:
         """Iterate over rows, getting feature values and applying reader calls.
+
         If columns are specified - limit them to specified columns.
         """
         for features in self.iterate(*cols):
@@ -504,7 +583,9 @@ class DataChain(DatasetQuery):
             yield item[0]
 
     def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
-        """Iterate over rows.
+        """Iterate over rows.
+
+        If columns are specified - limit them to specified
         columns.
         """
         chain = self.select(*cols) if cols else self
@@ -563,20 +644,19 @@ class DataChain(DatasetQuery):
     ) -> "Self":
         """Merge two chains based on the specified criteria.
 
-        Parameters
-        ...
+        Parameters:
+            right_ds : Chain to join with.
+            on : Predicate or list of Predicates to join on. If both chains have the
+                same predicates then this predicate is enough for the join. Otherwise,
+                `right_on` parameter has to specify the predicates for the other chain.
+            right_on: Optional predicate or list of Predicates
+                for the `right_ds` to join.
+            inner (bool): Whether to run inner join or outer join.
+            rname (str): name prefix for conflicting signal names.
 
         Examples:
-            ...
+            >>> meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
+                                      right_on=(C.name, C.pq__index))
         """
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")
@@ -599,7 +679,7 @@ class DataChain(DatasetQuery):
             raise DatasetMergeError(
                 on,
                 right_on,
-                ...
+                "'right_on' must be 'str' or 'Sequence' object"
                 f" but got type '{right_on}'",
             )
 
@@ -616,7 +696,7 @@ class DataChain(DatasetQuery):
             raise DatasetMergeError(
                 on,
                 right_on,
-                ...
+                "'on' and 'right_on' must have the same number of columns in db'."
                 f" on -> {on_str}, right_on -> {right_on_str}",
             )
         else:
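The rewritten `merge()` docstring spells out `right_on`, `inner`, and `rname`. A hedged sketch of an inner join between two small in-memory chains; the signal names and values are made up, and `from_features()` is used here only to build example inputs:

```py
from datachain.lib.dc import C, DataChain

left = DataChain.from_features(key=["a", "b", "c"], left_val=[1, 2, 3])
right = DataChain.from_features(key=["a", "b", "x"], right_val=[10, 20, 30])

joined = left.merge(
    right,
    on=C.key,        # same predicate exists on both sides, so right_on is not needed
    inner=True,      # drop rows without a match instead of keeping them
    rname="right_",  # prefix applied to conflicting signal names from `right`
)
```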
@@ -654,7 +734,7 @@ class DataChain(DatasetQuery):
 
     @classmethod
     def from_pandas(  # type: ignore[override]
-        cls, df, name: str = "", session: Optional[Session] = None
+        cls, df: "pd.DataFrame", name: str = "", session: Optional[Session] = None
     ) -> "DataChain":
         """Generate chain from pandas data-frame."""
         fr_map = {col.lower(): df[col].tolist() for col in df.columns}
@@ -664,7 +744,7 @@ class DataChain(DatasetQuery):
             raise DatasetPrepareError(
                 name,
                 f"import from pandas error - column '{column}' conflicts with"
-                ...
+                " default schema",
             )
         if not column.isidentifier():
             raise DatasetPrepareError(
@@ -674,6 +754,131 @@ class DataChain(DatasetQuery):
 
         return cls.from_features(name, session, **fr_map)
 
+    def parse_tabular(
+        self,
+        output: Optional[dict[str, FeatureType]] = None,
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain from list of tabular files.
+
+        Parameters:
+            output : Dictionary defining column names and their corresponding types.
+            kwargs : Parameters to pass to pyarrow.dataset.dataset.
+
+        Examples:
+            Reading a json lines file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.jsonl")
+            >>> dc = dc.parse_tabular(format="json")
+
+            Reading a filtered list of files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.jsonl"))
+            >>> dc = dc.parse_tabular(format="json")
+        """
+        from pyarrow import unify_schemas
+        from pyarrow.dataset import dataset
+
+        from datachain.lib.arrow import ArrowGenerator, Source, schema_to_output
+
+        schema = None
+        if output:
+            output = {"source": Source} | output
+        else:
+            schemas = []
+            for row in self.select("file").iterate():
+                file = row[0]
+                ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+                schemas.append(ds.schema)
+            if not schemas:
+                msg = "error parsing tabular data schema - found no files to parse"
+                raise DatasetPrepareError(self.name, msg)
+            schema = unify_schemas(schemas)
+            try:
+                output = schema_to_output(schema)
+                print(f"Inferred tabular data schema: {output}")
+            except ValueError as e:
+                raise DatasetPrepareError(self.name, e) from e
+
+        return self.gen(ArrowGenerator(schema, **kwargs), output=output)
+
+    def parse_csv(
+        self,
+        delimiter: str = ",",
+        header: bool = True,
+        column_names: Optional[list[str]] = None,
+        output: Optional[dict[str, FeatureType]] = None,
+    ) -> "DataChain":
+        """Generate chain from list of csv files.
+
+        Parameters:
+            delimiter : Character for delimiting columns.
+            header : Whether the files include a header row.
+            column_names : Column names if no header. Implies `header = False`.
+            output : Dictionary defining column names and their corresponding types.
+
+        Examples:
+            Reading a csv file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.csv")
+            >>> dc = dc.parse_tabular(format="csv")
+
+            Reading a filtered list of csv files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.csv"))
+            >>> dc = dc.parse_tabular()
+        """
+        from pyarrow.csv import ParseOptions, ReadOptions
+        from pyarrow.dataset import CsvFileFormat
+
+        if column_names and output:
+            msg = "error parsing csv - only one of column_names or output is allowed"
+            raise DatasetPrepareError(self.name, msg)
+
+        if not header and not column_names:
+            if output:
+                column_names = list(output.keys())
+            else:
+                msg = "error parsing csv - provide column_names or output if no header"
+                raise DatasetPrepareError(self.name, msg)
+
+        parse_options = ParseOptions(delimiter=delimiter)
+        read_options = ReadOptions(column_names=column_names)
+        format = CsvFileFormat(parse_options=parse_options, read_options=read_options)
+        return self.parse_tabular(output=output, format=format)
+
+    def parse_parquet(
+        self,
+        partitioning: Any = "hive",
+        output: Optional[dict[str, FeatureType]] = None,
+    ) -> "DataChain":
+        """Generate chain from list of parquet files.
+
+        Parameters:
+            partitioning : Any pyarrow partitioning schema.
+            output : Dictionary defining column names and their corresponding types.
+
+        Examples:
+            Reading a single file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.parquet")
+            >>> dc = dc.parse_tabular()
+
+            Reading a partitioned dataset from a directory:
+            >>> dc = DataChain.from_storage("path/to/dir")
+            >>> dc = dc.parse_tabular()
+
+            Reading a filtered list of files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.parquet"))
+            >>> dc = dc.parse_tabular()
+
+            Reading a filtered list of partitions as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.parent").glob("*month=1*"))
+            >>> dc = dc.parse_tabular()
+        """
+        return self.parse_tabular(
+            output=output, format="parquet", partitioning=partitioning
+        )
+
     @classmethod
     def create_empty(
         cls,
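`parse_csv()` forwards pyarrow `ParseOptions`/`ReadOptions` into `parse_tabular()`, so headerless files can be read by supplying `column_names` or `output` (the column names are then taken from the `output` keys). A sketch; the bucket, file pattern, and column names are illustrative:

```py
from datachain.lib.dc import C, DataChain

dc = (
    DataChain.from_storage("s3://my-bucket/exports/")  # hypothetical bucket
    .filter(C("file.name").glob("*.tsv"))
    .parse_csv(
        delimiter="\t",
        header=False,
        output={"user_id": int, "score": float},  # implies column_names from the keys
    )
)
```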
@@ -683,17 +888,13 @@ class DataChain(DatasetQuery):
         """Create empty chain. Returns a chain. This method is used for programmatically
         generating a chains in contrast of reading data from storages or other sources.
 
-        Parameters
-        ...
-        to_insert : records (or a single record) to insert. Each record is a dictionary
-            of signals and theirs values.
-
-        Examples
-        --------
+        Parameters:
+            to_insert : records (or a single record) to insert. Each record is
+                a dictionary of signals and theirs values.
 
-        ...
+        Examples:
+            >>> empty = DataChain.create_empty()
+            >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
         """
         session = Session.get(session)
         dsr = cls.create_empty_record(session=session)
@@ -740,10 +941,12 @@ class DataChain(DatasetQuery):
     @detach
     def chunk(self, index: int, total: int) -> "DataChain":
         """Split a query into smaller chunks for e.g. parallelization.
-
+
+        Examples:
         >>> dc = DataChain(...)
         >>> chunk_1 = dc._chunk(0, 2)
        >>> chunk_2 = dc._chunk(1, 2)
+
         Note:
             Bear in mind that `index` is 0-indexed but `total` isn't.
             Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.