datachain 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of datachain might be problematic.
- datachain/_version.py +2 -2
- datachain/asyn.py +3 -3
- datachain/catalog/__init__.py +3 -3
- datachain/catalog/catalog.py +6 -6
- datachain/catalog/loader.py +3 -3
- datachain/cli.py +2 -1
- datachain/client/azure.py +37 -1
- datachain/client/fsspec.py +1 -1
- datachain/client/local.py +1 -1
- datachain/data_storage/__init__.py +1 -1
- datachain/data_storage/metastore.py +11 -3
- datachain/data_storage/schema.py +2 -3
- datachain/data_storage/warehouse.py +31 -30
- datachain/dataset.py +1 -3
- datachain/lib/arrow.py +85 -0
- datachain/lib/dc.py +377 -178
- datachain/lib/feature.py +41 -90
- datachain/lib/feature_registry.py +3 -1
- datachain/lib/feature_utils.py +2 -2
- datachain/lib/file.py +20 -20
- datachain/lib/image.py +9 -2
- datachain/lib/meta_formats.py +66 -34
- datachain/lib/settings.py +5 -5
- datachain/lib/signal_schema.py +103 -105
- datachain/lib/udf.py +3 -12
- datachain/lib/udf_signature.py +11 -6
- datachain/lib/webdataset_laion.py +5 -22
- datachain/listing.py +8 -8
- datachain/node.py +1 -1
- datachain/progress.py +1 -1
- datachain/query/builtins.py +1 -1
- datachain/query/dataset.py +39 -110
- datachain/query/dispatch.py +1 -1
- datachain/query/metrics.py +19 -0
- datachain/query/schema.py +13 -3
- datachain/sql/__init__.py +1 -1
- datachain/utils.py +1 -122
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
- datachain/lib/parquet.py +0 -32
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
@@ -1,12 +1,21 @@
+import re
 from collections.abc import Iterator, Sequence
-from typing import
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ClassVar,
+    Literal,
+    Optional,
+    Union,
+)

 import sqlalchemy

 from datachain.lib.feature import Feature, FeatureType
 from datachain.lib.feature_utils import features_to_tuples
 from datachain.lib.file import File, get_file
-from datachain.lib.meta_formats import read_meta
+from datachain.lib.meta_formats import read_meta, read_schema
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import (
@@ -27,6 +36,7 @@ from datachain.query.dataset import (
 from datachain.query.schema import Column, DatasetRow

 if TYPE_CHECKING:
+    import pandas as pd
     from typing_extensions import Self

 C = Column
@@ -68,44 +78,43 @@ class DataChain(DatasetQuery):
     The supported set of field types include: majority of the type supported by the
     underlyind library `Pydantic`.

-    See Also
-    ...
-    >>> print(chain)
+    See Also:
+        `DataChain.from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+            data files from storages such as S3, gs or Azure ADLS.
+
+        `DataChain.save("name")` - saving to a dataset.
+
+        `DataChain.from_dataset("name")` - reading from a dataset.
+
+        `DataChain.from_features(fib=[1, 2, 3, 5, 8])` - generating from a values.
+
+
+    Example:
+        ```py
+        from datachain import DataChain, Feature
+        from datachain.lib.claude import claude_processor
+
+        class Rating(Feature):
+            status: str = ""
+            explanation: str = ""
+
+        PROMPT = "A 'user' is a human trying to find the best mobile plan.... "
+        MODEL = "claude-3-opus-20240229"
+
+        chain = (
+            DataChain.from_storage("s3://my-bucket/my")
+            .filter(C.name.glob("*.txt"))
+            .limit(5)
+            .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
+            .map(
+                rating=lambda claude: Rating(
+                    **(json.loads(claude.content[0].text) if claude.content else {})
+                ),
+                output=Rating,
+            )
+        chain.save("ratings")
+        print(chain)
+        ```
     """

     DEFAULT_FILE_RECORD: ClassVar[dict] = {
@@ -119,8 +128,7 @@ class DataChain(DatasetQuery):

     def __init__(self, *args, **kwargs):
         """This method needs to be redefined as a part of Dataset and DacaChin
-        decoupling
-        """
+        decoupling."""
         super().__init__(
             *args,
             **kwargs,
@@ -133,6 +141,16 @@ class DataChain(DatasetQuery):
         else:
             self.signals_schema = SignalSchema.from_column_types(self.column_types)

+    @property
+    def schema(self):
+        return self.signals_schema.values if self.signals_schema else None
+
+    def print_schema(self):
+        self.signals_schema.print_tree()
+
+    def create_model(self, name: str) -> type[Feature]:
+        return self.signals_schema.create_model(name)
+
     def settings(
         self, cache=None, batch=None, parallel=None, workers=None, min_task_size=None
     ) -> "Self":
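The three helpers added in this hunk give a quick way to inspect the signals a chain carries. A minimal sketch of how they might be used in 0.2.0 (the bucket path and model name are illustrative, not from the diff):

```py
from datachain import DataChain

chain = DataChain.from_storage("s3://my-bucket/my-dir/")  # illustrative URI

print(chain.schema)    # mapping of signal names to Feature types, or None
chain.print_schema()   # tree view of the same schema

# Build a Feature model class from the current schema; the name is arbitrary here.
Row = chain.create_model("Row")
```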
@@ -141,29 +159,28 @@ class DataChain(DatasetQuery):
         This function changes specified settings without changing not specified ones.
         It returns chain, so, it can be chained later with next operation.

-        Parameters
-        ...
-        >>> )
+        Parameters:
+            cache : data caching (default=False)
+            batch : size of the batch (default=1000)
+            parallel : number of thread for processors. True is a special value to
+                enable all available CPUs (default=1)
+            workers : number of distributed workers. Only for Studio mode. (default=1)
+            min_task_size : minimum number of tasks (default=1)
+
+        Example:
+            ```py
+            chain = (
+                chain
+                .settings(cache=True, parallel=8)
+                .map(laion=process_webdataset(spec=WDSLaion), params="file")
+            )
+            ```
         """
         self._settings.add(Settings(cache, batch, parallel, workers, min_task_size))
         return self

     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
-        """Reset all settings to default values"""
+        """Reset all settings to default values."""
         self._settings = settings if settings else Settings()
         return self

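Based on the parameters documented above, a short sketch of tuning execution settings and then reverting them (the storage URI is illustrative):

```py
from datachain import DataChain

chain = DataChain.from_storage("gs://my-bucket/images/")  # illustrative URI

# Cache downloaded objects locally and run UDFs on 4 parallel workers.
chain = chain.settings(cache=True, parallel=4)

# Drop back to the defaults listed in the docstring above.
chain = chain.reset_settings()
```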
@@ -184,39 +201,37 @@ class DataChain(DatasetQuery):
         cls,
         path,
         type: Literal["binary", "text", "image"] = "binary",
+        recursive: Optional[bool] = True,
         anon: bool = False,
     ) -> "DataChain":
-        """Get data from a storage as a list of file with all file attributes.
-        ...
+        """Get data from a storage as a list of file with all file attributes. It
+        returns the chain itself as usual.

-        Parameters
-        ...
-        Examples
-        --------
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            recursive : search recursively for the given path.
+            anon : use anonymous mode to access the storage.

-        ...
+        Example:
+            ```py
+            chain = DataChain.from_storage("s3://my-bucket/my-dir")
+            ```
         """
         func = get_file(type)
-        return DataChain(path, anon=anon).map(file=func)
+        return DataChain(path, recursive=recursive, anon=anon).map(file=func)

     @classmethod
     def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
         """Get data from dataset. It returns the chain itself.

-        Parameters
-        ...
-        version : dataset version
-        ...
-        Examples
-        --------
+        Parameters:
+            name : dataset name
+            version : dataset version

-        ...
+        Examples:
+            >>> chain = DataChain.from_dataset("my_cats")
         """
         return DataChain(name=name, version=version)

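The new `recursive` flag and the existing `anon` flag combine as sketched below, assuming a publicly readable bucket (all names are illustrative):

```py
from datachain import DataChain

# List only the top level of a public bucket without credentials.
chain = (
    DataChain.from_storage(
        "s3://my-bucket/my-dir/",  # illustrative URI
        type="text",
        recursive=False,
        anon=True,
    )
    .save("my_listing")
)

# Re-open the saved dataset later by name.
chain = DataChain.from_dataset("my_listing")
```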
@@ -228,37 +243,44 @@ class DataChain(DatasetQuery):
         anon: bool = False,
         spec: Optional[FeatureType] = None,
         schema_from: Optional[str] = "auto",
+        object_name: Optional[str] = "csv",
+        model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
     ) -> "DataChain":
         """Get data from CSV. It returns the chain itself.

-        Parameters
-        ...
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "text".
+            anon : use anonymous mode to access the storage.
+            spec : Data Model for CSV file
+            object_name : generated object column name
+            model_name : generated model name
+            schema_from : path to sample to infer spec from
+            show_schema : print auto-generated schema

-        Examples
-        ...
+        Examples:
+            infer model from the first two lines (header + data)
+            >>> chain = DataChain.from_csv("gs://csv")

-        ...
+            use a particular data model
+            >>> chain = DataChain.from_csv("gs://csv"i, spec=MyModel)
         """
         if schema_from == "auto":
             schema_from = path

         chain = DataChain.from_storage(path=path, type=type, anon=anon)
-        ...
+        signal_dict = {
+            object_name: read_meta(
                 schema_from=schema_from,
                 meta_type="csv",
                 spec=spec,
+                model_name=model_name,
                 show_schema=show_schema,
             )
-        ...
+        }
+        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]

     @classmethod
     def from_json(
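A sketch of the new `object_name` and `model_name` parameters on `from_csv`; `PlanRecord` and the bucket are hypothetical, and without `spec` the model is inferred from the header plus the first data line as described above:

```py
from datachain import DataChain, Feature

class PlanRecord(Feature):  # hypothetical data model
    name: str = ""
    price: float = 0.0

chain = DataChain.from_csv(
    "gs://my-bucket/plans/",  # illustrative URI
    spec=PlanRecord,
    object_name="plan",       # signals are grouped under "plan" instead of "csv"
    model_name="PlanRecord",
    show_schema=True,
)
```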
@@ -269,50 +291,104 @@ class DataChain(DatasetQuery):
         spec: Optional[FeatureType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
+        object_name: Optional[str] = None,
+        model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
+        meta_type: Optional[str] = "json",
     ) -> "DataChain":
-        """Get data from
+        """Get data from JSON. It returns the chain itself.

-        Parameters
-        ...
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            anon : use anonymous mode to access the storage.
+            spec : optional Data Model
+            schema_from : path to sample to infer spec from
+            object_name : generated object column name
+            model_name : generated model name
+            show_schema : print auto-generated schema
+            jmespath : JMESPATH expression to reduce JSON
+
+        Examples:
+            infer JSON schema from data, reduce using JMESPATH, print schema
+            >>> chain = DataChain.from_json("gs://json", jmespath="key1.key2")
+
+            infer JSON schema from a particular path, print data model
+            >>> chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
         """
         if schema_from == "auto":
             schema_from = path

+        def jmespath_to_name(s: str):
+            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+            return s[:name_end]
+
+        if (not object_name) and jmespath:
+            object_name = jmespath_to_name(jmespath)
+        if not object_name:
+            object_name = "json"
         chain = DataChain.from_storage(path=path, type=type, anon=anon)
-        ...
+        signal_dict = {
+            object_name: read_meta(
                 schema_from=schema_from,
-                meta_type=
+                meta_type=meta_type,
                 spec=spec,
+                model_name=model_name,
                 show_schema=show_schema,
                 jmespath=jmespath,
             )
+        }
+        return chain.gen(**signal_dict)  # type: ignore[arg-type]
+
+    def show_json_schema(  # type: ignore[override]
+        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+    ) -> "DataChain":
+        """Print JSON data model and save it. It returns the chain itself.
+
+        Parameters:
+            jmespath : JMESPATH expression to reduce JSON
+            model_name : generated model name
+
+        Examples:
+            print JSON schema and save to column "meta_from":
+            >>> uri = "gs://datachain-demo/coco2017/annotations_captions/"
+            >>> chain = DataChain.from_storage(uri)
+            >>> chain = chain.show_json_schema()
+            >>> chain.save()
+        """
+        return self.map(
+            meta_schema=lambda file: read_schema(
+                file, data_type="json", expr=jmespath, model_name=model_name
+            ),
+            output=str,
+        )
+
+    def show_jsonl_schema(  # type: ignore[override]
+        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+    ) -> "DataChain":
+        """Print JSON data model and save it. It returns the chain itself.
+
+        Parameters:
+            jmespath : JMESPATH expression to reduce JSON
+            model_name : generated model name
+        """
+        return self.map(
+            meta_schema=lambda file: read_schema(
+                file, data_type="jsonl", expr=jmespath, model_name=model_name
+            ),
+            output=str,
         )

     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None
     ) -> "DataChain":
-        """Save to a Dataset. It returns the chain itself
+        """Save to a Dataset. It returns the chain itself.

-        Parameters
-        ...
-        version : version of a dataset. Default - the last version that exist.
+        Parameters:
+            name : dataset name. Empty name saves to a temporary dataset that will be
+                removed after process ends. Temp dataset are useful for optimization.
+            version : version of a dataset. Default - the last version that exist.
         """
         schema = self.signals_schema.serialize()
         return super().save(name=name, version=version, feature_schema=schema)
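Putting the new `from_json` options together with `show_json_schema` might look like this; the JMESPath expression is illustrative and the URI reuses the demo bucket from the docstring above:

```py
from datachain import DataChain

uri = "gs://datachain-demo/coco2017/annotations_captions/"

# Reduce each JSON file with a JMESPath expression and infer a model from it;
# the generated signals are named after the expression ("annotations.*").
chain = DataChain.from_json(uri, jmespath="annotations", show_schema=True)

# Only print and store the generated data model, without parsing the data.
schemas = DataChain.from_storage(uri).show_json_schema(jmespath="annotations")
schemas.save("caption_schemas")  # illustrative dataset name
```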
@@ -333,29 +409,26 @@ class DataChain(DatasetQuery):
         Input-output relationship: 1:1

         Parameters:
+            func : Function applied to each row.
+            params : List of column names used as input for the function. Default
+                is taken from function signature.
+            output : Dictionary defining new signals and their corresponding types.
+                Default type is taken from function signature. Default can be also
+                taken from kwargs - **signal_map (see below).
+                If signal name is defined using signal_map (see below) only a single
+                type value can be used.
+            **signal_map : kwargs can be used to define `func` together with it's return
+                signal name in format of `map(my_sign=my_func)`. This helps define
+                signal names and function in a nicer way.
+
+        Examples:
+            Using signal_map and single type in output:
+            >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
+            >>> chain.save("new_dataset")

-        ...
-        output : Dictionary defining new signals and their corresponding types. Default
-            type is taken from function signature. Default can be also taken from
-            kwargs - **signal_map (see below).
-            If signal name is defined using signal_map (see below) only a single
-            type value can be used.
-        **signal_map : kwargs can be used to define `func` together with it's return
-            signal name in format of `map(my_sign=my_func)`. This helps define
-            signal names and function in a nicer way.
-        ...
-        Examples
-        --------
-        ...
-        Using signal_map and single type in output:
-        >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
-        >>> chain.save("new_dataset")
-        ...
-        Using func and output as a map:
-        >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
-        >>> chain.save("new_dataset")
+            Using func and output as a map:
+            >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
+            >>> chain.save("new_dataset")
         """

         udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
@@ -375,9 +448,8 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
         **signal_map,
     ) -> "Self":
-        """
-        ...
-        The function needs to return a new objects for each of the new rows.
+        """Apply a function to each row to create new rows (with potentially new
+        signals). The function needs to return a new objects for each of the new rows.
         It returns a chain itself with new signals.

         Input-output relationship: 1:N
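A minimal 1:N sketch of `gen` under the signature shown above: one input row produces one output row per token of the file name (the splitting rule and signal name are illustrative):

```py
from collections.abc import Iterator

def split_name(file) -> Iterator[str]:
    # Yield one new row per "_"-separated part of the file name.
    yield from file.name.split("_")

chain = chain.gen(word=split_name, output=str)
```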
@@ -435,7 +507,9 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
         **signal_map,
     ) -> "Self":
-        """This is a batch version of map().
+        """This is a batch version of map().
+
+        It accepts the same parameters plus an
         additional parameter:
         """
         udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
@@ -455,7 +529,7 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]],
         signal_map,
     ) -> UDFBase:
-        is_generator =
+        is_generator = target_class.is_output_batched
         name = self.name or "Unknown"
         sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)

@@ -476,7 +550,7 @@ class DataChain(DatasetQuery):

     @detach
     def select(self, *args: str) -> "Self":
-        """Select only a specified set of signals"""
+        """Select only a specified set of signals."""
         new_schema = self.signals_schema.resolve(*args)
         columns = new_schema.db_signals()
         chain = super().select(*columns)
@@ -485,7 +559,7 @@ class DataChain(DatasetQuery):

     @detach
     def select_except(self, *args: str) -> "Self":
-        """Select all the signals expect the specified signals"""
+        """Select all the signals expect the specified signals."""
         new_schema = self.signals_schema.select_except_signals(*args)
         columns = new_schema.db_signals()
         chain = super().select(*columns)
@@ -494,6 +568,7 @@ class DataChain(DatasetQuery):

     def get_values(self, *cols: str) -> Iterator[list]:
         """Iterate over rows, getting feature values and applying reader calls.
+
         If columns are specified - limit them to specified columns.
         """
         for features in self.iterate(*cols):
@@ -504,7 +579,9 @@ class DataChain(DatasetQuery):
                 yield item[0]

     def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
-        """Iterate over rows.
+        """Iterate over rows.
+
+        If columns are specified - limit them to specified
         columns.
         """
         chain = self.select(*cols) if cols else self
@@ -563,20 +640,19 @@ class DataChain(DatasetQuery):
     ) -> "Self":
         """Merge two chains based on the specified criteria.

-        Parameters
-        ...
+        Parameters:
+            right_ds : Chain to join with.
+            on : Predicate or list of Predicates to join on. If both chains have the
+                same predicates then this predicate is enough for the join. Otherwise,
+                `right_on` parameter has to specify the predicates for the other chain.
+            right_on: Optional predicate or list of Predicates
+                for the `right_ds` to join.
+            inner (bool): Whether to run inner join or outer join.
+            rname (str): name prefix for conflicting signal names.

         Examples:
-        ...
+            >>> meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
+                right_on=(C.name, C.pq__index))
         """
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")
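A short sketch of the merge parameters documented above; `left` and `right` are hypothetical chains that both carry a `name` signal:

```py
# Inner-join two chains on a shared column; conflicting right-side signals
# get the "right_" prefix instead of the default prefix.
joined = left.merge(right, on=C.name, inner=True, rname="right_")
```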
@@ -599,7 +675,7 @@ class DataChain(DatasetQuery):
             raise DatasetMergeError(
                 on,
                 right_on,
-                ...
+                "'right_on' must be 'str' or 'Sequence' object"
                 f" but got type '{right_on}'",
             )

@@ -616,7 +692,7 @@ class DataChain(DatasetQuery):
             raise DatasetMergeError(
                 on,
                 right_on,
-                ...
+                "'on' and 'right_on' must have the same number of columns in db'."
                 f" on -> {on_str}, right_on -> {right_on_str}",
             )
         else:
@@ -654,7 +730,7 @@ class DataChain(DatasetQuery):

     @classmethod
     def from_pandas(  # type: ignore[override]
-        cls, df, name: str = "", session: Optional[Session] = None
+        cls, df: "pd.DataFrame", name: str = "", session: Optional[Session] = None
     ) -> "DataChain":
         """Generate chain from pandas data-frame."""
         fr_map = {col.lower(): df[col].tolist() for col in df.columns}
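With the new type annotation, `from_pandas` is used as before; a minimal sketch (column names are lowercased and must be valid identifiers that do not clash with the default schema, per the checks in the next hunk):

```py
import pandas as pd
from datachain import DataChain

df = pd.DataFrame({"name": ["a.txt", "b.txt"], "size": [10, 20]})
chain = DataChain.from_pandas(df)
```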
@@ -664,7 +740,7 @@ class DataChain(DatasetQuery):
             raise DatasetPrepareError(
                 name,
                 f"import from pandas error - column '{column}' conflicts with"
-                ...
+                " default schema",
             )
         if not column.isidentifier():
             raise DatasetPrepareError(
@@ -674,6 +750,131 @@ class DataChain(DatasetQuery):

         return cls.from_features(name, session, **fr_map)

+    def parse_tabular(
+        self,
+        output: Optional[dict[str, FeatureType]] = None,
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain from list of tabular files.
+
+        Parameters:
+            output : Dictionary defining column names and their corresponding types.
+            kwargs : Parameters to pass to pyarrow.dataset.dataset.
+
+        Examples:
+            Reading a json lines file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.jsonl")
+            >>> dc = dc.parse_tabular(format="json")
+
+            Reading a filtered list of files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.jsonl"))
+            >>> dc = dc.parse_tabular(format="json")
+        """
+        from pyarrow import unify_schemas
+        from pyarrow.dataset import dataset
+
+        from datachain.lib.arrow import ArrowGenerator, Source, schema_to_output
+
+        schema = None
+        if output:
+            output = {"source": Source} | output
+        else:
+            schemas = []
+            for row in self.select("file").iterate():
+                file = row[0]
+                ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+                schemas.append(ds.schema)
+            if not schemas:
+                msg = "error parsing tabular data schema - found no files to parse"
+                raise DatasetPrepareError(self.name, msg)
+            schema = unify_schemas(schemas)
+            try:
+                output = schema_to_output(schema)
+                print(f"Inferred tabular data schema: {output}")
+            except ValueError as e:
+                raise DatasetPrepareError(self.name, e) from e
+
+        return self.gen(ArrowGenerator(schema, **kwargs), output=output)
+
+    def parse_csv(
+        self,
+        delimiter: str = ",",
+        header: bool = True,
+        column_names: Optional[list[str]] = None,
+        output: Optional[dict[str, FeatureType]] = None,
+    ) -> "DataChain":
+        """Generate chain from list of csv files.
+
+        Parameters:
+            delimiter : Character for delimiting columns.
+            header : Whether the files include a header row.
+            column_names : Column names if no header. Implies `header = False`.
+            output : Dictionary defining column names and their corresponding types.
+
+        Examples:
+            Reading a csv file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.csv")
+            >>> dc = dc.parse_tabular(format="csv")
+
+            Reading a filtered list of csv files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.csv"))
+            >>> dc = dc.parse_tabular()
+        """
+        from pyarrow.csv import ParseOptions, ReadOptions
+        from pyarrow.dataset import CsvFileFormat
+
+        if column_names and output:
+            msg = "error parsing csv - only one of column_names or output is allowed"
+            raise DatasetPrepareError(self.name, msg)
+
+        if not header and not column_names:
+            if output:
+                column_names = list(output.keys())
+            else:
+                msg = "error parsing csv - provide column_names or output if no header"
+                raise DatasetPrepareError(self.name, msg)
+
+        parse_options = ParseOptions(delimiter=delimiter)
+        read_options = ReadOptions(column_names=column_names)
+        format = CsvFileFormat(parse_options=parse_options, read_options=read_options)
+        return self.parse_tabular(output=output, format=format)
+
+    def parse_parquet(
+        self,
+        partitioning: Any = "hive",
+        output: Optional[dict[str, FeatureType]] = None,
+    ) -> "DataChain":
+        """Generate chain from list of parquet files.
+
+        Parameters:
+            partitioning : Any pyarrow partitioning schema.
+            output : Dictionary defining column names and their corresponding types.
+
+        Examples:
+            Reading a single file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.parquet")
+            >>> dc = dc.parse_tabular()
+
+            Reading a partitioned dataset from a directory:
+            >>> dc = DataChain.from_storage("path/to/dir")
+            >>> dc = dc.parse_tabular()
+
+            Reading a filtered list of files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.parquet"))
+            >>> dc = dc.parse_tabular()
+
+            Reading a filtered list of partitions as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.parent").glob("*month=1*"))
+            >>> dc = dc.parse_tabular()
+        """
+        return self.parse_tabular(
+            output=output, format="parquet", partitioning=partitioning
+        )
+
     @classmethod
     def create_empty(
         cls,
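The three new parsers compose with `from_storage` and `filter` as in the docstrings above; a combined sketch (bucket names and column names are illustrative, and `C` is the `Column` alias defined at the top of this module):

```py
from datachain import DataChain
from datachain.lib.dc import C

# Parquet: let pyarrow unify the schema across the filtered files.
tables = (
    DataChain.from_storage("s3://my-bucket/tables/")
    .filter(C("file.name").glob("*.parquet"))
    .parse_parquet()
)

# Headerless CSV: provide column names (or an explicit `output` mapping).
rows = (
    DataChain.from_storage("s3://my-bucket/raw-csv/")
    .filter(C("file.name").glob("*.csv"))
    .parse_csv(header=False, column_names=["name", "price"])
)
```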
@@ -683,17 +884,13 @@ class DataChain(DatasetQuery):
         """Create empty chain. Returns a chain. This method is used for programmatically
         generating a chains in contrast of reading data from storages or other sources.

-        Parameters
-        ...
-        to_insert : records (or a single record) to insert. Each record is a dictionary
-            of signals and theirs values.
-        ...
-        Examples
-        --------
+        Parameters:
+            to_insert : records (or a single record) to insert. Each record is
+                a dictionary of signals and theirs values.

-        ...
+        Examples:
+            >>> empty = DataChain.create_empty()
+            >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
         """
         session = Session.get(session)
         dsr = cls.create_empty_record(session=session)
@@ -740,10 +937,12 @@ class DataChain(DatasetQuery):
     @detach
     def chunk(self, index: int, total: int) -> "DataChain":
         """Split a query into smaller chunks for e.g. parallelization.
-        ...
+
+        Examples:
         >>> dc = DataChain(...)
         >>> chunk_1 = dc._chunk(0, 2)
         >>> chunk_2 = dc._chunk(1, 2)
+
         Note:
             Bear in mind that `index` is 0-indexed but `total` isn't.
             Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
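A sketch of splitting work across two processes with `chunk`, following the 0-indexed convention noted above (the dataset name is illustrative):

```py
from datachain import DataChain

dc = DataChain.from_dataset("ratings")  # illustrative dataset name
chunk_1 = dc.chunk(0, 2)  # first half
chunk_2 = dc.chunk(1, 2)  # second half
```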