datachain 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

Files changed (44)
  1. datachain/_version.py +2 -2
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +2 -1
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +2 -3
  13. datachain/data_storage/warehouse.py +31 -30
  14. datachain/dataset.py +1 -3
  15. datachain/lib/arrow.py +85 -0
  16. datachain/lib/dc.py +377 -178
  17. datachain/lib/feature.py +41 -90
  18. datachain/lib/feature_registry.py +3 -1
  19. datachain/lib/feature_utils.py +2 -2
  20. datachain/lib/file.py +20 -20
  21. datachain/lib/image.py +9 -2
  22. datachain/lib/meta_formats.py +66 -34
  23. datachain/lib/settings.py +5 -5
  24. datachain/lib/signal_schema.py +103 -105
  25. datachain/lib/udf.py +3 -12
  26. datachain/lib/udf_signature.py +11 -6
  27. datachain/lib/webdataset_laion.py +5 -22
  28. datachain/listing.py +8 -8
  29. datachain/node.py +1 -1
  30. datachain/progress.py +1 -1
  31. datachain/query/builtins.py +1 -1
  32. datachain/query/dataset.py +39 -110
  33. datachain/query/dispatch.py +1 -1
  34. datachain/query/metrics.py +19 -0
  35. datachain/query/schema.py +13 -3
  36. datachain/sql/__init__.py +1 -1
  37. datachain/utils.py +1 -122
  38. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
  39. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
  40. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
  41. datachain/lib/parquet.py +0 -32
  42. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
  43. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
  44. {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
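
Several of the entries above belong to one change: a new Arrow-based tabular ingestion path (datachain/lib/arrow.py is added with 85 lines, datachain/lib/parquet.py is removed, and datachain/lib/dc.py gains parse_tabular, parse_csv and parse_parquet). The sketch below shows how that API reads, assembled from the docstrings in the dc.py diff that follows; the bucket and file names are placeholders, and the import path is the module shown in the diff rather than a confirmed public entry point.

```py
from datachain.lib.dc import C, DataChain

# Column names and types are inferred from the CSV files themselves.
csv_chain = (
    DataChain.from_storage("s3://mybucket")
    .filter(C("file.name").glob("*.csv"))
    .parse_csv()
)

# parse_parquet() wraps parse_tabular(format="parquet", partitioning="hive").
pq_chain = (
    DataChain.from_storage("s3://mybucket/file.parquet")
    .parse_parquet()
)
```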
datachain/lib/dc.py CHANGED
@@ -1,12 +1,21 @@
+import re
 from collections.abc import Iterator, Sequence
-from typing import TYPE_CHECKING, Callable, ClassVar, Literal, Optional, Union
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    ClassVar,
+    Literal,
+    Optional,
+    Union,
+)
 
 import sqlalchemy
 
 from datachain.lib.feature import Feature, FeatureType
 from datachain.lib.feature_utils import features_to_tuples
 from datachain.lib.file import File, get_file
-from datachain.lib.meta_formats import read_meta
+from datachain.lib.meta_formats import read_meta, read_schema
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import (
@@ -27,6 +36,7 @@ from datachain.query.dataset import (
 from datachain.query.schema import Column, DatasetRow
 
 if TYPE_CHECKING:
+    import pandas as pd
     from typing_extensions import Self
 
 C = Column
@@ -68,44 +78,43 @@ class DataChain(DatasetQuery):
     The supported set of field types include: majority of the type supported by the
     underlyind library `Pydantic`.
 
-    See Also
-    --------
-    DataChain.from_storage("s3://my-bucket/my-dir/") - reading unstructured data files
-    from storages such as S3, gs or Azure ADLS.
-
-    DataChain.save("name") - saving to a dataset.
-
-    DataChain.from_dataset("name") - reading from a dataset.
-
-    DataChain.from_features(fib=[1, 2, 3, 5, 8]) - generating from a values.
-
-
-    Examples
-    --------
-
-    >>> from datachain import DataChain, Feature
-    >>> from datachain.lib.claude import claude_processor
-    >>>
-    >>> class Rating(Feature):
-    >>>   status: str = ""
-    >>>   explanation: str = ""
-    >>>
-    >>> PROMPT = "A 'user' is a human trying to find the best mobile plan.... "
-    >>> MODEL = "claude-3-opus-20240229"
-    >>>
-    >>> chain = (
-    >>>     DataChain.from_storage("s3://my-bucket/my")
-    >>>     .filter(C.name.glob("*.txt"))
-    >>>     .limit(5)
-    >>>     .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
-    >>>     .map(
-    >>>         rating=lambda claude: Rating(
-    >>>             **(json.loads(claude.content[0].text) if claude.content else {})
-    >>>         ),
-    >>>         output=Rating,
-    >>>     )
-    >>> chain.save("ratings")
-    >>> print(chain)
+    See Also:
+        `DataChain.from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+        data files from storages such as S3, gs or Azure ADLS.
+
+        `DataChain.save("name")` - saving to a dataset.
+
+        `DataChain.from_dataset("name")` - reading from a dataset.
+
+        `DataChain.from_features(fib=[1, 2, 3, 5, 8])` - generating from a values.
+
+
+    Example:
+        ```py
+        from datachain import DataChain, Feature
+        from datachain.lib.claude import claude_processor
+
+        class Rating(Feature):
+            status: str = ""
+            explanation: str = ""
+
+        PROMPT = "A 'user' is a human trying to find the best mobile plan.... "
+        MODEL = "claude-3-opus-20240229"
+
+        chain = (
+            DataChain.from_storage("s3://my-bucket/my")
+            .filter(C.name.glob("*.txt"))
+            .limit(5)
+            .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
+            .map(
+                rating=lambda claude: Rating(
+                    **(json.loads(claude.content[0].text) if claude.content else {})
+                ),
+                output=Rating,
+            )
+        chain.save("ratings")
+        print(chain)
+        ```
     """
 
     DEFAULT_FILE_RECORD: ClassVar[dict] = {
@@ -119,8 +128,7 @@ class DataChain(DatasetQuery):
 
     def __init__(self, *args, **kwargs):
         """This method needs to be redefined as a part of Dataset and DacaChin
-        decoupling
-        """
+        decoupling."""
         super().__init__(
             *args,
             **kwargs,
@@ -133,6 +141,16 @@ class DataChain(DatasetQuery):
         else:
             self.signals_schema = SignalSchema.from_column_types(self.column_types)
 
+    @property
+    def schema(self):
+        return self.signals_schema.values if self.signals_schema else None
+
+    def print_schema(self):
+        self.signals_schema.print_tree()
+
+    def create_model(self, name: str) -> type[Feature]:
+        return self.signals_schema.create_model(name)
+
     def settings(
         self, cache=None, batch=None, parallel=None, workers=None, min_task_size=None
     ) -> "Self":
@@ -141,29 +159,28 @@ class DataChain(DatasetQuery):
         This function changes specified settings without changing not specified ones.
         It returns chain, so, it can be chained later with next operation.
 
-        Parameters
-        ----------
-        cache : data caching (default=False)
-        batch : size of the batch (default=1000)
-        parallel : number of thread for processors. True is a special value to
-          enable all available CPUs (default=1)
-        workers : number of distributed workers. Only for Studio mode. (default=1)
-        min_task_size : minimum number of tasks (default=1)
-
-        Examples
-        --------
-
-        >>> chain = (
-        >>>     chain
-        >>>     .settings(cache=True, parallel=8)
-        >>>     .map(laion=process_webdataset(spec=WDSLaion), params="file")
-        >>> )
+        Parameters:
+            cache : data caching (default=False)
+            batch : size of the batch (default=1000)
+            parallel : number of thread for processors. True is a special value to
+                enable all available CPUs (default=1)
+            workers : number of distributed workers. Only for Studio mode. (default=1)
+            min_task_size : minimum number of tasks (default=1)
+
+        Example:
+            ```py
+            chain = (
+                chain
+                .settings(cache=True, parallel=8)
+                .map(laion=process_webdataset(spec=WDSLaion), params="file")
+            )
+            ```
         """
         self._settings.add(Settings(cache, batch, parallel, workers, min_task_size))
         return self
 
     def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
-        """Reset all settings to default values"""
+        """Reset all settings to default values."""
        self._settings = settings if settings else Settings()
        return self
 
@@ -184,39 +201,37 @@ class DataChain(DatasetQuery):
         cls,
         path,
         type: Literal["binary", "text", "image"] = "binary",
+        recursive: Optional[bool] = True,
         anon: bool = False,
     ) -> "DataChain":
-        """Get data from a storage as a list of file with all file attributes.
-        It returns the chain itself as usual.
+        """Get data from a storage as a list of file with all file attributes. It
+        returns the chain itself as usual.
 
-        Parameters
-        ----------
-        path : storage URI with directory. URI must start with storage prefix such
-          as `s3://`, `gs://`, `az://` or "file:///"
-        type : read file as "binary", "text", or "image" data. Default is "binary".
-        anon : use anonymous mode to access the storage.
-
-        Examples
-        --------
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            recursive : search recursively for the given path.
+            anon : use anonymous mode to access the storage.
 
-        >>> chain = DataChain.from_storage("s3://my-bucket/my-dir")
+        Example:
+            ```py
+            chain = DataChain.from_storage("s3://my-bucket/my-dir")
+            ```
         """
         func = get_file(type)
-        return DataChain(path, anon=anon).map(file=func)
+        return DataChain(path, recursive=recursive, anon=anon).map(file=func)
 
     @classmethod
     def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
         """Get data from dataset. It returns the chain itself.
 
-        Parameters
-        ----------
-        name : dataset name
-        version : dataset version
-
-        Examples
-        --------
+        Parameters:
+            name : dataset name
+            version : dataset version
 
-        >>> chain = DataChain.from_dataset("my_cats")
+        Examples:
+            >>> chain = DataChain.from_dataset("my_cats")
         """
         return DataChain(name=name, version=version)
 
@@ -228,37 +243,44 @@ class DataChain(DatasetQuery):
         anon: bool = False,
         spec: Optional[FeatureType] = None,
         schema_from: Optional[str] = "auto",
+        object_name: Optional[str] = "csv",
+        model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
     ) -> "DataChain":
         """Get data from CSV. It returns the chain itself.
 
-        Parameters
-        ----------
-        path : storage URI with directory. URI must start with storage prefix such
-          as `s3://`, `gs://`, `az://` or "file:///"
-        type : read file as "binary", "text", or "image" data. Default is "binary".
-        anon : use anonymous mode to access the storage.
-        spec : optional Data Model
-        schema_from : path to sample to infer spec from
-        show_schema : print auto-generated schema
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "text".
+            anon : use anonymous mode to access the storage.
+            spec : Data Model for CSV file
+            object_name : generated object column name
+            model_name : generated model name
+            schema_from : path to sample to infer spec from
+            show_schema : print auto-generated schema
 
-        Examples
-        --------
+        Examples:
+            infer model from the first two lines (header + data)
+            >>> chain = DataChain.from_csv("gs://csv")
 
-        >>> chain = DataChain.from_csv("gs://csv")
+            use a particular data model
+            >>> chain = DataChain.from_csv("gs://csv"i, spec=MyModel)
         """
         if schema_from == "auto":
             schema_from = path
 
         chain = DataChain.from_storage(path=path, type=type, anon=anon)
-        return chain.gen(
-            csv=read_meta(
+        signal_dict = {
+            object_name: read_meta(
                 schema_from=schema_from,
                 meta_type="csv",
                 spec=spec,
+                model_name=model_name,
                 show_schema=show_schema,
             )
-        )
+        }
+        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
 
     @classmethod
     def from_json(
@@ -269,50 +291,104 @@ class DataChain(DatasetQuery):
         spec: Optional[FeatureType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
+        object_name: Optional[str] = None,
+        model_name: Optional[str] = None,
         show_schema: Optional[bool] = False,
+        meta_type: Optional[str] = "json",
     ) -> "DataChain":
-        """Get data from CSV. It returns the chain itself.
+        """Get data from JSON. It returns the chain itself.
 
-        Parameters
-        ----------
-        path : storage URI with directory. URI must start with storage prefix such
-          as `s3://`, `gs://`, `az://` or "file:///"
-        type : read file as "binary", "text", or "image" data. Default is "binary".
-        anon : use anonymous mode to access the storage.
-        spec : optional Data Model
-        schema_from : path to sample to infer spec from
-        show_schema : print auto-generated schema
-        jmespath : JMESPATH expression to reduce JSON
-        name : return object name
-        Examples
-        --------
-
-        >>> chain = DataChain.from_json("gs://json")
+        Parameters:
+            path : storage URI with directory. URI must start with storage prefix such
+                as `s3://`, `gs://`, `az://` or "file:///"
+            type : read file as "binary", "text", or "image" data. Default is "binary".
+            anon : use anonymous mode to access the storage.
+            spec : optional Data Model
+            schema_from : path to sample to infer spec from
+            object_name : generated object column name
+            model_name : generated model name
+            show_schema : print auto-generated schema
+            jmespath : JMESPATH expression to reduce JSON
+
+        Examples:
+            infer JSON schema from data, reduce using JMESPATH, print schema
+            >>> chain = DataChain.from_json("gs://json", jmespath="key1.key2")
+
+            infer JSON schema from a particular path, print data model
+            >>> chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
         """
         if schema_from == "auto":
             schema_from = path
 
+        def jmespath_to_name(s: str):
+            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
+            return s[:name_end]
+
+        if (not object_name) and jmespath:
+            object_name = jmespath_to_name(jmespath)
+        if not object_name:
+            object_name = "json"
         chain = DataChain.from_storage(path=path, type=type, anon=anon)
-        return chain.gen(
-            json=read_meta(
+        signal_dict = {
+            object_name: read_meta(
                 schema_from=schema_from,
-                meta_type="json",
+                meta_type=meta_type,
                 spec=spec,
+                model_name=model_name,
                 show_schema=show_schema,
                 jmespath=jmespath,
             )
+        }
+        return chain.gen(**signal_dict)  # type: ignore[arg-type]
+
+    def show_json_schema(  # type: ignore[override]
+        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+    ) -> "DataChain":
+        """Print JSON data model and save it. It returns the chain itself.
+
+        Parameters:
+            jmespath : JMESPATH expression to reduce JSON
+            model_name : generated model name
+
+        Examples:
+            print JSON schema and save to column "meta_from":
+            >>> uri = "gs://datachain-demo/coco2017/annotations_captions/"
+            >>> chain = DataChain.from_storage(uri)
+            >>> chain = chain.show_json_schema()
+            >>> chain.save()
+        """
+        return self.map(
+            meta_schema=lambda file: read_schema(
+                file, data_type="json", expr=jmespath, model_name=model_name
+            ),
+            output=str,
+        )
+
+    def show_jsonl_schema(  # type: ignore[override]
+        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+    ) -> "DataChain":
+        """Print JSON data model and save it. It returns the chain itself.
+
+        Parameters:
+            jmespath : JMESPATH expression to reduce JSON
+            model_name : generated model name
+        """
+        return self.map(
+            meta_schema=lambda file: read_schema(
+                file, data_type="jsonl", expr=jmespath, model_name=model_name
+            ),
+            output=str,
         )
 
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None
     ) -> "DataChain":
-        """Save to a Dataset. It returns the chain itself
+        """Save to a Dataset. It returns the chain itself.
 
-        Parameters
-        ----------
-        name : dataset name. Empty name saves to a temporary dataset that will be
-          removed after process ends. Temp dataset are useful for optimization.
-        version : version of a dataset. Default - the last version that exist.
+        Parameters:
+            name : dataset name. Empty name saves to a temporary dataset that will be
+                removed after process ends. Temp dataset are useful for optimization.
+            version : version of a dataset. Default - the last version that exist.
         """
         schema = self.signals_schema.serialize()
         return super().save(name=name, version=version, feature_schema=schema)
@@ -333,29 +409,26 @@ class DataChain(DatasetQuery):
         Input-output relationship: 1:1
 
         Parameters:
+            func : Function applied to each row.
+            params : List of column names used as input for the function. Default
+                is taken from function signature.
+            output : Dictionary defining new signals and their corresponding types.
+                Default type is taken from function signature. Default can be also
+                taken from kwargs - **signal_map (see below).
+                If signal name is defined using signal_map (see below) only a single
+                type value can be used.
+            **signal_map : kwargs can be used to define `func` together with it's return
+                signal name in format of `map(my_sign=my_func)`. This helps define
+                signal names and function in a nicer way.
+
+        Examples:
+            Using signal_map and single type in output:
+            >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
+            >>> chain.save("new_dataset")
 
-        func : Function applied to each row.
-        params : List of column names used as input for the function. Default
-          is taken from function signature.
-        output : Dictionary defining new signals and their corresponding types. Default
-          type is taken from function signature. Default can be also taken from
-          kwargs - **signal_map (see below).
-          If signal name is defined using signal_map (see below) only a single
-          type value can be used.
-        **signal_map : kwargs can be used to define `func` together with it's return
-          signal name in format of `map(my_sign=my_func)`. This helps define
-          signal names and function in a nicer way.
-
-        Examples
-        --------
-
-        Using signal_map and single type in output:
-        >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
-        >>> chain.save("new_dataset")
-
-        Using func and output as a map:
-        >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
-        >>> chain.save("new_dataset")
+            Using func and output as a map:
+            >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
+            >>> chain.save("new_dataset")
         """
 
         udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
@@ -375,9 +448,8 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
         **signal_map,
     ) -> "Self":
-        """
-        Apply a function to each row to create new rows (with potentially new signals).
-        The function needs to return a new objects for each of the new rows.
+        """Apply a function to each row to create new rows (with potentially new
+        signals). The function needs to return a new objects for each of the new rows.
         It returns a chain itself with new signals.
 
         Input-output relationship: 1:N
@@ -435,7 +507,9 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
         **signal_map,
     ) -> "Self":
-        """This is a batch version of map(). It accepts the same parameters plus an
+        """This is a batch version of map().
+
+        It accepts the same parameters plus an
         additional parameter:
         """
         udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
@@ -455,7 +529,7 @@ class DataChain(DatasetQuery):
         output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]],
         signal_map,
     ) -> UDFBase:
-        is_generator = issubclass(target_class, (Generator, Aggregator, BatchMapper))
+        is_generator = target_class.is_output_batched
         name = self.name or "Unknown"
         sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
 
@@ -476,7 +550,7 @@ class DataChain(DatasetQuery):
 
     @detach
     def select(self, *args: str) -> "Self":
-        """Select only a specified set of signals"""
+        """Select only a specified set of signals."""
         new_schema = self.signals_schema.resolve(*args)
         columns = new_schema.db_signals()
         chain = super().select(*columns)
@@ -485,7 +559,7 @@ class DataChain(DatasetQuery):
 
     @detach
     def select_except(self, *args: str) -> "Self":
-        """Select all the signals expect the specified signals"""
+        """Select all the signals expect the specified signals."""
         new_schema = self.signals_schema.select_except_signals(*args)
         columns = new_schema.db_signals()
         chain = super().select(*columns)
@@ -494,6 +568,7 @@ class DataChain(DatasetQuery):
 
     def get_values(self, *cols: str) -> Iterator[list]:
         """Iterate over rows, getting feature values and applying reader calls.
+
         If columns are specified - limit them to specified columns.
         """
         for features in self.iterate(*cols):
@@ -504,7 +579,9 @@ class DataChain(DatasetQuery):
             yield item[0]
 
     def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
-        """Iterate over rows. If columns are specified - limit them to specified
+        """Iterate over rows.
+
+        If columns are specified - limit them to specified
         columns.
         """
         chain = self.select(*cols) if cols else self
@@ -563,20 +640,19 @@ class DataChain(DatasetQuery):
     ) -> "Self":
         """Merge two chains based on the specified criteria.
 
-        Parameters
-        ----------
-        right_ds : Chain to join with.
-        on : Predicate or list of Predicates to join on. If both chains have the same
-          predicates then this predicate is enough for the join. Otherwise,
-          `right_on` parameter has to specify the predicates for the other chain.
-        right_on: Optional predicate or list of Predicates for the `right_ds` to join.
-        inner: Whether to run inner join or outer join. Default is False.
-        rname: name prefix for conflicting signal names. Default: "{name}_right"
+        Parameters:
+            right_ds : Chain to join with.
+            on : Predicate or list of Predicates to join on. If both chains have the
+                same predicates then this predicate is enough for the join. Otherwise,
+                `right_on` parameter has to specify the predicates for the other chain.
+            right_on: Optional predicate or list of Predicates
+                for the `right_ds` to join.
+            inner (bool): Whether to run inner join or outer join.
+            rname (str): name prefix for conflicting signal names.
 
         Examples:
-        >>> meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
-                                  right_on=(C.name, C.pq__index))
-
+            >>> meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
+                                      right_on=(C.name, C.pq__index))
         """
         if on is None:
             raise DatasetMergeError(["None"], None, "'on' must be specified")
@@ -599,7 +675,7 @@ class DataChain(DatasetQuery):
             raise DatasetMergeError(
                 on,
                 right_on,
-                f"'right_on' must be 'str' or 'Sequence' object"
+                "'right_on' must be 'str' or 'Sequence' object"
                 f" but got type '{right_on}'",
             )
 
@@ -616,7 +692,7 @@ class DataChain(DatasetQuery):
                 raise DatasetMergeError(
                     on,
                     right_on,
-                    f"'on' and 'right_on' must have the same number of columns in db'."
+                    "'on' and 'right_on' must have the same number of columns in db'."
                     f" on -> {on_str}, right_on -> {right_on_str}",
                 )
             else:
@@ -654,7 +730,7 @@ class DataChain(DatasetQuery):
 
     @classmethod
     def from_pandas(  # type: ignore[override]
-        cls, df, name: str = "", session: Optional[Session] = None
+        cls, df: "pd.DataFrame", name: str = "", session: Optional[Session] = None
     ) -> "DataChain":
         """Generate chain from pandas data-frame."""
         fr_map = {col.lower(): df[col].tolist() for col in df.columns}
@@ -664,7 +740,7 @@ class DataChain(DatasetQuery):
                 raise DatasetPrepareError(
                     name,
                     f"import from pandas error - column '{column}' conflicts with"
-                    f" default schema",
+                    " default schema",
                 )
             if not column.isidentifier():
                 raise DatasetPrepareError(
@@ -674,6 +750,131 @@ class DataChain(DatasetQuery):
 
         return cls.from_features(name, session, **fr_map)
 
+    def parse_tabular(
+        self,
+        output: Optional[dict[str, FeatureType]] = None,
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain from list of tabular files.
+
+        Parameters:
+            output : Dictionary defining column names and their corresponding types.
+            kwargs : Parameters to pass to pyarrow.dataset.dataset.
+
+        Examples:
+            Reading a json lines file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.jsonl")
+            >>> dc = dc.parse_tabular(format="json")
+
+            Reading a filtered list of files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.jsonl"))
+            >>> dc = dc.parse_tabular(format="json")
+        """
+        from pyarrow import unify_schemas
+        from pyarrow.dataset import dataset
+
+        from datachain.lib.arrow import ArrowGenerator, Source, schema_to_output
+
+        schema = None
+        if output:
+            output = {"source": Source} | output
+        else:
+            schemas = []
+            for row in self.select("file").iterate():
+                file = row[0]
+                ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs)  # type: ignore[union-attr]
+                schemas.append(ds.schema)
+            if not schemas:
+                msg = "error parsing tabular data schema - found no files to parse"
+                raise DatasetPrepareError(self.name, msg)
+            schema = unify_schemas(schemas)
+            try:
+                output = schema_to_output(schema)
+                print(f"Inferred tabular data schema: {output}")
+            except ValueError as e:
+                raise DatasetPrepareError(self.name, e) from e
+
+        return self.gen(ArrowGenerator(schema, **kwargs), output=output)
+
+    def parse_csv(
+        self,
+        delimiter: str = ",",
+        header: bool = True,
+        column_names: Optional[list[str]] = None,
+        output: Optional[dict[str, FeatureType]] = None,
+    ) -> "DataChain":
+        """Generate chain from list of csv files.
+
+        Parameters:
+            delimiter : Character for delimiting columns.
+            header : Whether the files include a header row.
+            column_names : Column names if no header. Implies `header = False`.
+            output : Dictionary defining column names and their corresponding types.
+
+        Examples:
+            Reading a csv file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.csv")
+            >>> dc = dc.parse_tabular(format="csv")
+
+            Reading a filtered list of csv files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.csv"))
+            >>> dc = dc.parse_tabular()
+        """
+        from pyarrow.csv import ParseOptions, ReadOptions
+        from pyarrow.dataset import CsvFileFormat
+
+        if column_names and output:
+            msg = "error parsing csv - only one of column_names or output is allowed"
+            raise DatasetPrepareError(self.name, msg)
+
+        if not header and not column_names:
+            if output:
+                column_names = list(output.keys())
+            else:
+                msg = "error parsing csv - provide column_names or output if no header"
+                raise DatasetPrepareError(self.name, msg)
+
+        parse_options = ParseOptions(delimiter=delimiter)
+        read_options = ReadOptions(column_names=column_names)
+        format = CsvFileFormat(parse_options=parse_options, read_options=read_options)
+        return self.parse_tabular(output=output, format=format)
+
+    def parse_parquet(
+        self,
+        partitioning: Any = "hive",
+        output: Optional[dict[str, FeatureType]] = None,
+    ) -> "DataChain":
+        """Generate chain from list of parquet files.
+
+        Parameters:
+            partitioning : Any pyarrow partitioning schema.
+            output : Dictionary defining column names and their corresponding types.
+
+        Examples:
+            Reading a single file:
+            >>> dc = DataChain.from_storage("s3://mybucket/file.parquet")
+            >>> dc = dc.parse_tabular()
+
+            Reading a partitioned dataset from a directory:
+            >>> dc = DataChain.from_storage("path/to/dir")
+            >>> dc = dc.parse_tabular()
+
+            Reading a filtered list of files as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.name").glob("*.parquet"))
+            >>> dc = dc.parse_tabular()
+
+            Reading a filtered list of partitions as a dataset:
+            >>> dc = DataChain.from_storage("s3://mybucket")
+            >>> dc = dc.filter(C("file.parent").glob("*month=1*"))
+            >>> dc = dc.parse_tabular()
+        """
+        return self.parse_tabular(
+            output=output, format="parquet", partitioning=partitioning
+        )
+
     @classmethod
     def create_empty(
         cls,
@@ -683,17 +884,13 @@ class DataChain(DatasetQuery):
         """Create empty chain. Returns a chain. This method is used for programmatically
         generating a chains in contrast of reading data from storages or other sources.
 
-        Parameters
-        ----------
-
-        to_insert : records (or a single record) to insert. Each record is a dictionary
-          of signals and theirs values.
-
-        Examples
-        --------
+        Parameters:
+            to_insert : records (or a single record) to insert. Each record is
+                a dictionary of signals and theirs values.
 
-        >>> empty = DataChain.create_empty()
-        >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
+        Examples:
+            >>> empty = DataChain.create_empty()
+            >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
         """
         session = Session.get(session)
         dsr = cls.create_empty_record(session=session)
@@ -740,10 +937,12 @@ class DataChain(DatasetQuery):
     @detach
    def chunk(self, index: int, total: int) -> "DataChain":
         """Split a query into smaller chunks for e.g. parallelization.
-        Example:
+
+        Examples:
            >>> dc = DataChain(...)
            >>> chunk_1 = dc._chunk(0, 2)
            >>> chunk_2 = dc._chunk(1, 2)
+
        Note:
            Bear in mind that `index` is 0-indexed but `total` isn't.
            Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
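
One behavioral detail from the from_json hunk above is easy to miss: when `object_name` is not passed, the new code derives the output column name from the JMESPATH expression, falling back to "json". A small self-contained sketch of that helper's logic, reproduced from the diff (the sample expressions are illustrative):

```py
import re


def jmespath_to_name(s: str) -> str:
    # Keep the prefix of the expression up to the first non-word character,
    # mirroring the helper added inside DataChain.from_json in this release.
    match = re.search(r"\W", s)
    return s[: match.start()] if match else s


print(jmespath_to_name("key1.key2"))    # -> "key1", used as the object column name
print(jmespath_to_name("annotations"))  # -> "annotations"
# With neither jmespath nor object_name given, the column name falls back to "json".
```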