datachain 0.1.13__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic.

Files changed (49)
  1. datachain/__init__.py +0 -4
  2. datachain/asyn.py +3 -3
  3. datachain/catalog/__init__.py +3 -3
  4. datachain/catalog/catalog.py +6 -6
  5. datachain/catalog/loader.py +3 -3
  6. datachain/cli.py +10 -2
  7. datachain/client/azure.py +37 -1
  8. datachain/client/fsspec.py +1 -1
  9. datachain/client/local.py +1 -1
  10. datachain/data_storage/__init__.py +1 -1
  11. datachain/data_storage/metastore.py +11 -3
  12. datachain/data_storage/schema.py +12 -7
  13. datachain/data_storage/sqlite.py +3 -0
  14. datachain/data_storage/warehouse.py +31 -30
  15. datachain/dataset.py +1 -3
  16. datachain/lib/arrow.py +85 -0
  17. datachain/lib/cached_stream.py +3 -85
  18. datachain/lib/dc.py +382 -179
  19. datachain/lib/feature.py +46 -91
  20. datachain/lib/feature_registry.py +4 -1
  21. datachain/lib/feature_utils.py +2 -2
  22. datachain/lib/file.py +30 -44
  23. datachain/lib/image.py +9 -2
  24. datachain/lib/meta_formats.py +66 -34
  25. datachain/lib/settings.py +5 -5
  26. datachain/lib/signal_schema.py +103 -105
  27. datachain/lib/udf.py +10 -38
  28. datachain/lib/udf_signature.py +11 -6
  29. datachain/lib/webdataset_laion.py +5 -22
  30. datachain/listing.py +8 -8
  31. datachain/node.py +1 -1
  32. datachain/progress.py +1 -1
  33. datachain/query/builtins.py +1 -1
  34. datachain/query/dataset.py +42 -119
  35. datachain/query/dispatch.py +1 -1
  36. datachain/query/metrics.py +19 -0
  37. datachain/query/schema.py +13 -3
  38. datachain/sql/__init__.py +1 -1
  39. datachain/sql/sqlite/base.py +34 -2
  40. datachain/sql/sqlite/vector.py +13 -5
  41. datachain/utils.py +1 -122
  42. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/METADATA +11 -4
  43. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/RECORD +47 -47
  44. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/WHEEL +1 -1
  45. datachain/_version.py +0 -16
  46. datachain/lib/parquet.py +0 -32
  47. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/LICENSE +0 -0
  48. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/entry_points.txt +0 -0
  49. {datachain-0.1.13.dist-info → datachain-0.2.1.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py CHANGED
@@ -1,12 +1,21 @@
+ import re
  from collections.abc import Iterator, Sequence
- from typing import TYPE_CHECKING, Callable, ClassVar, Literal, Optional, Union
+ from typing import (
+ TYPE_CHECKING,
+ Any,
+ Callable,
+ ClassVar,
+ Literal,
+ Optional,
+ Union,
+ )
 
  import sqlalchemy
 
  from datachain.lib.feature import Feature, FeatureType
  from datachain.lib.feature_utils import features_to_tuples
  from datachain.lib.file import File, get_file
- from datachain.lib.meta_formats import read_meta
+ from datachain.lib.meta_formats import read_meta, read_schema
  from datachain.lib.settings import Settings
  from datachain.lib.signal_schema import SignalSchema
  from datachain.lib.udf import (
@@ -27,8 +36,11 @@ from datachain.query.dataset import (
  from datachain.query.schema import Column, DatasetRow
 
  if TYPE_CHECKING:
+ import pandas as pd
  from typing_extensions import Self
 
+ from datachain.catalog import Catalog
+
  C = Column
 
 
@@ -68,44 +80,43 @@ class DataChain(DatasetQuery):
  The supported set of field types include: majority of the type supported by the
  underlyind library `Pydantic`.
 
- See Also
- --------
- DataChain.from_storage("s3://my-bucket/my-dir/") - reading unstructured data files
- from storages such as S3, gs or Azure ADLS.
-
- DataChain.save("name") - saving to a dataset.
-
- DataChain.from_dataset("name") - reading from a dataset.
-
- DataChain.from_features(fib=[1, 2, 3, 5, 8]) - generating from a values.
-
-
- Examples
- --------
-
- >>> from datachain import DataChain, Feature
- >>> from datachain.lib.claude import claude_processor
- >>>
- >>> class Rating(Feature):
- >>> status: str = ""
- >>> explanation: str = ""
- >>>
- >>> PROMPT = "A 'user' is a human trying to find the best mobile plan.... "
- >>> MODEL = "claude-3-opus-20240229"
- >>>
- >>> chain = (
- >>> DataChain.from_storage("s3://my-bucket/my")
- >>> .filter(C.name.glob("*.txt"))
- >>> .limit(5)
- >>> .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
- >>> .map(
- >>> rating=lambda claude: Rating(
- >>> **(json.loads(claude.content[0].text) if claude.content else {})
- >>> ),
- >>> output=Rating,
- >>> )
- >>> chain.save("ratings")
- >>> print(chain)
+ See Also:
+ `DataChain.from_storage("s3://my-bucket/my-dir/")` - reading unstructured
+ data files from storages such as S3, gs or Azure ADLS.
+
+ `DataChain.save("name")` - saving to a dataset.
+
+ `DataChain.from_dataset("name")` - reading from a dataset.
+
+ `DataChain.from_features(fib=[1, 2, 3, 5, 8])` - generating from a values.
+
+
+ Example:
+ ```py
+ from datachain import DataChain, Feature
+ from datachain.lib.claude import claude_processor
+
+ class Rating(Feature):
+ status: str = ""
+ explanation: str = ""
+
+ PROMPT = "A 'user' is a human trying to find the best mobile plan.... "
+ MODEL = "claude-3-opus-20240229"
+
+ chain = (
+ DataChain.from_storage("s3://my-bucket/my")
+ .filter(C.name.glob("*.txt"))
+ .limit(5)
+ .map(claude=claude_processor(prompt=PROMPT, model=MODEL))
+ .map(
+ rating=lambda claude: Rating(
+ **(json.loads(claude.content[0].text) if claude.content else {})
+ ),
+ output=Rating,
+ )
+ chain.save("ratings")
+ print(chain)
+ ```
  """
 
  DEFAULT_FILE_RECORD: ClassVar[dict] = {
@@ -119,8 +130,7 @@ class DataChain(DatasetQuery):
 
  def __init__(self, *args, **kwargs):
  """This method needs to be redefined as a part of Dataset and DacaChin
- decoupling
- """
+ decoupling."""
  super().__init__(
  *args,
  **kwargs,
@@ -133,6 +143,16 @@ class DataChain(DatasetQuery):
  else:
  self.signals_schema = SignalSchema.from_column_types(self.column_types)
 
+ @property
+ def schema(self):
+ return self.signals_schema.values if self.signals_schema else None
+
+ def print_schema(self):
+ self.signals_schema.print_tree()
+
+ def create_model(self, name: str) -> type[Feature]:
+ return self.signals_schema.create_model(name)
+
  def settings(
  self, cache=None, batch=None, parallel=None, workers=None, min_task_size=None
  ) -> "Self":
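The hunk above adds schema introspection helpers (`schema`, `print_schema`, `create_model`). A minimal sketch of how they might be used, assuming a chain built with `from_features`; the `fib` signal and model name are illustrative:

```py
from datachain import DataChain

dc = DataChain.from_features(fib=[1, 2, 3, 5, 8])

dc.print_schema()                       # pretty-print the signal tree
print(dc.schema)                        # mapping of signal names to types, or None
FibModel = dc.create_model("FibModel")  # materialize the schema as a Feature model
```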
@@ -141,29 +161,28 @@ class DataChain(DatasetQuery):
  This function changes specified settings without changing not specified ones.
  It returns chain, so, it can be chained later with next operation.
 
- Parameters
- ----------
- cache : data caching (default=False)
- batch : size of the batch (default=1000)
- parallel : number of thread for processors. True is a special value to
- enable all available CPUs (default=1)
- workers : number of distributed workers. Only for Studio mode. (default=1)
- min_task_size : minimum number of tasks (default=1)
-
- Examples
- --------
-
- >>> chain = (
- >>> chain
- >>> .settings(cache=True, parallel=8)
- >>> .map(laion=process_webdataset(spec=WDSLaion), params="file")
- >>> )
+ Parameters:
+ cache : data caching (default=False)
+ batch : size of the batch (default=1000)
+ parallel : number of thread for processors. True is a special value to
+ enable all available CPUs (default=1)
+ workers : number of distributed workers. Only for Studio mode. (default=1)
+ min_task_size : minimum number of tasks (default=1)
+
+ Example:
+ ```py
+ chain = (
+ chain
+ .settings(cache=True, parallel=8)
+ .map(laion=process_webdataset(spec=WDSLaion), params="file")
+ )
+ ```
  """
  self._settings.add(Settings(cache, batch, parallel, workers, min_task_size))
  return self
 
  def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
- """Reset all settings to default values"""
+ """Reset all settings to default values."""
  self._settings = settings if settings else Settings()
  return self
 
@@ -183,40 +202,40 @@ class DataChain(DatasetQuery):
  def from_storage(
  cls,
  path,
+ *,
  type: Literal["binary", "text", "image"] = "binary",
+ catalog: Optional["Catalog"] = None,
+ recursive: Optional[bool] = True,
  anon: bool = False,
- ) -> "DataChain":
- """Get data from a storage as a list of file with all file attributes.
- It returns the chain itself as usual.
-
- Parameters
- ----------
- path : storage URI with directory. URI must start with storage prefix such
- as `s3://`, `gs://`, `az://` or "file:///"
- type : read file as "binary", "text", or "image" data. Default is "binary".
- anon : use anonymous mode to access the storage.
+ ) -> "Self":
+ """Get data from a storage as a list of file with all file attributes. It
+ returns the chain itself as usual.
 
- Examples
- --------
+ Parameters:
+ path : storage URI with directory. URI must start with storage prefix such
+ as `s3://`, `gs://`, `az://` or "file:///"
+ type : read file as "binary", "text", or "image" data. Default is "binary".
+ recursive : search recursively for the given path.
+ anon : use anonymous mode to access the storage.
 
- >>> chain = DataChain.from_storage("s3://my-bucket/my-dir")
+ Example:
+ ```py
+ chain = DataChain.from_storage("s3://my-bucket/my-dir")
+ ```
  """
  func = get_file(type)
- return DataChain(path, anon=anon).map(file=func)
+ return cls(path, catalog=catalog, recursive=recursive, anon=anon).map(file=func)
 
  @classmethod
  def from_dataset(cls, name: str, version: Optional[int] = None) -> "DataChain":
  """Get data from dataset. It returns the chain itself.
 
- Parameters
- ----------
- name : dataset name
- version : dataset version
-
- Examples
- --------
+ Parameters:
+ name : dataset name
+ version : dataset version
 
- >>> chain = DataChain.from_dataset("my_cats")
+ Examples:
+ >>> chain = DataChain.from_dataset("my_cats")
  """
  return DataChain(name=name, version=version)
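A small sketch of the reworked `from_storage` signature (options are now keyword-only) together with `from_dataset`; the bucket and dataset names are placeholders:

```py
from datachain import DataChain

chain = DataChain.from_storage(
    "s3://my-bucket/my-dir/",
    type="text",
    recursive=True,
    anon=True,          # anonymous access, e.g. for public buckets
)
chain.save("my_files")  # persist as a named dataset

chain = DataChain.from_dataset("my_files")  # re-open it later by name
```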
 
@@ -228,37 +247,44 @@ class DataChain(DatasetQuery):
  anon: bool = False,
  spec: Optional[FeatureType] = None,
  schema_from: Optional[str] = "auto",
+ object_name: Optional[str] = "csv",
+ model_name: Optional[str] = None,
  show_schema: Optional[bool] = False,
  ) -> "DataChain":
  """Get data from CSV. It returns the chain itself.
 
- Parameters
- ----------
- path : storage URI with directory. URI must start with storage prefix such
- as `s3://`, `gs://`, `az://` or "file:///"
- type : read file as "binary", "text", or "image" data. Default is "binary".
- anon : use anonymous mode to access the storage.
- spec : optional Data Model
- schema_from : path to sample to infer spec from
- show_schema : print auto-generated schema
+ Parameters:
+ path : storage URI with directory. URI must start with storage prefix such
+ as `s3://`, `gs://`, `az://` or "file:///"
+ type : read file as "binary", "text", or "image" data. Default is "text".
+ anon : use anonymous mode to access the storage.
+ spec : Data Model for CSV file
+ object_name : generated object column name
+ model_name : generated model name
+ schema_from : path to sample to infer spec from
+ show_schema : print auto-generated schema
 
- Examples
- --------
+ Examples:
+ infer model from the first two lines (header + data)
+ >>> chain = DataChain.from_csv("gs://csv")
 
- >>> chain = DataChain.from_csv("gs://csv")
+ use a particular data model
+ >>> chain = DataChain.from_csv("gs://csv"i, spec=MyModel)
  """
  if schema_from == "auto":
  schema_from = path
 
  chain = DataChain.from_storage(path=path, type=type, anon=anon)
- return chain.gen(
- csv=read_meta(
+ signal_dict = {
+ object_name: read_meta(
  schema_from=schema_from,
  meta_type="csv",
  spec=spec,
+ model_name=model_name,
  show_schema=show_schema,
  )
- )
+ }
+ return chain.gen(**signal_dict) # type: ignore[misc, arg-type]
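A brief sketch of the new `object_name` and `model_name` options added above; the bucket path and names are placeholders:

```py
chain = DataChain.from_csv(
    "gs://my-bucket/tables/",
    object_name="row",      # signals are grouped under "row" instead of the default "csv"
    model_name="SalesRow",  # name for the auto-generated data model
    show_schema=True,       # print the inferred schema
)
```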
 
  @classmethod
  def from_json(
@@ -269,50 +295,104 @@ class DataChain(DatasetQuery):
  spec: Optional[FeatureType] = None,
  schema_from: Optional[str] = "auto",
  jmespath: Optional[str] = None,
+ object_name: Optional[str] = None,
+ model_name: Optional[str] = None,
  show_schema: Optional[bool] = False,
+ meta_type: Optional[str] = "json",
  ) -> "DataChain":
- """Get data from CSV. It returns the chain itself.
+ """Get data from JSON. It returns the chain itself.
+
+ Parameters:
+ path : storage URI with directory. URI must start with storage prefix such
+ as `s3://`, `gs://`, `az://` or "file:///"
+ type : read file as "binary", "text", or "image" data. Default is "binary".
+ anon : use anonymous mode to access the storage.
+ spec : optional Data Model
+ schema_from : path to sample to infer spec from
+ object_name : generated object column name
+ model_name : generated model name
+ show_schema : print auto-generated schema
+ jmespath : JMESPATH expression to reduce JSON
+
+ Examples:
+ infer JSON schema from data, reduce using JMESPATH, print schema
+ >>> chain = DataChain.from_json("gs://json", jmespath="key1.key2")
 
- Parameters
- ----------
- path : storage URI with directory. URI must start with storage prefix such
- as `s3://`, `gs://`, `az://` or "file:///"
- type : read file as "binary", "text", or "image" data. Default is "binary".
- anon : use anonymous mode to access the storage.
- spec : optional Data Model
- schema_from : path to sample to infer spec from
- show_schema : print auto-generated schema
- jmespath : JMESPATH expression to reduce JSON
- name : return object name
- Examples
- --------
-
- >>> chain = DataChain.from_json("gs://json")
+ infer JSON schema from a particular path, print data model
+ >>> chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
  """
  if schema_from == "auto":
  schema_from = path
 
+ def jmespath_to_name(s: str):
+ name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
+ return s[:name_end]
+
+ if (not object_name) and jmespath:
+ object_name = jmespath_to_name(jmespath)
+ if not object_name:
+ object_name = "json"
  chain = DataChain.from_storage(path=path, type=type, anon=anon)
- return chain.gen(
- json=read_meta(
+ signal_dict = {
+ object_name: read_meta(
  schema_from=schema_from,
- meta_type="json",
+ meta_type=meta_type,
  spec=spec,
+ model_name=model_name,
  show_schema=show_schema,
  jmespath=jmespath,
  )
+ }
+ return chain.gen(**signal_dict) # type: ignore[arg-type]
+
+ def show_json_schema( # type: ignore[override]
+ self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+ ) -> "DataChain":
+ """Print JSON data model and save it. It returns the chain itself.
+
+ Parameters:
+ jmespath : JMESPATH expression to reduce JSON
+ model_name : generated model name
+
+ Examples:
+ print JSON schema and save to column "meta_from":
+ >>> uri = "gs://datachain-demo/coco2017/annotations_captions/"
+ >>> chain = DataChain.from_storage(uri)
+ >>> chain = chain.show_json_schema()
+ >>> chain.save()
+ """
+ return self.map(
+ meta_schema=lambda file: read_schema(
+ file, data_type="json", expr=jmespath, model_name=model_name
+ ),
+ output=str,
+ )
+
+ def show_jsonl_schema( # type: ignore[override]
+ self, jmespath: Optional[str] = None, model_name: Optional[str] = None
+ ) -> "DataChain":
+ """Print JSON data model and save it. It returns the chain itself.
+
+ Parameters:
+ jmespath : JMESPATH expression to reduce JSON
+ model_name : generated model name
+ """
+ return self.map(
+ meta_schema=lambda file: read_schema(
+ file, data_type="jsonl", expr=jmespath, model_name=model_name
+ ),
+ output=str,
  )
 
  def save( # type: ignore[override]
  self, name: Optional[str] = None, version: Optional[int] = None
  ) -> "DataChain":
- """Save to a Dataset. It returns the chain itself
+ """Save to a Dataset. It returns the chain itself.
 
- Parameters
- ----------
- name : dataset name. Empty name saves to a temporary dataset that will be
- removed after process ends. Temp dataset are useful for optimization.
- version : version of a dataset. Default - the last version that exist.
+ Parameters:
+ name : dataset name. Empty name saves to a temporary dataset that will be
+ removed after process ends. Temp dataset are useful for optimization.
+ version : version of a dataset. Default - the last version that exist.
  """
  schema = self.signals_schema.serialize()
  return super().save(name=name, version=version, feature_schema=schema)
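A short sketch tying the new `from_json` options to `save`. Because the object name is derived from the JMESPath expression when not given explicitly, an expression like `annotations[*]` would place its signals under `annotations`; the bucket, expression, and dataset name below are illustrative:

```py
chain = DataChain.from_json(
    "gs://my-bucket/annotations/",
    jmespath="annotations[*]",  # object name defaults to "annotations"
    model_name="Annotation",
    show_schema=True,
)
chain.save("annotations")       # persist under a named dataset
```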
@@ -333,29 +413,26 @@ class DataChain(DatasetQuery):
  Input-output relationship: 1:1
 
  Parameters:
+ func : Function applied to each row.
+ params : List of column names used as input for the function. Default
+ is taken from function signature.
+ output : Dictionary defining new signals and their corresponding types.
+ Default type is taken from function signature. Default can be also
+ taken from kwargs - **signal_map (see below).
+ If signal name is defined using signal_map (see below) only a single
+ type value can be used.
+ **signal_map : kwargs can be used to define `func` together with it's return
+ signal name in format of `map(my_sign=my_func)`. This helps define
+ signal names and function in a nicer way.
+
+ Examples:
+ Using signal_map and single type in output:
+ >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
+ >>> chain.save("new_dataset")
 
- func : Function applied to each row.
- params : List of column names used as input for the function. Default
- is taken from function signature.
- output : Dictionary defining new signals and their corresponding types. Default
- type is taken from function signature. Default can be also taken from
- kwargs - **signal_map (see below).
- If signal name is defined using signal_map (see below) only a single
- type value can be used.
- **signal_map : kwargs can be used to define `func` together with it's return
- signal name in format of `map(my_sign=my_func)`. This helps define
- signal names and function in a nicer way.
-
- Examples
- --------
-
- Using signal_map and single type in output:
- >>> chain = chain.map(value=lambda name: name[:-4] + ".json", output=str)
- >>> chain.save("new_dataset")
-
- Using func and output as a map:
- >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
- >>> chain.save("new_dataset")
+ Using func and output as a map:
+ >>> chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
+ >>> chain.save("new_dataset")
  """
 
  udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)
@@ -375,9 +452,8 @@ class DataChain(DatasetQuery):
  output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
  **signal_map,
  ) -> "Self":
- """
- Apply a function to each row to create new rows (with potentially new signals).
- The function needs to return a new objects for each of the new rows.
+ """Apply a function to each row to create new rows (with potentially new
+ signals). The function needs to return a new objects for each of the new rows.
  It returns a chain itself with new signals.
 
  Input-output relationship: 1:N
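A hedged sketch of a 1:N `gen` call matching the description above; the generator function below is purely illustrative:

```py
from collections.abc import Iterator

# illustrative 1:N generator: each input row yields `fib` output rows
def spell_out(fib: int) -> Iterator[str]:
    for i in range(fib):
        yield f"{fib}-{i}"

chain = chain.gen(spelled=spell_out, output=str)
```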
@@ -435,7 +511,9 @@ class DataChain(DatasetQuery):
  output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]] = None,
  **signal_map,
  ) -> "Self":
- """This is a batch version of map(). It accepts the same parameters plus an
+ """This is a batch version of map().
+
+ It accepts the same parameters plus an
  additional parameter:
  """
  udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
@@ -455,7 +533,7 @@ class DataChain(DatasetQuery):
  output: Union[None, FeatureType, Sequence[str], dict[str, FeatureType]],
  signal_map,
  ) -> UDFBase:
- is_generator = issubclass(target_class, (Generator, Aggregator, BatchMapper))
+ is_generator = target_class.is_output_batched
  name = self.name or "Unknown"
  sign = UdfSignature.parse(name, signal_map, func, params, output, is_generator)
 
@@ -476,7 +554,7 @@ class DataChain(DatasetQuery):
 
  @detach
  def select(self, *args: str) -> "Self":
- """Select only a specified set of signals"""
+ """Select only a specified set of signals."""
  new_schema = self.signals_schema.resolve(*args)
  columns = new_schema.db_signals()
  chain = super().select(*columns)
@@ -485,7 +563,7 @@ class DataChain(DatasetQuery):
 
  @detach
  def select_except(self, *args: str) -> "Self":
- """Select all the signals expect the specified signals"""
+ """Select all the signals expect the specified signals."""
  new_schema = self.signals_schema.select_except_signals(*args)
  columns = new_schema.db_signals()
  chain = super().select(*columns)
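For context, a hedged sketch of signal selection; the signal names are placeholders:

```py
# keep only the listed signals
small = chain.select("file", "rating")

# or keep everything except one signal
no_claude = chain.select_except("claude")
```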
@@ -494,6 +572,7 @@ class DataChain(DatasetQuery):
 
  def get_values(self, *cols: str) -> Iterator[list]:
  """Iterate over rows, getting feature values and applying reader calls.
+
  If columns are specified - limit them to specified columns.
  """
  for features in self.iterate(*cols):
@@ -504,7 +583,9 @@ class DataChain(DatasetQuery):
  yield item[0]
 
  def iterate(self, *cols: str) -> Iterator[list[FeatureType]]:
- """Iterate over rows. If columns are specified - limit them to specified
+ """Iterate over rows.
+
+ If columns are specified - limit them to specified
  columns.
  """
  chain = self.select(*cols) if cols else self
@@ -563,20 +644,19 @@ class DataChain(DatasetQuery):
  ) -> "Self":
  """Merge two chains based on the specified criteria.
 
- Parameters
- ----------
- right_ds : Chain to join with.
- on : Predicate or list of Predicates to join on. If both chains have the same
- predicates then this predicate is enough for the join. Otherwise,
- `right_on` parameter has to specify the predicates for the other chain.
- right_on: Optional predicate or list of Predicates for the `right_ds` to join.
- inner: Whether to run inner join or outer join. Default is False.
- rname: name prefix for conflicting signal names. Default: "{name}_right"
+ Parameters:
+ right_ds : Chain to join with.
+ on : Predicate or list of Predicates to join on. If both chains have the
+ same predicates then this predicate is enough for the join. Otherwise,
+ `right_on` parameter has to specify the predicates for the other chain.
+ right_on: Optional predicate or list of Predicates
+ for the `right_ds` to join.
+ inner (bool): Whether to run inner join or outer join.
+ rname (str): name prefix for conflicting signal names.
 
  Examples:
- >>> meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
- right_on=(C.name, C.pq__index))
-
+ >>> meta = meta_emd.merge(meta_pq, on=(C.name, C.emd__index),
+ right_on=(C.name, C.pq__index))
  """
  if on is None:
  raise DatasetMergeError(["None"], None, "'on' must be specified")
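A small sketch of the `inner` flag described above, reusing the chains from the inline example; a single shared predicate is enough when both chains carry it:

```py
# inner join keeps only rows that have a match in both chains
merged = meta_emd.merge(meta_pq, on=C.name, inner=True)
```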
@@ -599,7 +679,7 @@ class DataChain(DatasetQuery):
  raise DatasetMergeError(
  on,
  right_on,
- f"'right_on' must be 'str' or 'Sequence' object"
+ "'right_on' must be 'str' or 'Sequence' object"
  f" but got type '{right_on}'",
  )
 
@@ -616,7 +696,7 @@ class DataChain(DatasetQuery):
  raise DatasetMergeError(
  on,
  right_on,
- f"'on' and 'right_on' must have the same number of columns in db'."
+ "'on' and 'right_on' must have the same number of columns in db'."
  f" on -> {on_str}, right_on -> {right_on_str}",
  )
  else:
@@ -654,7 +734,7 @@ class DataChain(DatasetQuery):
 
  @classmethod
  def from_pandas( # type: ignore[override]
- cls, df, name: str = "", session: Optional[Session] = None
+ cls, df: "pd.DataFrame", name: str = "", session: Optional[Session] = None
  ) -> "DataChain":
  """Generate chain from pandas data-frame."""
  fr_map = {col.lower(): df[col].tolist() for col in df.columns}
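A minimal sketch of `from_pandas`; the frame contents are placeholders. As the surrounding code shows, column names are lower-cased and must be valid identifiers that do not clash with the default file schema:

```py
import pandas as pd
from datachain import DataChain

df = pd.DataFrame({"City": ["Berlin", "Tokyo"], "Population": [3_700_000, 14_000_000]})
chain = DataChain.from_pandas(df)  # signals become "city" and "population"
```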
@@ -664,7 +744,7 @@ class DataChain(DatasetQuery):
  raise DatasetPrepareError(
  name,
  f"import from pandas error - column '{column}' conflicts with"
- f" default schema",
+ " default schema",
  )
  if not column.isidentifier():
  raise DatasetPrepareError(
@@ -674,6 +754,131 @@ class DataChain(DatasetQuery):
 
  return cls.from_features(name, session, **fr_map)
 
+ def parse_tabular(
+ self,
+ output: Optional[dict[str, FeatureType]] = None,
+ **kwargs,
+ ) -> "DataChain":
+ """Generate chain from list of tabular files.
+
+ Parameters:
+ output : Dictionary defining column names and their corresponding types.
+ kwargs : Parameters to pass to pyarrow.dataset.dataset.
+
+ Examples:
+ Reading a json lines file:
+ >>> dc = DataChain.from_storage("s3://mybucket/file.jsonl")
+ >>> dc = dc.parse_tabular(format="json")
+
+ Reading a filtered list of files as a dataset:
+ >>> dc = DataChain.from_storage("s3://mybucket")
+ >>> dc = dc.filter(C("file.name").glob("*.jsonl"))
+ >>> dc = dc.parse_tabular(format="json")
+ """
+ from pyarrow import unify_schemas
+ from pyarrow.dataset import dataset
+
+ from datachain.lib.arrow import ArrowGenerator, Source, schema_to_output
+
+ schema = None
+ if output:
+ output = {"source": Source} | output
+ else:
+ schemas = []
+ for row in self.select("file").iterate():
+ file = row[0]
+ ds = dataset(file.get_path(), filesystem=file.get_fs(), **kwargs) # type: ignore[union-attr]
+ schemas.append(ds.schema)
+ if not schemas:
+ msg = "error parsing tabular data schema - found no files to parse"
+ raise DatasetPrepareError(self.name, msg)
+ schema = unify_schemas(schemas)
+ try:
+ output = schema_to_output(schema)
+ print(f"Inferred tabular data schema: {output}")
+ except ValueError as e:
+ raise DatasetPrepareError(self.name, e) from e
+
+ return self.gen(ArrowGenerator(schema, **kwargs), output=output)
+
+ def parse_csv(
+ self,
+ delimiter: str = ",",
+ header: bool = True,
+ column_names: Optional[list[str]] = None,
+ output: Optional[dict[str, FeatureType]] = None,
+ ) -> "DataChain":
+ """Generate chain from list of csv files.
+
+ Parameters:
+ delimiter : Character for delimiting columns.
+ header : Whether the files include a header row.
+ column_names : Column names if no header. Implies `header = False`.
+ output : Dictionary defining column names and their corresponding types.
+
+ Examples:
+ Reading a csv file:
+ >>> dc = DataChain.from_storage("s3://mybucket/file.csv")
+ >>> dc = dc.parse_tabular(format="csv")
+
+ Reading a filtered list of csv files as a dataset:
+ >>> dc = DataChain.from_storage("s3://mybucket")
+ >>> dc = dc.filter(C("file.name").glob("*.csv"))
+ >>> dc = dc.parse_tabular()
+ """
+ from pyarrow.csv import ParseOptions, ReadOptions
+ from pyarrow.dataset import CsvFileFormat
+
+ if column_names and output:
+ msg = "error parsing csv - only one of column_names or output is allowed"
+ raise DatasetPrepareError(self.name, msg)
+
+ if not header and not column_names:
+ if output:
+ column_names = list(output.keys())
+ else:
+ msg = "error parsing csv - provide column_names or output if no header"
+ raise DatasetPrepareError(self.name, msg)
+
+ parse_options = ParseOptions(delimiter=delimiter)
+ read_options = ReadOptions(column_names=column_names)
+ format = CsvFileFormat(parse_options=parse_options, read_options=read_options)
+ return self.parse_tabular(output=output, format=format)
+
+ def parse_parquet(
+ self,
+ partitioning: Any = "hive",
+ output: Optional[dict[str, FeatureType]] = None,
+ ) -> "DataChain":
+ """Generate chain from list of parquet files.
+
+ Parameters:
+ partitioning : Any pyarrow partitioning schema.
+ output : Dictionary defining column names and their corresponding types.
+
+ Examples:
+ Reading a single file:
+ >>> dc = DataChain.from_storage("s3://mybucket/file.parquet")
+ >>> dc = dc.parse_tabular()
+
+ Reading a partitioned dataset from a directory:
+ >>> dc = DataChain.from_storage("path/to/dir")
+ >>> dc = dc.parse_tabular()
+
+ Reading a filtered list of files as a dataset:
+ >>> dc = DataChain.from_storage("s3://mybucket")
+ >>> dc = dc.filter(C("file.name").glob("*.parquet"))
+ >>> dc = dc.parse_tabular()
+
+ Reading a filtered list of partitions as a dataset:
+ >>> dc = DataChain.from_storage("s3://mybucket")
+ >>> dc = dc.filter(C("file.parent").glob("*month=1*"))
+ >>> dc = dc.parse_tabular()
+ """
+ return self.parse_tabular(
+ output=output, format="parquet", partitioning=partitioning
+ )
+
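Complementing the inline examples, a hedged sketch of reading headerless CSV files with explicit column types; the bucket and column names are placeholders. With `header=False` and an `output` mapping, the column names are taken from the mapping's keys, as the code above shows:

```py
dc = DataChain.from_storage("s3://mybucket")
dc = dc.filter(C("file.name").glob("*.csv"))
dc = dc.parse_csv(
    header=False,
    output={"first_name": str, "age": int},  # names and types for the headerless columns
)
```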
  @classmethod
  def create_empty(
  cls,
@@ -683,17 +888,13 @@ class DataChain(DatasetQuery):
  """Create empty chain. Returns a chain. This method is used for programmatically
  generating a chains in contrast of reading data from storages or other sources.
 
- Parameters
- ----------
-
- to_insert : records (or a single record) to insert. Each record is a dictionary
- of signals and theirs values.
-
- Examples
- --------
+ Parameters:
+ to_insert : records (or a single record) to insert. Each record is
+ a dictionary of signals and theirs values.
 
- >>> empty = DataChain.create_empty()
- >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
+ Examples:
+ >>> empty = DataChain.create_empty()
+ >>> single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
  """
  session = Session.get(session)
  dsr = cls.create_empty_record(session=session)
@@ -740,10 +941,12 @@ class DataChain(DatasetQuery):
  @detach
  def chunk(self, index: int, total: int) -> "DataChain":
  """Split a query into smaller chunks for e.g. parallelization.
- Example:
+
+ Examples:
  >>> dc = DataChain(...)
  >>> chunk_1 = dc._chunk(0, 2)
  >>> chunk_2 = dc._chunk(1, 2)
+
  Note:
  Bear in mind that `index` is 0-indexed but `total` isn't.
  Use 0/3, 1/3 and 2/3, not 1/3, 2/3 and 3/3.
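Finally, a hedged sketch of splitting work across workers with `chunk`; the worker count and environment variable are illustrative:

```py
import os

total_workers = 3
worker_id = int(os.environ.get("WORKER_ID", "0"))  # 0-based worker index

# each worker processes its own slice: chunk(0, 3), chunk(1, 3), chunk(2, 3)
my_slice = dc.chunk(worker_id, total_workers)
```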