datachain 0.13.1__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

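The headline change in this release, visible throughout the diff below, is that the `DataChain.from_*` constructors (`from_storage`, `from_dataset`, `from_json`, `from_values`, `from_pandas`, `from_hf`, `from_csv`, `from_parquet`, `from_records`) and the `datasets`/`listings` classmethods become thin shims that emit a `DeprecationWarning` and delegate to new module-level functions. A minimal migration sketch, assuming the top-level `datachain` namespace used in the updated docstrings; the bucket path is a placeholder, not taken from the package:

```py
import datachain as dc

# 0.13.x style (still works in 0.14.x, but now emits a DeprecationWarning):
# chain = dc.DataChain.from_storage("s3://my-bucket/my-dir/")

# 0.14.x style: call the top-level function instead of the classmethod.
chain = (
    dc.from_storage("s3://my-bucket/my-dir/")
    .filter(dc.C("file.name").glob("*.jsonl"))
    .parse_tabular(format="json")
)
```

The rest of the diff is largely mechanical: helpers previously defined in this module (`Sys`, `MergeColType`, `OutputType`, `DatasetMergeError`, `DatasetPrepareError`, the `resolve_columns` decorator) now come from a sibling `.utils` module, and docstring examples switch from `DataChain.from_storage(...)` to `dc.from_storage(...)`.
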
@@ -1,10 +1,9 @@
1
1
  import copy
2
2
  import os
3
3
  import os.path
4
- import re
5
4
  import sys
5
+ import warnings
6
6
  from collections.abc import Iterator, Sequence
7
- from functools import wraps
8
7
  from typing import (
9
8
  IO,
10
9
  TYPE_CHECKING,
@@ -22,7 +21,6 @@ from typing import (
22
21
  import orjson
23
22
  import sqlalchemy
24
23
  from pydantic import BaseModel
25
- from sqlalchemy.sql.functions import GenericFunction
26
24
  from tqdm import tqdm
27
25
 
28
26
  from datachain.dataset import DatasetRecord
@@ -30,22 +28,13 @@ from datachain.func import literal
30
28
  from datachain.func.base import Function
31
29
  from datachain.func.func import Func
32
30
  from datachain.lib.convert.python_to_sql import python_to_sql
33
- from datachain.lib.convert.values_to_tuples import values_to_tuples
34
31
  from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
35
- from datachain.lib.dataset_info import DatasetInfo
36
32
  from datachain.lib.file import (
37
33
  EXPORT_FILES_MAX_THREADS,
38
34
  ArrowRow,
39
- File,
40
35
  FileExporter,
41
- FileType,
42
- get_file_type,
43
36
  )
44
37
  from datachain.lib.file import ExportPlacement as FileExportPlacement
45
- from datachain.lib.listing import get_file_info, get_listing, list_bucket, ls
46
- from datachain.lib.listing_info import ListingInfo
47
- from datachain.lib.meta_formats import read_meta
48
- from datachain.lib.model_store import ModelStore
49
38
  from datachain.lib.settings import Settings
50
39
  from datachain.lib.signal_schema import SignalSchema
51
40
  from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
@@ -57,124 +46,29 @@ from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
57
46
  from datachain.sql.functions import path as pathfunc
58
47
  from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
59
48
 
60
- if TYPE_CHECKING:
61
- import pandas as pd
62
- from pyarrow import DataType as ArrowDataType
63
- from typing_extensions import Concatenate, ParamSpec, Self
64
-
65
- from datachain.lib.hf import HFDatasetType
66
-
67
- P = ParamSpec("P")
49
+ from .utils import (
50
+ DatasetMergeError,
51
+ DatasetPrepareError,
52
+ MergeColType,
53
+ OutputType,
54
+ Sys,
55
+ _get_merge_error_str,
56
+ _validate_merge_on,
57
+ resolve_columns,
58
+ )
68
59
 
69
60
  C = Column
70
61
 
71
62
  _T = TypeVar("_T")
72
- D = TypeVar("D", bound="DataChain")
73
63
  UDFObjT = TypeVar("UDFObjT", bound=UDFBase)
74
64
 
75
65
  DEFAULT_PARQUET_CHUNK_SIZE = 100_000
76
66
 
67
+ if TYPE_CHECKING:
68
+ import pandas as pd
69
+ from typing_extensions import ParamSpec, Self
77
70
 
78
- def resolve_columns(
79
- method: "Callable[Concatenate[D, P], D]",
80
- ) -> "Callable[Concatenate[D, P], D]":
81
- """Decorator that resolvs input column names to their actual DB names. This is
82
- specially important for nested columns as user works with them by using dot
83
- notation e.g (file.name) but are actually defined with default delimiter
84
- in DB, e.g file__name.
85
- If there are any sql functions in arguments, they will just be transferred as is
86
- to a method.
87
- """
88
-
89
- @wraps(method)
90
- def _inner(self: D, *args: "P.args", **kwargs: "P.kwargs") -> D:
91
- resolved_args = self.signals_schema.resolve(
92
- *[arg for arg in args if not isinstance(arg, GenericFunction)] # type: ignore[arg-type]
93
- ).db_signals()
94
-
95
- for idx, arg in enumerate(args):
96
- if isinstance(arg, GenericFunction):
97
- resolved_args.insert(idx, arg) # type: ignore[arg-type]
98
-
99
- return method(self, *resolved_args, **kwargs)
100
-
101
- return _inner
102
-
103
-
104
- class DatasetPrepareError(DataChainParamsError): # noqa: D101
105
- def __init__(self, name, msg, output=None): # noqa: D107
106
- name = f" '{name}'" if name else ""
107
- output = f" output '{output}'" if output else ""
108
- super().__init__(f"Dataset{name}{output} processing prepare error: {msg}")
109
-
110
-
111
- class DatasetFromValuesError(DataChainParamsError): # noqa: D101
112
- def __init__(self, name, msg): # noqa: D107
113
- name = f" '{name}'" if name else ""
114
- super().__init__(f"Dataset{name} from values error: {msg}")
115
-
116
-
117
- MergeColType = Union[str, Function, sqlalchemy.ColumnElement]
118
-
119
-
120
- def _validate_merge_on(
121
- on: Union[MergeColType, Sequence[MergeColType]],
122
- ds: "DataChain",
123
- ) -> Sequence[MergeColType]:
124
- if isinstance(on, (str, sqlalchemy.ColumnElement)):
125
- return [on]
126
- if isinstance(on, Function):
127
- return [on.get_column(table=ds._query.table)]
128
- if isinstance(on, Sequence):
129
- return [
130
- c.get_column(table=ds._query.table) if isinstance(c, Function) else c
131
- for c in on
132
- ]
133
-
134
-
135
- def _get_merge_error_str(col: MergeColType) -> str:
136
- if isinstance(col, str):
137
- return col
138
- if isinstance(col, Function):
139
- return f"{col.name}()"
140
- if isinstance(col, sqlalchemy.Column):
141
- return col.name.replace(DEFAULT_DELIMITER, ".")
142
- if isinstance(col, sqlalchemy.ColumnElement) and hasattr(col, "name"):
143
- return f"{col.name} expression"
144
- return str(col)
145
-
146
-
147
- class DatasetMergeError(DataChainParamsError): # noqa: D101
148
- def __init__( # noqa: D107
149
- self,
150
- on: Union[MergeColType, Sequence[MergeColType]],
151
- right_on: Optional[Union[MergeColType, Sequence[MergeColType]]],
152
- msg: str,
153
- ):
154
- def _get_str(
155
- on: Union[MergeColType, Sequence[MergeColType]],
156
- ) -> str:
157
- if not isinstance(on, Sequence):
158
- return str(on) # type: ignore[unreachable]
159
- return ", ".join([_get_merge_error_str(col) for col in on])
160
-
161
- on_str = _get_str(on)
162
- right_on_str = (
163
- ", right_on='" + _get_str(right_on) + "'"
164
- if right_on and isinstance(right_on, Sequence)
165
- else ""
166
- )
167
- super().__init__(f"Merge error on='{on_str}'{right_on_str}: {msg}")
168
-
169
-
170
- OutputType = Union[None, DataType, Sequence[str], dict[str, DataType]]
171
-
172
-
173
- class Sys(DataModel):
174
- """Model for internal DataChain signals `id` and `rand`."""
175
-
176
- id: int
177
- rand: int
71
+ P = ParamSpec("P")
178
72
 
179
73
 
180
74
  class DataChain:
@@ -190,22 +84,22 @@ class DataChain:
190
84
  underlyind library `Pydantic`.
191
85
 
192
86
  See Also:
193
- `DataChain.from_storage("s3://my-bucket/my-dir/")` - reading unstructured
87
+ `from_storage("s3://my-bucket/my-dir/")` - reading unstructured
194
88
  data files from storages such as S3, gs or Azure ADLS.
195
89
 
196
90
  `DataChain.save("name")` - saving to a dataset.
197
91
 
198
- `DataChain.from_dataset("name")` - reading from a dataset.
92
+ `from_dataset("name")` - reading from a dataset.
199
93
 
200
- `DataChain.from_values(fib=[1, 2, 3, 5, 8])` - generating from values.
94
+ `from_values(fib=[1, 2, 3, 5, 8])` - generating from values.
201
95
 
202
- `DataChain.from_pandas(pd.DataFrame(...))` - generating from pandas.
96
+ `from_pandas(pd.DataFrame(...))` - generating from pandas.
203
97
 
204
- `DataChain.from_json("file.json")` - generating from json.
98
+ `from_json("file.json")` - generating from json.
205
99
 
206
- `DataChain.from_csv("file.csv")` - generating from csv.
100
+ `from_csv("file.csv")` - generating from csv.
207
101
 
208
- `DataChain.from_parquet("file.parquet")` - generating from parquet.
102
+ `from_parquet("file.parquet")` - generating from parquet.
209
103
 
210
104
  Example:
211
105
  ```py
@@ -213,8 +107,7 @@ class DataChain:
213
107
 
214
108
  from mistralai.client import MistralClient
215
109
  from mistralai.models.chat_completion import ChatMessage
216
-
217
- from datachain import DataChain, Column
110
+ import datachain as dc
218
111
 
219
112
  PROMPT = (
220
113
  "Was this bot dialog successful? "
@@ -225,7 +118,7 @@ class DataChain:
225
118
  api_key = os.environ["MISTRAL_API_KEY"]
226
119
 
227
120
  chain = (
228
- DataChain.from_storage("gs://datachain-demo/chatbot-KiT/")
121
+ dc.from_storage("gs://datachain-demo/chatbot-KiT/")
229
122
  .limit(5)
230
123
  .settings(cache=True, parallel=5)
231
124
  .map(
@@ -408,246 +301,57 @@ class DataChain:
408
301
  self._settings = settings if settings else Settings()
409
302
  return self
410
303
 
411
- def reset_schema(self, signals_schema: SignalSchema) -> "Self": # noqa: D102
304
+ def reset_schema(self, signals_schema: SignalSchema) -> "Self":
412
305
  self.signals_schema = signals_schema
413
306
  return self
414
307
 
415
- def add_schema(self, signals_schema: SignalSchema) -> "Self": # noqa: D102
308
+ def add_schema(self, signals_schema: SignalSchema) -> "Self":
416
309
  self.signals_schema |= signals_schema
417
310
  return self
418
311
 
419
312
  @classmethod
420
313
  def from_storage(
421
314
  cls,
422
- uri: Union[str, os.PathLike[str]],
423
- *,
424
- type: FileType = "binary",
425
- session: Optional[Session] = None,
426
- settings: Optional[dict] = None,
427
- in_memory: bool = False,
428
- recursive: Optional[bool] = True,
429
- object_name: str = "file",
430
- update: bool = False,
431
- anon: bool = False,
432
- client_config: Optional[dict] = None,
433
- ) -> "Self":
434
- """Get data from a storage as a list of file with all file attributes.
435
- It returns the chain itself as usual.
436
-
437
- Parameters:
438
- uri : storage URI with directory. URI must start with storage prefix such
439
- as `s3://`, `gs://`, `az://` or "file:///"
440
- type : read file as "binary", "text", or "image" data. Default is "binary".
441
- recursive : search recursively for the given path.
442
- object_name : Created object column name.
443
- update : force storage reindexing. Default is False.
444
- anon : If True, we will treat cloud bucket as public one
445
- client_config : Optional client configuration for the storage client.
446
-
447
- Example:
448
- Simple call from s3
449
- ```py
450
- chain = DataChain.from_storage("s3://my-bucket/my-dir")
451
- ```
452
-
453
- With AWS S3-compatible storage
454
- ```py
455
- chain = DataChain.from_storage(
456
- "s3://my-bucket/my-dir",
457
- client_config = {"aws_endpoint_url": "<minio-endpoint-url>"}
458
- )
459
- ```
460
-
461
- Pass existing session
462
- ```py
463
- session = Session.get()
464
- chain = DataChain.from_storage("s3://my-bucket/my-dir", session=session)
465
- ```
466
- """
467
- file_type = get_file_type(type)
468
-
469
- if anon:
470
- client_config = (client_config or {}) | {"anon": True}
471
- session = Session.get(session, client_config=client_config, in_memory=in_memory)
472
- cache = session.catalog.cache
473
- client_config = session.catalog.client_config
315
+ *args,
316
+ **kwargs,
317
+ ) -> "DataChain":
318
+ from .storage import from_storage
474
319
 
475
- list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
476
- uri, session, update=update
320
+ warnings.warn(
321
+ "Class method `from_storage` is deprecated. "
322
+ "Use `from_storage` function instead from top_module.",
323
+ DeprecationWarning,
324
+ stacklevel=2,
477
325
  )
478
-
479
- # ds_name is None if object is a file, we don't want to use cache
480
- # or do listing in that case - just read that single object
481
- if not list_ds_name:
482
- dc = cls.from_values(
483
- session=session,
484
- settings=settings,
485
- in_memory=in_memory,
486
- file=[get_file_info(list_uri, cache, client_config=client_config)],
487
- )
488
- dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
489
- return dc
490
-
491
- if update or not list_ds_exists:
492
- # disable prefetch for listing, as it pre-downloads all files
493
- (
494
- cls.from_records(
495
- DataChain.DEFAULT_FILE_RECORD,
496
- session=session,
497
- settings=settings,
498
- in_memory=in_memory,
499
- )
500
- .settings(prefetch=0)
501
- .gen(
502
- list_bucket(list_uri, cache, client_config=client_config),
503
- output={f"{object_name}": File},
504
- )
505
- .save(list_ds_name, listing=True)
506
- )
507
-
508
- dc = cls.from_dataset(list_ds_name, session=session, settings=settings)
509
- dc.signals_schema = dc.signals_schema.mutate({f"{object_name}": file_type})
510
-
511
- return ls(dc, list_path, recursive=recursive, object_name=object_name)
326
+ return from_storage(*args, **kwargs)
512
327
 
513
328
  @classmethod
514
- def from_dataset(
515
- cls,
516
- name: str,
517
- version: Optional[int] = None,
518
- session: Optional[Session] = None,
519
- settings: Optional[dict] = None,
520
- fallback_to_studio: bool = True,
521
- ) -> "Self":
522
- """Get data from a saved Dataset. It returns the chain itself.
523
- If dataset or version is not found locally, it will try to pull it from Studio.
524
-
525
- Parameters:
526
- name : dataset name
527
- version : dataset version
528
- session : Session to use for the chain.
529
- settings : Settings to use for the chain.
530
- fallback_to_studio : Try to pull dataset from Studio if not found locally.
531
- Default is True.
532
-
533
- Example:
534
- ```py
535
- chain = DataChain.from_dataset("my_cats")
536
- ```
537
-
538
- ```py
539
- chain = DataChain.from_dataset("my_cats", fallback_to_studio=False)
540
- ```
541
-
542
- ```py
543
- chain = DataChain.from_dataset("my_cats", version=1)
544
- ```
545
-
546
- ```py
547
- session = Session.get(client_config={"aws_endpoint_url": "<minio-url>"})
548
- settings = {
549
- "cache": True,
550
- "parallel": 4,
551
- "workers": 4,
552
- "min_task_size": 1000,
553
- "prefetch": 10,
554
- }
555
- chain = DataChain.from_dataset(
556
- name="my_cats",
557
- version=1,
558
- session=session,
559
- settings=settings,
560
- fallback_to_studio=True,
561
- )
562
- ```
563
- """
564
- from datachain.telemetry import telemetry
565
-
566
- query = DatasetQuery(
567
- name=name,
568
- version=version,
569
- session=session,
570
- indexing_column_types=File._datachain_column_types,
571
- fallback_to_studio=fallback_to_studio,
329
+ def from_dataset(cls, *args, **kwargs) -> "DataChain":
330
+ from .datasets import from_dataset
331
+
332
+ warnings.warn(
333
+ "Class method `from_dataset` is deprecated. "
334
+ "Use `from_dataset` function instead from top_module.",
335
+ DeprecationWarning,
336
+ stacklevel=2,
572
337
  )
573
- telemetry.send_event_once("class", "datachain_init", name=name, version=version)
574
- if settings:
575
- _settings = Settings(**settings)
576
- else:
577
- _settings = Settings()
578
-
579
- signals_schema = SignalSchema({"sys": Sys})
580
- if query.feature_schema:
581
- signals_schema |= SignalSchema.deserialize(query.feature_schema)
582
- else:
583
- signals_schema |= SignalSchema.from_column_types(query.column_types or {})
584
- return cls(query, _settings, signals_schema)
338
+ return from_dataset(*args, **kwargs)
585
339
 
586
340
  @classmethod
587
341
  def from_json(
588
342
  cls,
589
- path: Union[str, os.PathLike[str]],
590
- type: FileType = "text",
591
- spec: Optional[DataType] = None,
592
- schema_from: Optional[str] = "auto",
593
- jmespath: Optional[str] = None,
594
- object_name: Optional[str] = "",
595
- model_name: Optional[str] = None,
596
- format: Optional[str] = "json",
597
- nrows=None,
343
+ *args,
598
344
  **kwargs,
599
345
  ) -> "DataChain":
600
- """Get data from JSON. It returns the chain itself.
601
-
602
- Parameters:
603
- path : storage URI with directory. URI must start with storage prefix such
604
- as `s3://`, `gs://`, `az://` or "file:///"
605
- type : read file as "binary", "text", or "image" data. Default is "text".
606
- spec : optional Data Model
607
- schema_from : path to sample to infer spec (if schema not provided)
608
- object_name : generated object column name
609
- model_name : optional generated model name
610
- format: "json", "jsonl"
611
- jmespath : optional JMESPATH expression to reduce JSON
612
- nrows : optional row limit for jsonl and JSON arrays
613
-
614
- Example:
615
- infer JSON schema from data, reduce using JMESPATH
616
- ```py
617
- chain = DataChain.from_json("gs://json", jmespath="key1.key2")
618
- ```
619
-
620
- infer JSON schema from a particular path
621
- ```py
622
- chain = DataChain.from_json("gs://json_ds", schema_from="gs://json/my.json")
623
- ```
624
- """
625
- if schema_from == "auto":
626
- schema_from = str(path)
346
+ from .json import from_json
627
347
 
628
- def jmespath_to_name(s: str):
629
- name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s) # type: ignore[union-attr]
630
- return s[:name_end]
631
-
632
- if (not object_name) and jmespath:
633
- object_name = jmespath_to_name(jmespath)
634
- if not object_name:
635
- object_name = format
636
- chain = DataChain.from_storage(uri=path, type=type, **kwargs)
637
- signal_dict = {
638
- object_name: read_meta(
639
- schema_from=schema_from,
640
- format=format,
641
- spec=spec,
642
- model_name=model_name,
643
- jmespath=jmespath,
644
- nrows=nrows,
645
- ),
646
- "params": {"file": File},
647
- }
648
- # disable prefetch if nrows is set
649
- settings = {"prefetch": 0} if nrows else {}
650
- return chain.settings(**settings).gen(**signal_dict) # type: ignore[misc, arg-type]
348
+ warnings.warn(
349
+ "Class method `from_json` is deprecated. "
350
+ "Use `from_json` function instead from top_module.",
351
+ DeprecationWarning,
352
+ stacklevel=2,
353
+ )
354
+ return from_json(*args, **kwargs)
651
355
 
652
356
  def explode(
653
357
  self,
@@ -710,81 +414,34 @@ class DataChain:
710
414
  @classmethod
711
415
  def datasets(
712
416
  cls,
713
- session: Optional[Session] = None,
714
- settings: Optional[dict] = None,
715
- in_memory: bool = False,
716
- object_name: str = "dataset",
717
- include_listing: bool = False,
718
- studio: bool = False,
417
+ *args,
418
+ **kwargs,
719
419
  ) -> "DataChain":
720
- """Generate chain with list of registered datasets.
721
-
722
- Args:
723
- session: Optional session instance. If not provided, uses default session.
724
- settings: Optional dictionary of settings to configure the chain.
725
- in_memory: If True, creates an in-memory session. Defaults to False.
726
- object_name: Name of the output object in the chain. Defaults to "dataset".
727
- include_listing: If True, includes listing datasets. Defaults to False.
728
- studio: If True, returns datasets from Studio only,
729
- otherwise returns all local datasets. Defaults to False.
730
-
731
- Returns:
732
- DataChain: A new DataChain instance containing dataset information.
733
-
734
- Example:
735
- ```py
736
- from datachain import DataChain
737
-
738
- chain = DataChain.datasets()
739
- for ds in chain.collect("dataset"):
740
- print(f"{ds.name}@v{ds.version}")
741
- ```
742
- """
743
- session = Session.get(session, in_memory=in_memory)
744
- catalog = session.catalog
745
-
746
- datasets = [
747
- DatasetInfo.from_models(d, v, j)
748
- for d, v, j in catalog.list_datasets_versions(
749
- include_listing=include_listing, studio=studio
750
- )
751
- ]
420
+ from .datasets import datasets
752
421
 
753
- return cls.from_values(
754
- session=session,
755
- settings=settings,
756
- in_memory=in_memory,
757
- output={object_name: DatasetInfo},
758
- **{object_name: datasets}, # type: ignore[arg-type]
422
+ warnings.warn(
423
+ "Class method `datasets` is deprecated. "
424
+ "Use `datasets` function instead from top_module.",
425
+ DeprecationWarning,
426
+ stacklevel=2,
759
427
  )
428
+ return datasets(*args, **kwargs)
760
429
 
761
430
  @classmethod
762
431
  def listings(
763
432
  cls,
764
- session: Optional[Session] = None,
765
- in_memory: bool = False,
766
- object_name: str = "listing",
433
+ *args,
767
434
  **kwargs,
768
435
  ) -> "DataChain":
769
- """Generate chain with list of cached listings.
770
- Listing is a special kind of dataset which has directory listing data of
771
- some underlying storage (e.g S3 bucket).
436
+ from .listings import listings
772
437
 
773
- Example:
774
- ```py
775
- from datachain import DataChain
776
- DataChain.listings().show()
777
- ```
778
- """
779
- session = Session.get(session, in_memory=in_memory)
780
- catalog = kwargs.get("catalog") or session.catalog
781
-
782
- return cls.from_values(
783
- session=session,
784
- in_memory=in_memory,
785
- output={object_name: ListingInfo},
786
- **{object_name: catalog.listings()}, # type: ignore[arg-type]
438
+ warnings.warn(
439
+ "Class method `listings` is deprecated. "
440
+ "Use `listings` function instead from top_module.",
441
+ DeprecationWarning,
442
+ stacklevel=2,
787
443
  )
444
+ return listings(*args, **kwargs)
788
445
 
789
446
  def save( # type: ignore[override]
790
447
  self,
@@ -822,6 +479,7 @@ class DataChain:
822
479
 
823
480
  Example:
824
481
  ```py
482
+ import datachain as dc
825
483
  def parse_stem(chain):
826
484
  return chain.map(
827
485
  lambda file: file.get_file_stem()
@@ -829,7 +487,7 @@ class DataChain:
829
487
  )
830
488
 
831
489
  chain = (
832
- DataChain.from_storage("s3://my-bucket")
490
+ dc.from_storage("s3://my-bucket")
833
491
  .apply(parse_stem)
834
492
  .filter(C("stem").glob("*cat*"))
835
493
  )
@@ -1358,7 +1016,7 @@ class DataChain:
1358
1016
  @overload
1359
1017
  def results(self, *, include_hidden: bool) -> list[tuple[Any, ...]]: ...
1360
1018
 
1361
- def results(self, *, row_factory=None, include_hidden=True): # noqa: D102
1019
+ def results(self, *, row_factory=None, include_hidden=True):
1362
1020
  if row_factory is None:
1363
1021
  return list(self.collect_flatten(include_hidden=include_hidden))
1364
1022
  return list(
@@ -1468,7 +1126,7 @@ class DataChain:
1468
1126
  remove_prefetched=remove_prefetched,
1469
1127
  )
1470
1128
 
1471
- def remove_file_signals(self) -> "Self": # noqa: D102
1129
+ def remove_file_signals(self) -> "Self":
1472
1130
  schema = self.signals_schema.clone_without_file_signals()
1473
1131
  return self.select(*schema.values.keys())
1474
1132
 
@@ -1805,73 +1463,34 @@ class DataChain:
1805
1463
  @classmethod
1806
1464
  def from_values(
1807
1465
  cls,
1808
- ds_name: str = "",
1809
- session: Optional[Session] = None,
1810
- settings: Optional[dict] = None,
1811
- in_memory: bool = False,
1812
- output: OutputType = None,
1813
- object_name: str = "",
1814
- **fr_map,
1815
- ) -> "Self":
1816
- """Generate chain from list of values.
1817
-
1818
- Example:
1819
- ```py
1820
- DataChain.from_values(fib=[1, 2, 3, 5, 8])
1821
- ```
1822
- """
1823
- tuple_type, output, tuples = values_to_tuples(ds_name, output, **fr_map)
1824
-
1825
- def _func_fr() -> Iterator[tuple_type]: # type: ignore[valid-type]
1826
- yield from tuples
1466
+ *args,
1467
+ **kwargs,
1468
+ ) -> "DataChain":
1469
+ from .values import from_values
1827
1470
 
1828
- chain = cls.from_records(
1829
- DataChain.DEFAULT_FILE_RECORD,
1830
- session=session,
1831
- settings=settings,
1832
- in_memory=in_memory,
1471
+ warnings.warn(
1472
+ "Class method `from_values` is deprecated. "
1473
+ "Use `from_values` function instead from top_module.",
1474
+ DeprecationWarning,
1475
+ stacklevel=2,
1833
1476
  )
1834
- if object_name:
1835
- output = {object_name: dict_to_data_model(object_name, output)} # type: ignore[arg-type]
1836
- return chain.gen(_func_fr, output=output)
1477
+ return from_values(*args, **kwargs)
1837
1478
 
1838
1479
  @classmethod
1839
- def from_pandas( # type: ignore[override]
1480
+ def from_pandas(
1840
1481
  cls,
1841
- df: "pd.DataFrame",
1842
- name: str = "",
1843
- session: Optional[Session] = None,
1844
- settings: Optional[dict] = None,
1845
- in_memory: bool = False,
1846
- object_name: str = "",
1482
+ *args,
1483
+ **kwargs,
1847
1484
  ) -> "DataChain":
1848
- """Generate chain from pandas data-frame.
1849
-
1850
- Example:
1851
- ```py
1852
- import pandas as pd
1853
-
1854
- df = pd.DataFrame({"fib": [1, 2, 3, 5, 8]})
1855
- DataChain.from_pandas(df)
1856
- ```
1857
- """
1858
- fr_map = {col.lower(): df[col].tolist() for col in df.columns}
1859
-
1860
- for column in fr_map:
1861
- if not column.isidentifier():
1862
- raise DatasetPrepareError(
1863
- name,
1864
- f"import from pandas error - '{column}' cannot be a column name",
1865
- )
1485
+ from .pandas import from_pandas
1866
1486
 
1867
- return cls.from_values(
1868
- name,
1869
- session,
1870
- settings=settings,
1871
- object_name=object_name,
1872
- in_memory=in_memory,
1873
- **fr_map,
1487
+ warnings.warn(
1488
+ "Class method `from_pandas` is deprecated. "
1489
+ "Use `from_pandas` function instead from top_module.",
1490
+ DeprecationWarning,
1491
+ stacklevel=2,
1874
1492
  )
1493
+ return from_pandas(*args, **kwargs)
1875
1494
 
1876
1495
  def to_pandas(self, flatten=False, include_hidden=True) -> "pd.DataFrame":
1877
1496
  """Return a pandas DataFrame from the chain.
@@ -1953,56 +1572,18 @@ class DataChain:
1953
1572
  @classmethod
1954
1573
  def from_hf(
1955
1574
  cls,
1956
- dataset: Union[str, "HFDatasetType"],
1957
1575
  *args,
1958
- session: Optional[Session] = None,
1959
- settings: Optional[dict] = None,
1960
- object_name: str = "",
1961
- model_name: str = "",
1962
1576
  **kwargs,
1963
1577
  ) -> "DataChain":
1964
- """Generate chain from huggingface hub dataset.
1965
-
1966
- Parameters:
1967
- dataset : Path or name of the dataset to read from Hugging Face Hub,
1968
- or an instance of `datasets.Dataset`-like object.
1969
- session : Session to use for the chain.
1970
- settings : Settings to use for the chain.
1971
- object_name : Generated object column name.
1972
- model_name : Generated model name.
1973
- kwargs : Parameters to pass to datasets.load_dataset.
1974
-
1975
- Example:
1976
- Load from Hugging Face Hub:
1977
- ```py
1978
- DataChain.from_hf("beans", split="train")
1979
- ```
1980
-
1981
- Generate chain from loaded dataset:
1982
- ```py
1983
- from datasets import load_dataset
1984
- ds = load_dataset("beans", split="train")
1985
- DataChain.from_hf(ds)
1986
- ```
1987
- """
1988
- from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
1989
-
1990
- output: dict[str, DataType] = {}
1991
- ds_dict = stream_splits(dataset, *args, **kwargs)
1992
- if len(ds_dict) > 1:
1993
- output = {"split": str}
1578
+ from .hf import from_hf
1994
1579
 
1995
- model_name = model_name or object_name or ""
1996
- hf_features = next(iter(ds_dict.values())).features
1997
- output = output | get_output_schema(hf_features)
1998
- model = dict_to_data_model(model_name, output)
1999
- if object_name:
2000
- output = {object_name: model}
2001
-
2002
- chain = DataChain.from_values(
2003
- split=list(ds_dict.keys()), session=session, settings=settings
1580
+ warnings.warn(
1581
+ "Class method `from_hf` is deprecated. "
1582
+ "Use `from_hf` function instead from top_module.",
1583
+ DeprecationWarning,
1584
+ stacklevel=2,
2004
1585
  )
2005
- return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
1586
+ return from_hf(*args, **kwargs)
2006
1587
 
2007
1588
  def parse_tabular(
2008
1589
  self,
@@ -2028,15 +1609,18 @@ class DataChain:
2028
1609
  Example:
2029
1610
  Reading a json lines file:
2030
1611
  ```py
2031
- dc = DataChain.from_storage("s3://mybucket/file.jsonl")
2032
- dc = dc.parse_tabular(format="json")
1612
+ import datachain as dc
1613
+ chain = dc.from_storage("s3://mybucket/file.jsonl")
1614
+ chain = chain.parse_tabular(format="json")
2033
1615
  ```
2034
1616
 
2035
1617
  Reading a filtered list of files as a dataset:
2036
1618
  ```py
2037
- dc = DataChain.from_storage("s3://mybucket")
2038
- dc = dc.filter(C("file.name").glob("*.jsonl"))
2039
- dc = dc.parse_tabular(format="json")
1619
+ import datachain as dc
1620
+
1621
+ chain = dc.from_storage("s3://mybucket")
1622
+ chain = chain.filter(dc.C("file.name").glob("*.jsonl"))
1623
+ chain = chain.parse_tabular(format="json")
2040
1624
  ```
2041
1625
  """
2042
1626
  from pyarrow.dataset import CsvFileFormat, JsonFileFormat
@@ -2093,161 +1677,34 @@ class DataChain:
2093
1677
  @classmethod
2094
1678
  def from_csv(
2095
1679
  cls,
2096
- path,
2097
- delimiter: Optional[str] = None,
2098
- header: bool = True,
2099
- output: OutputType = None,
2100
- object_name: str = "",
2101
- model_name: str = "",
2102
- source: bool = True,
2103
- nrows=None,
2104
- session: Optional[Session] = None,
2105
- settings: Optional[dict] = None,
2106
- column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
2107
- parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
1680
+ *args,
2108
1681
  **kwargs,
2109
1682
  ) -> "DataChain":
2110
- """Generate chain from csv files.
2111
-
2112
- Parameters:
2113
- path : Storage URI with directory. URI must start with storage prefix such
2114
- as `s3://`, `gs://`, `az://` or "file:///".
2115
- delimiter : Character for delimiting columns. Takes precedence if also
2116
- specified in `parse_options`. Defaults to ",".
2117
- header : Whether the files include a header row.
2118
- output : Dictionary or feature class defining column names and their
2119
- corresponding types. List of column names is also accepted, in which
2120
- case types will be inferred.
2121
- object_name : Created object column name.
2122
- model_name : Generated model name.
2123
- source : Whether to include info about the source file.
2124
- nrows : Optional row limit.
2125
- session : Session to use for the chain.
2126
- settings : Settings to use for the chain.
2127
- column_types : Dictionary of column names and their corresponding types.
2128
- It is passed to CSV reader and for each column specified type auto
2129
- inference is disabled.
2130
- parse_options: Tells the parser how to process lines.
2131
- See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
2132
-
2133
- Example:
2134
- Reading a csv file:
2135
- ```py
2136
- dc = DataChain.from_csv("s3://mybucket/file.csv")
2137
- ```
2138
-
2139
- Reading csv files from a directory as a combined dataset:
2140
- ```py
2141
- dc = DataChain.from_csv("s3://mybucket/dir")
2142
- ```
2143
- """
2144
- from pandas.io.parsers.readers import STR_NA_VALUES
2145
- from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
2146
- from pyarrow.dataset import CsvFileFormat
2147
- from pyarrow.lib import type_for_alias
2148
-
2149
- parse_options = parse_options or {}
2150
- if "delimiter" not in parse_options:
2151
- parse_options["delimiter"] = ","
2152
- if delimiter:
2153
- parse_options["delimiter"] = delimiter
2154
-
2155
- if column_types:
2156
- column_types = {
2157
- name: type_for_alias(typ) if isinstance(typ, str) else typ
2158
- for name, typ in column_types.items()
2159
- }
2160
- else:
2161
- column_types = {}
1683
+ from .csv import from_csv
2162
1684
 
2163
- chain = DataChain.from_storage(
2164
- path, session=session, settings=settings, **kwargs
2165
- )
2166
-
2167
- column_names = None
2168
- if not header:
2169
- if not output:
2170
- msg = "error parsing csv - provide output if no header"
2171
- raise DatasetPrepareError(chain.name, msg)
2172
- if isinstance(output, Sequence):
2173
- column_names = output # type: ignore[assignment]
2174
- elif isinstance(output, dict):
2175
- column_names = list(output.keys())
2176
- elif (fr := ModelStore.to_pydantic(output)) is not None:
2177
- column_names = list(fr.model_fields.keys())
2178
- else:
2179
- msg = f"error parsing csv - incompatible output type {type(output)}"
2180
- raise DatasetPrepareError(chain.name, msg)
2181
-
2182
- parse_options = ParseOptions(**parse_options)
2183
- read_options = ReadOptions(column_names=column_names)
2184
- convert_options = ConvertOptions(
2185
- strings_can_be_null=True,
2186
- null_values=STR_NA_VALUES,
2187
- column_types=column_types,
2188
- )
2189
- format = CsvFileFormat(
2190
- parse_options=parse_options,
2191
- read_options=read_options,
2192
- convert_options=convert_options,
2193
- )
2194
- return chain.parse_tabular(
2195
- output=output,
2196
- object_name=object_name,
2197
- model_name=model_name,
2198
- source=source,
2199
- nrows=nrows,
2200
- format=format,
1685
+ warnings.warn(
1686
+ "Class method `from_csv` is deprecated. "
1687
+ "Use `from_csv` function instead from top_module.",
1688
+ DeprecationWarning,
1689
+ stacklevel=2,
2201
1690
  )
1691
+ return from_csv(*args, **kwargs)
2202
1692
 
2203
1693
  @classmethod
2204
1694
  def from_parquet(
2205
1695
  cls,
2206
- path,
2207
- partitioning: Any = "hive",
2208
- output: Optional[dict[str, DataType]] = None,
2209
- object_name: str = "",
2210
- model_name: str = "",
2211
- source: bool = True,
2212
- session: Optional[Session] = None,
2213
- settings: Optional[dict] = None,
1696
+ *args,
2214
1697
  **kwargs,
2215
1698
  ) -> "DataChain":
2216
- """Generate chain from parquet files.
2217
-
2218
- Parameters:
2219
- path : Storage URI with directory. URI must start with storage prefix such
2220
- as `s3://`, `gs://`, `az://` or "file:///".
2221
- partitioning : Any pyarrow partitioning schema.
2222
- output : Dictionary defining column names and their corresponding types.
2223
- object_name : Created object column name.
2224
- model_name : Generated model name.
2225
- source : Whether to include info about the source file.
2226
- session : Session to use for the chain.
2227
- settings : Settings to use for the chain.
2228
-
2229
- Example:
2230
- Reading a single file:
2231
- ```py
2232
- dc = DataChain.from_parquet("s3://mybucket/file.parquet")
2233
- ```
1699
+ from .parquet import from_parquet
2234
1700
 
2235
- Reading a partitioned dataset from a directory:
2236
- ```py
2237
- dc = DataChain.from_parquet("s3://mybucket/dir")
2238
- ```
2239
- """
2240
- chain = DataChain.from_storage(
2241
- path, session=session, settings=settings, **kwargs
2242
- )
2243
- return chain.parse_tabular(
2244
- output=output,
2245
- object_name=object_name,
2246
- model_name=model_name,
2247
- source=source,
2248
- format="parquet",
2249
- partitioning=partitioning,
1701
+ warnings.warn(
1702
+ "Class method `from_parquet` is deprecated. "
1703
+ "Use `from_parquet` function instead from top_module.",
1704
+ DeprecationWarning,
1705
+ stacklevel=2,
2250
1706
  )
1707
+ return from_parquet(*args, **kwargs)
2251
1708
 
2252
1709
  def to_parquet(
2253
1710
  self,
@@ -2470,69 +1927,18 @@ class DataChain:
2470
1927
  @classmethod
2471
1928
  def from_records(
2472
1929
  cls,
2473
- to_insert: Optional[Union[dict, list[dict]]],
2474
- session: Optional[Session] = None,
2475
- settings: Optional[dict] = None,
2476
- in_memory: bool = False,
2477
- schema: Optional[dict[str, DataType]] = None,
2478
- ) -> "Self":
2479
- """Create a DataChain from the provided records. This method can be used for
2480
- programmatically generating a chain in contrast of reading data from storages
2481
- or other sources.
2482
-
2483
- Parameters:
2484
- to_insert : records (or a single record) to insert. Each record is
2485
- a dictionary of signals and theirs values.
2486
- schema : describes chain signals and their corresponding types
2487
-
2488
- Example:
2489
- ```py
2490
- single_record = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
2491
- ```
2492
- """
2493
- session = Session.get(session, in_memory=in_memory)
2494
- catalog = session.catalog
2495
-
2496
- name = session.generate_temp_dataset_name()
2497
- signal_schema = None
2498
- columns: list[sqlalchemy.Column] = []
2499
-
2500
- if schema:
2501
- signal_schema = SignalSchema(schema)
2502
- columns = [
2503
- sqlalchemy.Column(c.name, c.type) # type: ignore[union-attr]
2504
- for c in signal_schema.db_signals(as_columns=True) # type: ignore[assignment]
2505
- ]
2506
- else:
2507
- columns = [
2508
- sqlalchemy.Column(name, typ)
2509
- for name, typ in File._datachain_column_types.items()
2510
- ]
1930
+ *args,
1931
+ **kwargs,
1932
+ ) -> "DataChain":
1933
+ from .records import from_records
2511
1934
 
2512
- dsr = catalog.create_dataset(
2513
- name,
2514
- columns=columns,
2515
- feature_schema=(
2516
- signal_schema.clone_without_sys_signals().serialize()
2517
- if signal_schema
2518
- else None
2519
- ),
1935
+ warnings.warn(
1936
+ "Class method `from_records` is deprecated. "
1937
+ "Use `from_records` function instead from top_module.",
1938
+ DeprecationWarning,
1939
+ stacklevel=2,
2520
1940
  )
2521
-
2522
- session.add_dataset_version(dsr, dsr.latest_version)
2523
-
2524
- if isinstance(to_insert, dict):
2525
- to_insert = [to_insert]
2526
- elif not to_insert:
2527
- to_insert = []
2528
-
2529
- warehouse = catalog.warehouse
2530
- dr = warehouse.dataset_rows(dsr)
2531
- db = warehouse.db
2532
- insert_q = dr.get_table().insert()
2533
- for record in to_insert:
2534
- db.execute(insert_q.values(**record))
2535
- return cls.from_dataset(name=dsr.name, session=session, settings=settings)
1941
+ return from_records(*args, **kwargs)
2536
1942
 
2537
1943
  def sum(self, fr: DataType): # type: ignore[override]
2538
1944
  """Compute the sum of a column."""
@@ -2560,9 +1966,10 @@ class DataChain:
2560
1966
  ```py
2561
1967
  import anthropic
2562
1968
  from anthropic.types import Message
1969
+ import datachain as dc
2563
1970
 
2564
1971
  (
2565
- DataChain.from_storage(DATA, type="text")
1972
+ dc.from_storage(DATA, type="text")
2566
1973
  .settings(parallel=4, cache=True)
2567
1974
  .setup(client=lambda: anthropic.Anthropic(api_key=API_KEY))
2568
1975
  .map(
@@ -2612,7 +2019,9 @@ class DataChain:
2612
2019
  Example:
2613
2020
  Cross cloud transfer
2614
2021
  ```py
2615
- ds = DataChain.from_storage("s3://mybucket")
2022
+ import datachain as dc
2023
+
2024
+ ds = dc.from_storage("s3://mybucket")
2616
2025
  ds.to_storage("gs://mybucket", placement="filename")
2617
2026
  ```
2618
2027
  """
@@ -2728,7 +2137,9 @@ class DataChain:
2728
2137
 
2729
2138
  Example:
2730
2139
  ```py
2731
- chain = DataChain.from_storage(...)
2140
+ import datachain as dc
2141
+
2142
+ chain = dc.from_storage(...)
2732
2143
  chunk_1 = query._chunk(0, 2)
2733
2144
  chunk_2 = query._chunk(1, 2)
2734
2145
  ```
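Each of the deprecation shims above calls `warnings.warn(..., DeprecationWarning, stacklevel=2)`, and Python hides `DeprecationWarning` by default outside of `__main__`, so old-style calls can go unnoticed. A hedged sketch of surfacing them while migrating, using only the standard library; the scoping below is an assumption, not something datachain itself provides:

```py
import warnings

import datachain as dc

with warnings.catch_warnings():
    # Turn the shims' warnings into errors so old-style calls fail fast during migration.
    warnings.simplefilter("error", DeprecationWarning)
    chain = dc.from_values(fib=[1, 2, 3, 5, 8])      # new-style call: no warning
    # dc.DataChain.from_values(fib=[1, 2, 3, 5, 8])  # old-style call would raise here
```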