datachain 0.13.1__py3-none-any.whl → 0.14.1__py3-none-any.whl

This diff compares publicly available package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.

This version of datachain has been flagged as potentially problematic.
datachain/__init__.py CHANGED
@@ -1,5 +1,21 @@
 from datachain.lib.data_model import DataModel, DataType, is_chain_type
-from datachain.lib.dc import C, Column, DataChain, Sys
+from datachain.lib.dc import (
+    C,
+    Column,
+    DataChain,
+    Sys,
+    datasets,
+    from_csv,
+    from_dataset,
+    from_hf,
+    from_json,
+    from_pandas,
+    from_parquet,
+    from_records,
+    from_storage,
+    from_values,
+    listings,
+)
 from datachain.lib.file import (
     ArrowRow,
     File,
@@ -44,7 +60,18 @@ __all__ = [
     "VideoFile",
     "VideoFragment",
     "VideoFrame",
+    "datasets",
+    "from_csv",
+    "from_dataset",
+    "from_hf",
+    "from_json",
+    "from_pandas",
+    "from_parquet",
+    "from_records",
+    "from_storage",
+    "from_values",
     "is_chain_type",
+    "listings",
     "metrics",
     "param",
 ]
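
With these exports, the `from_*` constructors plus `datasets` and `listings` become importable directly from the top-level `datachain` package. A minimal sketch of the two call styles (assuming the `DataChain.from_storage` classmethod remains available, which this diff neither adds nor removes):

```py
import datachain as dc

# 0.13.x style, via the classmethod:
chain = dc.DataChain.from_storage("s3://mybucket/dir")

# 0.14.x style, via the new top-level constructor:
chain = dc.from_storage("s3://mybucket/dir")
```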
@@ -583,12 +583,12 @@ class Catalog:
         object_name="file",
         skip_indexing=False,
     ) -> tuple[Optional["Listing"], "Client", str]:
-        from datachain.lib.dc import DataChain
+        from datachain import from_storage
         from datachain.listing import Listing
 
-        DataChain.from_storage(
+        from_storage(
             source, session=self.session, update=update, object_name=object_name
-        )
+        ).exec()
 
         list_ds_name, list_uri, list_path, _ = get_listing(
             source, self.session, update=update
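
Note the `.exec()` appended to the replacement call. `DataChain` chains are lazy, so the constructor alone only defines the query; `.exec()` is what runs it here, for its side effect of indexing the source. A hedged sketch of the pattern:

```py
import datachain as dc

chain = dc.from_storage("s3://mybucket/dir")  # defines the chain; nothing has run yet
chain.exec()  # executes the chain for its side effect (indexing), discarding results
```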
@@ -994,18 +994,14 @@ class Catalog:
         if not sources:
             raise ValueError("Sources needs to be non empty list")
 
-        from datachain.lib.dc import DataChain
+        from datachain import from_dataset, from_storage
 
         chains = []
         for source in sources:
             if source.startswith(DATASET_PREFIX):
-                dc = DataChain.from_dataset(
-                    source[len(DATASET_PREFIX) :], session=self.session
-                )
+                dc = from_dataset(source[len(DATASET_PREFIX) :], session=self.session)
             else:
-                dc = DataChain.from_storage(
-                    source, session=self.session, recursive=recursive
-                )
+                dc = from_storage(source, session=self.session, recursive=recursive)
 
             chains.append(dc)

@@ -36,7 +36,7 @@ def ls_local(
     client_config=None,
     **kwargs,
 ):
-    from datachain import DataChain
+    from datachain import listings
 
     if sources:
         if catalog is None:
@@ -63,7 +63,7 @@ def ls_local(
             print(format_ls_entry(entry))
     else:
         # Collect results in a list here to prevent interference from `tqdm` and `print`
-        listing = list(DataChain.listings().collect("listing"))
+        listing = list(listings().collect("listing"))
        for ls in listing:
            print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]

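`listings()` is the module-level counterpart of the old `DataChain.listings()` classmethod; per the call above, it yields rows whose `listing` column carries the indexed-storage entries. A small usage sketch following the same pattern:

```py
from datachain import listings

# Each collected item describes one indexed storage location.
for ls in listings().collect("listing"):
    print(f"{ls.uri}@v{ls.version}")
```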
@@ -18,8 +18,7 @@ def show(
     schema: bool = False,
     include_hidden: bool = False,
 ) -> None:
-    from datachain import Session
-    from datachain.lib.dc import DataChain
+    from datachain import Session, from_dataset
     from datachain.query.dataset import DatasetQuery
     from datachain.utils import show_records
 
@@ -52,5 +51,5 @@ def show(
     if schema and dataset_version.feature_schema:
         print("\nSchema:")
         session = Session.get(catalog=catalog)
-        dc = DataChain.from_dataset(name=name, version=version, session=session)
+        dc = from_dataset(name=name, version=version, session=session)
         dc.print_schema()
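
`from_dataset` keeps the `name`/`version`/`session` keywords of the classmethod it replaces, so call sites translate one-for-one. A minimal sketch (the dataset name is hypothetical):

```py
import datachain as dc

chain = dc.from_dataset(name="my-dataset", version=1)
chain.print_schema()
```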
@@ -89,9 +89,9 @@ class Client(ABC):
         from .local import FileClient
         from .s3 import ClientS3
 
-        protocol = urlparse(str(url)).scheme
+        protocol = urlparse(os.fspath(url)).scheme
 
-        if not protocol or _is_win_local_path(str(url)):
+        if not protocol or _is_win_local_path(os.fspath(url)):
             return FileClient
         if protocol == ClientS3.protocol:
             return ClientS3
@@ -122,7 +122,7 @@ class Client(ABC):
         source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
     ) -> "Client":
         cls = Client.get_implementation(source)
-        storage_url, _ = cls.split_url(str(source))
+        storage_url, _ = cls.split_url(os.fspath(source))
         if os.name == "nt":
             storage_url = storage_url.removeprefix("/")

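The motivation for swapping `str(url)` for `os.fspath(url)`: `os.fspath` resolves `os.PathLike` objects via `__fspath__` and raises `TypeError` for anything that is not a path, while `str()` silently stringifies arbitrary objects. A standalone illustration (not from the diff):

```py
import os
from pathlib import PurePosixPath

print(os.fspath(PurePosixPath("data/file.csv")))  # -> "data/file.csv"
print(os.fspath("s3://bucket/key"))               # plain strings pass through unchanged

class NotAPath:  # hypothetical non-path object
    def __str__(self):
        return "looks/like/a/path"

print(str(NotAPath()))     # -> "looks/like/a/path" (silently wrong as a path)
# os.fspath(NotAPath())    # raises TypeError: expected str, bytes or os.PathLike object
```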
@@ -0,0 +1,32 @@
+from .csv import from_csv
+from .datachain import C, Column, DataChain
+from .datasets import datasets, from_dataset
+from .hf import from_hf
+from .json import from_json
+from .listings import listings
+from .pandas import from_pandas
+from .parquet import from_parquet
+from .records import from_records
+from .storage import from_storage
+from .utils import DatasetMergeError, DatasetPrepareError, Sys
+from .values import from_values
+
+__all__ = [
+    "C",
+    "Column",
+    "DataChain",
+    "DatasetMergeError",
+    "DatasetPrepareError",
+    "Sys",
+    "datasets",
+    "from_csv",
+    "from_dataset",
+    "from_hf",
+    "from_json",
+    "from_pandas",
+    "from_parquet",
+    "from_records",
+    "from_storage",
+    "from_values",
+    "listings",
+]
@@ -0,0 +1,127 @@
+from collections.abc import Sequence
+from typing import (
+    TYPE_CHECKING,
+    Callable,
+    Optional,
+    Union,
+)
+
+from datachain.lib.dc.utils import DatasetPrepareError, OutputType
+from datachain.lib.model_store import ModelStore
+from datachain.query import Session
+
+if TYPE_CHECKING:
+    from pyarrow import DataType as ArrowDataType
+
+    from .datachain import DataChain
+
+
+def from_csv(
+    path,
+    delimiter: Optional[str] = None,
+    header: bool = True,
+    output: OutputType = None,
+    object_name: str = "",
+    model_name: str = "",
+    source: bool = True,
+    nrows=None,
+    session: Optional[Session] = None,
+    settings: Optional[dict] = None,
+    column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
+    parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
+    **kwargs,
+) -> "DataChain":
+    """Generate chain from csv files.
+
+    Parameters:
+        path : Storage URI with directory. URI must start with storage prefix such
+            as `s3://`, `gs://`, `az://` or "file:///".
+        delimiter : Character for delimiting columns. Takes precedence if also
+            specified in `parse_options`. Defaults to ",".
+        header : Whether the files include a header row.
+        output : Dictionary or feature class defining column names and their
+            corresponding types. List of column names is also accepted, in which
+            case types will be inferred.
+        object_name : Created object column name.
+        model_name : Generated model name.
+        source : Whether to include info about the source file.
+        nrows : Optional row limit.
+        session : Session to use for the chain.
+        settings : Settings to use for the chain.
+        column_types : Dictionary of column names and their corresponding types.
+            It is passed to CSV reader and for each column specified type auto
+            inference is disabled.
+        parse_options: Tells the parser how to process lines.
+            See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+
+    Example:
+        Reading a csv file:
+        ```py
+        import datachain as dc
+        chain = dc.from_csv("s3://mybucket/file.csv")
+        ```
+
+        Reading csv files from a directory as a combined dataset:
+        ```py
+        import datachain as dc
+        chain = dc.from_csv("s3://mybucket/dir")
+        ```
+    """
+    from pandas.io.parsers.readers import STR_NA_VALUES
+    from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
+    from pyarrow.dataset import CsvFileFormat
+    from pyarrow.lib import type_for_alias
+
+    from .storage import from_storage
+
+    parse_options = parse_options or {}
+    if "delimiter" not in parse_options:
+        parse_options["delimiter"] = ","
+    if delimiter:
+        parse_options["delimiter"] = delimiter
+
+    if column_types:
+        column_types = {
+            name: type_for_alias(typ) if isinstance(typ, str) else typ
+            for name, typ in column_types.items()
+        }
+    else:
+        column_types = {}
+
+    chain = from_storage(path, session=session, settings=settings, **kwargs)
+
+    column_names = None
+    if not header:
+        if not output:
+            msg = "error parsing csv - provide output if no header"
+            raise DatasetPrepareError(chain.name, msg)
+        if isinstance(output, Sequence):
+            column_names = output  # type: ignore[assignment]
+        elif isinstance(output, dict):
+            column_names = list(output.keys())
+        elif (fr := ModelStore.to_pydantic(output)) is not None:
+            column_names = list(fr.model_fields.keys())
+        else:
+            msg = f"error parsing csv - incompatible output type {type(output)}"
+            raise DatasetPrepareError(chain.name, msg)
+
+    parse_options = ParseOptions(**parse_options)
+    read_options = ReadOptions(column_names=column_names)
+    convert_options = ConvertOptions(
+        strings_can_be_null=True,
+        null_values=STR_NA_VALUES,
+        column_types=column_types,
+    )
+    format = CsvFileFormat(
+        parse_options=parse_options,
+        read_options=read_options,
+        convert_options=convert_options,
+    )
+    return chain.parse_tabular(
+        output=output,
+        object_name=object_name,
+        model_name=model_name,
+        source=source,
+        nrows=nrows,
+        format=format,
+    )
+ )