datachain 0.13.0__py3-none-any.whl → 0.14.0__py3-none-any.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.

Potentially problematic release: this version of datachain might be problematic.

datachain/__init__.py CHANGED
@@ -1,5 +1,21 @@
  from datachain.lib.data_model import DataModel, DataType, is_chain_type
- from datachain.lib.dc import C, Column, DataChain, Sys
+ from datachain.lib.dc import (
+     C,
+     Column,
+     DataChain,
+     Sys,
+     datasets,
+     from_csv,
+     from_dataset,
+     from_hf,
+     from_json,
+     from_pandas,
+     from_parquet,
+     from_records,
+     from_storage,
+     from_values,
+     listings,
+ )
  from datachain.lib.file import (
      ArrowRow,
      File,
@@ -44,7 +60,18 @@ __all__ = [
      "VideoFile",
      "VideoFragment",
      "VideoFrame",
+     "datasets",
+     "from_csv",
+     "from_dataset",
+     "from_hf",
+     "from_json",
+     "from_pandas",
+     "from_parquet",
+     "from_records",
+     "from_storage",
+     "from_values",
      "is_chain_type",
+     "listings",
      "metrics",
      "param",
  ]
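The practical effect of these re-exports is that the chain factories become importable straight from the package root. A minimal sketch (the bucket URI and column name are hypothetical):

```py
import datachain as dc

# hypothetical URI; any s3://, gs://, az:// or file:/// path works
files = dc.from_storage("s3://mybucket/dir")

# in-memory chain built from plain Python values
values = dc.from_values(key=["a", "b", "c"])
```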
@@ -583,10 +583,10 @@ class Catalog:
          object_name="file",
          skip_indexing=False,
      ) -> tuple[Optional["Listing"], "Client", str]:
-         from datachain.lib.dc import DataChain
+         from datachain import from_storage
          from datachain.listing import Listing

-         DataChain.from_storage(
+         from_storage(
              source, session=self.session, update=update, object_name=object_name
          )
@@ -795,6 +795,19 @@ class Catalog:
          try:
              dataset = self.get_dataset(name)
              default_version = dataset.next_version
+
+             if (description or labels) and (
+                 dataset.description != description or dataset.labels != labels
+             ):
+                 description = description or dataset.description
+                 labels = labels or dataset.labels
+
+                 self.update_dataset(
+                     dataset,
+                     description=description,
+                     labels=labels,
+                 )
+
          except DatasetNotFoundError:
              schema = {
                  c.name: c.type.to_dict() for c in columns if isinstance(c.type, SQLType)
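The `or` fallbacks mean an update only fills in whichever of `description`/`labels` the caller omitted; passing neither leaves the dataset untouched. A plain-Python sketch of those semantics, with hypothetical values:

```py
# hypothetical existing metadata on the dataset
dataset_description, dataset_labels = "old text", ["a"]

# caller supplies new labels only
description, labels = None, ["b"]

if (description or labels) and (
    dataset_description != description or dataset_labels != labels
):
    description = description or dataset_description  # keep the old text
    labels = labels or dataset_labels

assert (description, labels) == ("old text", ["b"])
```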
@@ -981,18 +994,14 @@ class Catalog:
          if not sources:
              raise ValueError("Sources needs to be non empty list")

-         from datachain.lib.dc import DataChain
+         from datachain import from_dataset, from_storage

          chains = []
          for source in sources:
              if source.startswith(DATASET_PREFIX):
-                 dc = DataChain.from_dataset(
-                     source[len(DATASET_PREFIX) :], session=self.session
-                 )
+                 dc = from_dataset(source[len(DATASET_PREFIX) :], session=self.session)
              else:
-                 dc = from_storage(source, session=self.session, recursive=recursive)
+                 dc = from_storage(source, session=self.session, recursive=recursive)

              chains.append(dc)

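A dataset-prefixed source resolves to a registered dataset, and anything else is treated as a storage URI. A sketch of the two paths (assuming `DATASET_PREFIX` is the `ds://` scheme; the constant's value is not shown in this diff):

```py
import datachain as dc

# assumption: DATASET_PREFIX == "ds://"
chain = dc.from_dataset("my-dataset")                         # from "ds://my-dataset"
chain = dc.from_storage("s3://mybucket/dir", recursive=True)  # any other source
```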
@@ -36,7 +36,7 @@ def ls_local(
      client_config=None,
      **kwargs,
  ):
-     from datachain import DataChain
+     from datachain import listings

      if sources:
          if catalog is None:
@@ -63,7 +63,7 @@ def ls_local(
              print(format_ls_entry(entry))
      else:
          # Collect results in a list here to prevent interference from `tqdm` and `print`
-         listing = list(DataChain.listings().collect("listing"))
+         listing = list(listings().collect("listing"))
          for ls in listing:
              print(format_ls_entry(f"{ls.uri}@v{ls.version}"))  # type: ignore[union-attr]

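The CLI path above is just the module-level `listings()` chain, so the same listing can be reproduced directly:

```py
import datachain as dc

# enumerate indexed storage listings, mirroring `ls` with no sources
for ls in dc.listings().collect("listing"):
    print(f"{ls.uri}@v{ls.version}")
```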
@@ -18,8 +18,7 @@ def show(
      schema: bool = False,
      include_hidden: bool = False,
  ) -> None:
-     from datachain import Session
-     from datachain.lib.dc import DataChain
+     from datachain import Session, from_dataset
      from datachain.query.dataset import DatasetQuery
      from datachain.utils import show_records

@@ -52,5 +51,5 @@ def show(
      if schema and dataset_version.feature_schema:
          print("\nSchema:")
          session = Session.get(catalog=catalog)
-         dc = DataChain.from_dataset(name=name, version=version, session=session)
+         dc = from_dataset(name=name, version=version, session=session)
          dc.print_schema()
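Outside the CLI, the same schema dump is two calls (the dataset name and version here are hypothetical):

```py
import datachain as dc

chain = dc.from_dataset(name="my-dataset", version=1)
chain.print_schema()
```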
@@ -74,6 +74,7 @@ def _compare(  # noqa: C901
      # all left and right columns
      cols = left.signals_schema.clone_without_sys_signals().db_signals()
      right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+     cols_select = list(left.signals_schema.clone_without_sys_signals().values.keys())

      # getting correct on and right_on column names
      on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
@@ -131,10 +132,12 @@ def _compare(  # noqa: C901
          # when the row is deleted, we need to take column values from the right chain
          .mutate(
              **{
-                 f"{c}": ifelse(
-                     C(diff_col) == CompareStatus.DELETED, C(f"{rname}{c}"), C(c)
+                 f"{l_on}": ifelse(
+                     C(diff_col) == CompareStatus.DELETED,
+                     C(f"{rname + l_on if on == right_on else r_on}"),
+                     C(l_on),
                  )
-                 for c in [c for c in cols if c in right_cols]
+                 for l_on, r_on in zip(on, right_on)  # type: ignore[arg-type]
              }
          )
          .select_except(ldiff_col, rdiff_col)
@@ -150,9 +153,9 @@ def _compare(  # noqa: C901
      dc_diff = dc_diff.filter(C(diff_col) != CompareStatus.DELETED)

      if status_col:
-         cols.append(diff_col)  # type: ignore[arg-type]
+         cols_select.append(diff_col)

-     dc_diff = dc_diff.select(*cols)
+     dc_diff = dc_diff.select(*cols_select)

      # final schema is schema from the left chain with status column added if needed
      dc_diff.signals_schema = (
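Which right-hand column a DELETED row falls back to depends on whether both sides use the same key names: identical names mean the right copy was prefixed with `rname` during the merge, while distinct names mean `r_on` itself survives. A plain-Python sketch of that choice (the `rname` value is an assumption):

```py
# assumption: rname is the "right_" prefix applied to clashing column names
rname = "right_"

def deleted_row_source(on: list[str], right_on: list[str]) -> dict[str, str]:
    # maps each left key column to the column deleted rows are read from
    return {
        l_on: (rname + l_on if on == right_on else r_on)
        for l_on, r_on in zip(on, right_on)
    }

print(deleted_row_source(["id"], ["id"]))   # {'id': 'right_id'}
print(deleted_row_source(["id"], ["uid"]))  # {'id': 'uid'}
```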
datachain/lib/dc/__init__.py ADDED
@@ -0,0 +1,32 @@
+ from .csv import from_csv
+ from .datachain import C, Column, DataChain
+ from .datasets import datasets, from_dataset
+ from .hf import from_hf
+ from .json import from_json
+ from .listings import listings
+ from .pandas import from_pandas
+ from .parquet import from_parquet
+ from .records import from_records
+ from .storage import from_storage
+ from .utils import DatasetMergeError, DatasetPrepareError, Sys
+ from .values import from_values
+
+ __all__ = [
+     "C",
+     "Column",
+     "DataChain",
+     "DatasetMergeError",
+     "DatasetPrepareError",
+     "Sys",
+     "datasets",
+     "from_csv",
+     "from_dataset",
+     "from_hf",
+     "from_json",
+     "from_pandas",
+     "from_parquet",
+     "from_records",
+     "from_storage",
+     "from_values",
+     "listings",
+ ]
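Since `datachain/__init__.py` re-exports this subpackage's names, the two import styles are interchangeable:

```py
from datachain.lib.dc import DataChain, from_values  # subpackage import
import datachain as dc                               # root re-export

assert dc.DataChain is DataChain
assert dc.from_values is from_values
```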
datachain/lib/dc/csv.py ADDED
@@ -0,0 +1,127 @@
+ from collections.abc import Sequence
+ from typing import (
+     TYPE_CHECKING,
+     Callable,
+     Optional,
+     Union,
+ )
+
+ from datachain.lib.dc.utils import DatasetPrepareError, OutputType
+ from datachain.lib.model_store import ModelStore
+ from datachain.query import Session
+
+ if TYPE_CHECKING:
+     from pyarrow import DataType as ArrowDataType
+
+     from .datachain import DataChain
+
+
+ def from_csv(
+     path,
+     delimiter: Optional[str] = None,
+     header: bool = True,
+     output: OutputType = None,
+     object_name: str = "",
+     model_name: str = "",
+     source: bool = True,
+     nrows=None,
+     session: Optional[Session] = None,
+     settings: Optional[dict] = None,
+     column_types: Optional[dict[str, "Union[str, ArrowDataType]"]] = None,
+     parse_options: Optional[dict[str, "Union[str, Union[bool, Callable]]"]] = None,
+     **kwargs,
+ ) -> "DataChain":
+     """Generate chain from csv files.
+
+     Parameters:
+         path : Storage URI with directory. URI must start with storage prefix such
+             as `s3://`, `gs://`, `az://` or "file:///".
+         delimiter : Character for delimiting columns. Takes precedence if also
+             specified in `parse_options`. Defaults to ",".
+         header : Whether the files include a header row.
+         output : Dictionary or feature class defining column names and their
+             corresponding types. List of column names is also accepted, in which
+             case types will be inferred.
+         object_name : Created object column name.
+         model_name : Generated model name.
+         source : Whether to include info about the source file.
+         nrows : Optional row limit.
+         session : Session to use for the chain.
+         settings : Settings to use for the chain.
+         column_types : Dictionary of column names and their corresponding types.
+             It is passed to CSV reader and for each column specified type auto
+             inference is disabled.
+         parse_options: Tells the parser how to process lines.
+             See https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
+
+     Example:
+         Reading a csv file:
+         ```py
+         import datachain as dc
+         chain = dc.from_csv("s3://mybucket/file.csv")
+         ```
+
+         Reading csv files from a directory as a combined dataset:
+         ```py
+         import datachain as dc
+         chain = dc.from_csv("s3://mybucket/dir")
+         ```
+     """
+     from pandas.io.parsers.readers import STR_NA_VALUES
+     from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
+     from pyarrow.dataset import CsvFileFormat
+     from pyarrow.lib import type_for_alias
+
+     from .storage import from_storage
+
+     parse_options = parse_options or {}
+     if "delimiter" not in parse_options:
+         parse_options["delimiter"] = ","
+     if delimiter:
+         parse_options["delimiter"] = delimiter
+
+     if column_types:
+         column_types = {
+             name: type_for_alias(typ) if isinstance(typ, str) else typ
+             for name, typ in column_types.items()
+         }
+     else:
+         column_types = {}
+
+     chain = from_storage(path, session=session, settings=settings, **kwargs)
+
+     column_names = None
+     if not header:
+         if not output:
+             msg = "error parsing csv - provide output if no header"
+             raise DatasetPrepareError(chain.name, msg)
+         if isinstance(output, Sequence):
+             column_names = output  # type: ignore[assignment]
+         elif isinstance(output, dict):
+             column_names = list(output.keys())
+         elif (fr := ModelStore.to_pydantic(output)) is not None:
+             column_names = list(fr.model_fields.keys())
+         else:
+             msg = f"error parsing csv - incompatible output type {type(output)}"
+             raise DatasetPrepareError(chain.name, msg)
+
+     parse_options = ParseOptions(**parse_options)
+     read_options = ReadOptions(column_names=column_names)
+     convert_options = ConvertOptions(
+         strings_can_be_null=True,
+         null_values=STR_NA_VALUES,
+         column_types=column_types,
+     )
+     format = CsvFileFormat(
+         parse_options=parse_options,
+         read_options=read_options,
+         convert_options=convert_options,
+     )
+     return chain.parse_tabular(
+         output=output,
+         object_name=object_name,
+         model_name=model_name,
+         source=source,
+         nrows=nrows,
+         format=format,
+     )
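A usage sketch combining the knobs documented above (the bucket path and column name are hypothetical):

```py
import datachain as dc

chain = dc.from_csv(
    "s3://mybucket/dir",
    delimiter=";",                      # overrides any parse_options delimiter
    column_types={"price": "float64"},  # pyarrow alias; disables inference for it
    nrows=1_000,
)
```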