datachain 0.6.2__py3-none-any.whl → 0.6.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/lib/arrow.py CHANGED
@@ -1,4 +1,3 @@
1
- import re
2
1
  from collections.abc import Sequence
3
2
  from tempfile import NamedTemporaryFile
4
3
  from typing import TYPE_CHECKING, Any, Optional
@@ -13,6 +12,7 @@ from datachain.lib.file import ArrowRow, File
13
12
  from datachain.lib.model_store import ModelStore
14
13
  from datachain.lib.signal_schema import SignalSchema
15
14
  from datachain.lib.udf import Generator
15
+ from datachain.lib.utils import normalize_col_names
16
16
 
17
17
  if TYPE_CHECKING:
18
18
  from datasets.features.features import Features
@@ -128,7 +128,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
128
128
  signal_schema = _get_datachain_schema(schema)
129
129
  if signal_schema:
130
130
  return signal_schema.values
131
- columns = _convert_col_names(col_names) # type: ignore[arg-type]
131
+ columns = list(normalize_col_names(col_names).keys()) # type: ignore[arg-type]
132
132
  hf_schema = _get_hf_schema(schema)
133
133
  if hf_schema:
134
134
  return {
@@ -143,19 +143,6 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
143
143
  return output
144
144
 
145
145
 
146
- def _convert_col_names(col_names: Sequence[str]) -> list[str]:
147
- default_column = 0
148
- converted_col_names = []
149
- for column in col_names:
150
- column = column.lower()
151
- column = re.sub("[^0-9a-z_]+", "", column)
152
- if not column:
153
- column = f"c{default_column}"
154
- default_column += 1
155
- converted_col_names.append(column)
156
- return converted_col_names
157
-
158
-
159
146
  def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa: PLR0911
160
147
  """Convert pyarrow types to basic types."""
161
148
  from datetime import datetime
@@ -2,9 +2,10 @@ from collections.abc import Sequence
2
2
  from datetime import datetime
3
3
  from typing import ClassVar, Union, get_args, get_origin
4
4
 
5
- from pydantic import BaseModel, create_model
5
+ from pydantic import BaseModel, Field, create_model
6
6
 
7
7
  from datachain.lib.model_store import ModelStore
8
+ from datachain.lib.utils import normalize_col_names
8
9
 
9
10
  StandardType = Union[
10
11
  type[int],
@@ -60,7 +61,14 @@ def is_chain_type(t: type) -> bool:
60
61
 
61
62
 
62
63
  def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
63
- fields = {name: (anno, ...) for name, anno in data_dict.items()}
64
+ # Gets a map of a normalized_name -> original_name
65
+ columns = normalize_col_names(list(data_dict.keys()))
66
+ # We reverse it for convenience to original_name -> normalized_name
67
+ columns = {v: k for k, v in columns.items()}
68
+
69
+ fields = {
70
+ columns[name]: (anno, Field(alias=name)) for name, anno in data_dict.items()
71
+ }
64
72
  return create_model(
65
73
  name,
66
74
  __base__=(DataModel,), # type: ignore[call-overload]
datachain/lib/utils.py CHANGED
@@ -1,4 +1,6 @@
1
+ import re
1
2
  from abc import ABC, abstractmethod
3
+ from collections.abc import Sequence
2
4
 
3
5
 
4
6
  class AbstractUDF(ABC):
@@ -28,3 +30,31 @@ class DataChainParamsError(DataChainError):
28
30
  class DataChainColumnError(DataChainParamsError):
29
31
  def __init__(self, col_name, msg):
30
32
  super().__init__(f"Error for column {col_name}: {msg}")
33
+
34
+
35
+ def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
36
+ gen_col_counter = 0
37
+ new_col_names = {}
38
+ org_col_names = set(col_names)
39
+
40
+ for org_column in col_names:
41
+ new_column = org_column.lower()
42
+ new_column = re.sub("[^0-9a-z]+", "_", new_column)
43
+ new_column = new_column.strip("_")
44
+
45
+ generated_column = new_column
46
+
47
+ while (
48
+ not generated_column.isidentifier()
49
+ or generated_column in new_col_names
50
+ or (generated_column != org_column and generated_column in org_col_names)
51
+ ):
52
+ if new_column:
53
+ generated_column = f"c{gen_col_counter}_{new_column}"
54
+ else:
55
+ generated_column = f"c{gen_col_counter}"
56
+ gen_col_counter += 1
57
+
58
+ new_col_names[generated_column] = org_column
59
+
60
+ return new_col_names
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.6.2
3
+ Version: 0.6.3
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -47,7 +47,7 @@ Requires-Dist: platformdirs
47
47
  Requires-Dist: dvc-studio-client <1,>=0.21
48
48
  Provides-Extra: dev
49
49
  Requires-Dist: datachain[docs,tests] ; extra == 'dev'
50
- Requires-Dist: mypy ==1.12.1 ; extra == 'dev'
50
+ Requires-Dist: mypy ==1.13.0 ; extra == 'dev'
51
51
  Requires-Dist: types-python-dateutil ; extra == 'dev'
52
52
  Requires-Dist: types-pytz ; extra == 'dev'
53
53
  Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -40,9 +40,9 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
40
40
  datachain/data_storage/sqlite.py,sha256=jopfVftng157TVcBKMB_QPlbkE6fTatiY4GYSSLNkig,28737
41
41
  datachain/data_storage/warehouse.py,sha256=iIjFOutYxhLev3CcUhUTwMJOkHeAEBwXZ2y3wmjrF1s,30756
42
42
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
43
- datachain/lib/arrow.py,sha256=0R2CYsN82nNa5_03iS6jVix9EKeeqNZNAMgpSQP2hfo,9482
43
+ datachain/lib/arrow.py,sha256=M6SM4u2LeHgylzkPZBWckFeZt3CH3ehpBod3nGl6OYY,9138
44
44
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
45
- datachain/lib/data_model.py,sha256=ECTbvlnzM98hp2mZ4fo82Yi0-MuoqTIQasQKGIyd89I,2040
45
+ datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
46
46
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
47
47
  datachain/lib/dc.py,sha256=pOyE8LqIwo86GrZTSpSMUJAYYwep7nCdIxebkSYlMGo,84484
48
48
  datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
@@ -59,7 +59,7 @@ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
59
59
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
60
60
  datachain/lib/udf.py,sha256=4CqK51n3bntXCmkwoOQIrX34wMKOknkC23HtR4D_2vM,12705
61
61
  datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
62
- datachain/lib/utils.py,sha256=12elAX6eTFgMGKIf2UfZ4IW07kRwjK6wz8yGE41RtNM,618
62
+ datachain/lib/utils.py,sha256=6NwgWLl5JrgtD4rsSFEe-yR2ntEwJMJEtAZ3FIxK3fg,1529
63
63
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
64
64
  datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
65
65
  datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
@@ -101,9 +101,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
101
101
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
102
102
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
103
103
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
104
- datachain-0.6.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
105
- datachain-0.6.2.dist-info/METADATA,sha256=QJGHTrGZapho1am27dPKQCOKG_FiEMsvWNLloeU8qVQ,17188
106
- datachain-0.6.2.dist-info/WHEEL,sha256=OVMc5UfuAQiSplgO0_WdW7vXVGAt9Hdd6qtN4HotdyA,91
107
- datachain-0.6.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
108
- datachain-0.6.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
109
- datachain-0.6.2.dist-info/RECORD,,
104
+ datachain-0.6.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
105
+ datachain-0.6.3.dist-info/METADATA,sha256=BnPIINjkfA0P2Sj9mRziNuKm8SWyINrf8qqCic7NUAo,17188
106
+ datachain-0.6.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
107
+ datachain-0.6.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
108
+ datachain-0.6.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
109
+ datachain-0.6.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.2.0)
2
+ Generator: setuptools (75.3.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5