datachain 0.6.2__py3-none-any.whl → 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/lib/arrow.py +2 -15
- datachain/lib/data_model.py +10 -2
- datachain/lib/utils.py +30 -0
- {datachain-0.6.2.dist-info → datachain-0.6.3.dist-info}/METADATA +2 -2
- {datachain-0.6.2.dist-info → datachain-0.6.3.dist-info}/RECORD +9 -9
- {datachain-0.6.2.dist-info → datachain-0.6.3.dist-info}/WHEEL +1 -1
- {datachain-0.6.2.dist-info → datachain-0.6.3.dist-info}/LICENSE +0 -0
- {datachain-0.6.2.dist-info → datachain-0.6.3.dist-info}/entry_points.txt +0 -0
- {datachain-0.6.2.dist-info → datachain-0.6.3.dist-info}/top_level.txt +0 -0
datachain/lib/arrow.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import re
|
|
2
1
|
from collections.abc import Sequence
|
|
3
2
|
from tempfile import NamedTemporaryFile
|
|
4
3
|
from typing import TYPE_CHECKING, Any, Optional
|
|
@@ -13,6 +12,7 @@ from datachain.lib.file import ArrowRow, File
|
|
|
13
12
|
from datachain.lib.model_store import ModelStore
|
|
14
13
|
from datachain.lib.signal_schema import SignalSchema
|
|
15
14
|
from datachain.lib.udf import Generator
|
|
15
|
+
from datachain.lib.utils import normalize_col_names
|
|
16
16
|
|
|
17
17
|
if TYPE_CHECKING:
|
|
18
18
|
from datasets.features.features import Features
|
|
@@ -128,7 +128,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
|
|
|
128
128
|
signal_schema = _get_datachain_schema(schema)
|
|
129
129
|
if signal_schema:
|
|
130
130
|
return signal_schema.values
|
|
131
|
-
columns = _convert_col_names(col_names) # type: ignore[arg-type]
|
|
131
|
+
columns = list(normalize_col_names(col_names).keys()) # type: ignore[arg-type]
|
|
132
132
|
hf_schema = _get_hf_schema(schema)
|
|
133
133
|
if hf_schema:
|
|
134
134
|
return {
|
|
@@ -143,19 +143,6 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
|
|
|
143
143
|
return output
|
|
144
144
|
|
|
145
145
|
|
|
146
|
-
def _convert_col_names(col_names: Sequence[str]) -> list[str]:
|
|
147
|
-
default_column = 0
|
|
148
|
-
converted_col_names = []
|
|
149
|
-
for column in col_names:
|
|
150
|
-
column = column.lower()
|
|
151
|
-
column = re.sub("[^0-9a-z_]+", "", column)
|
|
152
|
-
if not column:
|
|
153
|
-
column = f"c{default_column}"
|
|
154
|
-
default_column += 1
|
|
155
|
-
converted_col_names.append(column)
|
|
156
|
-
return converted_col_names
|
|
157
|
-
|
|
158
|
-
|
|
159
146
|
def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type: # noqa: PLR0911
|
|
160
147
|
"""Convert pyarrow types to basic types."""
|
|
161
148
|
from datetime import datetime
|
datachain/lib/data_model.py
CHANGED
|
@@ -2,9 +2,10 @@ from collections.abc import Sequence
|
|
|
2
2
|
from datetime import datetime
|
|
3
3
|
from typing import ClassVar, Union, get_args, get_origin
|
|
4
4
|
|
|
5
|
-
from pydantic import BaseModel, create_model
|
|
5
|
+
from pydantic import BaseModel, Field, create_model
|
|
6
6
|
|
|
7
7
|
from datachain.lib.model_store import ModelStore
|
|
8
|
+
from datachain.lib.utils import normalize_col_names
|
|
8
9
|
|
|
9
10
|
StandardType = Union[
|
|
10
11
|
type[int],
|
|
@@ -60,7 +61,14 @@ def is_chain_type(t: type) -> bool:
|
|
|
60
61
|
|
|
61
62
|
|
|
62
63
|
def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
|
|
63
|
-
|
|
64
|
+
# Gets a map of a normalized_name -> original_name
|
|
65
|
+
columns = normalize_col_names(list(data_dict.keys()))
|
|
66
|
+
# We reverse if for convenience to original_name -> normalized_name
|
|
67
|
+
columns = {v: k for k, v in columns.items()}
|
|
68
|
+
|
|
69
|
+
fields = {
|
|
70
|
+
columns[name]: (anno, Field(alias=name)) for name, anno in data_dict.items()
|
|
71
|
+
}
|
|
64
72
|
return create_model(
|
|
65
73
|
name,
|
|
66
74
|
__base__=(DataModel,), # type: ignore[call-overload]
|
datachain/lib/utils.py
CHANGED
|
@@ -1,4 +1,6 @@
|
|
|
1
|
+
import re
|
|
1
2
|
from abc import ABC, abstractmethod
|
|
3
|
+
from collections.abc import Sequence
|
|
2
4
|
|
|
3
5
|
|
|
4
6
|
class AbstractUDF(ABC):
|
|
@@ -28,3 +30,31 @@ class DataChainParamsError(DataChainError):
|
|
|
28
30
|
class DataChainColumnError(DataChainParamsError):
|
|
29
31
|
def __init__(self, col_name, msg):
|
|
30
32
|
super().__init__(f"Error for column {col_name}: {msg}")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def normalize_col_names(col_names: Sequence[str]) -> dict[str, str]:
|
|
36
|
+
gen_col_counter = 0
|
|
37
|
+
new_col_names = {}
|
|
38
|
+
org_col_names = set(col_names)
|
|
39
|
+
|
|
40
|
+
for org_column in col_names:
|
|
41
|
+
new_column = org_column.lower()
|
|
42
|
+
new_column = re.sub("[^0-9a-z]+", "_", new_column)
|
|
43
|
+
new_column = new_column.strip("_")
|
|
44
|
+
|
|
45
|
+
generated_column = new_column
|
|
46
|
+
|
|
47
|
+
while (
|
|
48
|
+
not generated_column.isidentifier()
|
|
49
|
+
or generated_column in new_col_names
|
|
50
|
+
or (generated_column != org_column and generated_column in org_col_names)
|
|
51
|
+
):
|
|
52
|
+
if new_column:
|
|
53
|
+
generated_column = f"c{gen_col_counter}_{new_column}"
|
|
54
|
+
else:
|
|
55
|
+
generated_column = f"c{gen_col_counter}"
|
|
56
|
+
gen_col_counter += 1
|
|
57
|
+
|
|
58
|
+
new_col_names[generated_column] = org_column
|
|
59
|
+
|
|
60
|
+
return new_col_names
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.6.2
|
|
3
|
+
Version: 0.6.3
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -47,7 +47,7 @@ Requires-Dist: platformdirs
|
|
|
47
47
|
Requires-Dist: dvc-studio-client <1,>=0.21
|
|
48
48
|
Provides-Extra: dev
|
|
49
49
|
Requires-Dist: datachain[docs,tests] ; extra == 'dev'
|
|
50
|
-
Requires-Dist: mypy ==1.
|
|
50
|
+
Requires-Dist: mypy ==1.13.0 ; extra == 'dev'
|
|
51
51
|
Requires-Dist: types-python-dateutil ; extra == 'dev'
|
|
52
52
|
Requires-Dist: types-pytz ; extra == 'dev'
|
|
53
53
|
Requires-Dist: types-PyYAML ; extra == 'dev'
|
|
@@ -40,9 +40,9 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
|
|
|
40
40
|
datachain/data_storage/sqlite.py,sha256=jopfVftng157TVcBKMB_QPlbkE6fTatiY4GYSSLNkig,28737
|
|
41
41
|
datachain/data_storage/warehouse.py,sha256=iIjFOutYxhLev3CcUhUTwMJOkHeAEBwXZ2y3wmjrF1s,30756
|
|
42
42
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
43
|
-
datachain/lib/arrow.py,sha256=
|
|
43
|
+
datachain/lib/arrow.py,sha256=M6SM4u2LeHgylzkPZBWckFeZt3CH3ehpBod3nGl6OYY,9138
|
|
44
44
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
45
|
-
datachain/lib/data_model.py,sha256=
|
|
45
|
+
datachain/lib/data_model.py,sha256=dau4AlZBhOFvF7pEKMeqCeRkcFFg5KFvTBWW_2CdH5g,2371
|
|
46
46
|
datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
|
|
47
47
|
datachain/lib/dc.py,sha256=pOyE8LqIwo86GrZTSpSMUJAYYwep7nCdIxebkSYlMGo,84484
|
|
48
48
|
datachain/lib/file.py,sha256=LjTW_-PDAnoUhvyB4bJ8Y8n__XGqrxvmd9mDOF0Gir8,14875
|
|
@@ -59,7 +59,7 @@ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
|
|
|
59
59
|
datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
|
|
60
60
|
datachain/lib/udf.py,sha256=4CqK51n3bntXCmkwoOQIrX34wMKOknkC23HtR4D_2vM,12705
|
|
61
61
|
datachain/lib/udf_signature.py,sha256=GXw24A-Olna6DWCdgy2bC-gZh_gLGPQ-KvjuI6pUjC0,7281
|
|
62
|
-
datachain/lib/utils.py,sha256=
|
|
62
|
+
datachain/lib/utils.py,sha256=6NwgWLl5JrgtD4rsSFEe-yR2ntEwJMJEtAZ3FIxK3fg,1529
|
|
63
63
|
datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
64
64
|
datachain/lib/webdataset.py,sha256=o7SHk5HOUWsZ5Ln04xOM04eQqiBHiJNO7xLgyVBrwo8,6924
|
|
65
65
|
datachain/lib/webdataset_laion.py,sha256=aGMWeFmeYNK75ewO9JTA11iB1i3QtTzUfenQA5jajfo,2535
|
|
@@ -101,9 +101,9 @@ datachain/sql/sqlite/base.py,sha256=aHSZVvh4XSVkvZ07h3jMoRlHI4sWD8y3SnmGs9xMG9Y,
|
|
|
101
101
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
102
102
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
103
103
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
104
|
-
datachain-0.6.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
105
|
-
datachain-0.6.
|
|
106
|
-
datachain-0.6.
|
|
107
|
-
datachain-0.6.
|
|
108
|
-
datachain-0.6.
|
|
109
|
-
datachain-0.6.2.dist-info/RECORD,,
|
|
104
|
+
datachain-0.6.3.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
105
|
+
datachain-0.6.3.dist-info/METADATA,sha256=BnPIINjkfA0P2Sj9mRziNuKm8SWyINrf8qqCic7NUAo,17188
|
|
106
|
+
datachain-0.6.3.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
|
107
|
+
datachain-0.6.3.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
108
|
+
datachain-0.6.3.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
109
|
+
datachain-0.6.3.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|