datachain 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/_version.py +2 -2
- datachain/asyn.py +3 -3
- datachain/catalog/__init__.py +3 -3
- datachain/catalog/catalog.py +6 -6
- datachain/catalog/loader.py +3 -3
- datachain/cli.py +2 -1
- datachain/client/azure.py +37 -1
- datachain/client/fsspec.py +1 -1
- datachain/client/local.py +1 -1
- datachain/data_storage/__init__.py +1 -1
- datachain/data_storage/metastore.py +11 -3
- datachain/data_storage/schema.py +2 -3
- datachain/data_storage/warehouse.py +31 -30
- datachain/dataset.py +1 -3
- datachain/lib/arrow.py +85 -0
- datachain/lib/dc.py +377 -178
- datachain/lib/feature.py +41 -90
- datachain/lib/feature_registry.py +3 -1
- datachain/lib/feature_utils.py +2 -2
- datachain/lib/file.py +20 -20
- datachain/lib/image.py +9 -2
- datachain/lib/meta_formats.py +66 -34
- datachain/lib/settings.py +5 -5
- datachain/lib/signal_schema.py +103 -105
- datachain/lib/udf.py +3 -12
- datachain/lib/udf_signature.py +11 -6
- datachain/lib/webdataset_laion.py +5 -22
- datachain/listing.py +8 -8
- datachain/node.py +1 -1
- datachain/progress.py +1 -1
- datachain/query/builtins.py +1 -1
- datachain/query/dataset.py +39 -110
- datachain/query/dispatch.py +1 -1
- datachain/query/metrics.py +19 -0
- datachain/query/schema.py +13 -3
- datachain/sql/__init__.py +1 -1
- datachain/utils.py +1 -122
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/METADATA +10 -3
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/RECORD +43 -42
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/WHEEL +1 -1
- datachain/lib/parquet.py +0 -32
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/LICENSE +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.1.13.dist-info → datachain-0.2.0.dist-info}/top_level.txt +0 -0
datachain/query/dataset.py
CHANGED
|
@@ -56,13 +56,13 @@ from datachain.storage import Storage, StorageURI
|
|
|
56
56
|
from datachain.utils import batched, determine_processes
|
|
57
57
|
|
|
58
58
|
from .batch import RowBatch
|
|
59
|
+
from .metrics import metrics
|
|
59
60
|
from .schema import C, UDFParamSpec, normalize_param
|
|
60
61
|
from .session import Session
|
|
61
62
|
from .udf import UDFBase, UDFClassWrapper, UDFFactory, UDFType
|
|
62
63
|
|
|
63
64
|
if TYPE_CHECKING:
|
|
64
65
|
import pandas as pd
|
|
65
|
-
from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
|
|
66
66
|
from sqlalchemy.sql.elements import ClauseElement
|
|
67
67
|
from sqlalchemy.sql.schema import Table
|
|
68
68
|
from sqlalchemy.sql.selectable import GenerativeSelect
|
|
@@ -71,7 +71,6 @@ if TYPE_CHECKING:
|
|
|
71
71
|
from datachain.catalog import Catalog
|
|
72
72
|
from datachain.data_storage import AbstractWarehouse
|
|
73
73
|
from datachain.dataset import DatasetRecord
|
|
74
|
-
from datachain.sql.types import SQLType
|
|
75
74
|
|
|
76
75
|
from .udf import UDFResult
|
|
77
76
|
|
|
@@ -197,7 +196,7 @@ class IndexingStep(StartingStep):
|
|
|
197
196
|
def apply(self):
|
|
198
197
|
self.catalog.index([self.path], **self.kwargs)
|
|
199
198
|
uri, path = self.parse_path()
|
|
200
|
-
|
|
199
|
+
_partial_id, partial_path = self.catalog.metastore.get_valid_partial_id(
|
|
201
200
|
uri, path
|
|
202
201
|
)
|
|
203
202
|
dataset = self.catalog.get_dataset(Storage.dataset_name(uri, partial_path))
|
|
@@ -523,30 +522,23 @@ class UDF(Step, ABC):
|
|
|
523
522
|
"cache": self.cache,
|
|
524
523
|
}
|
|
525
524
|
|
|
526
|
-
feature_module_name, feature_file = self.process_feature_module()
|
|
527
|
-
|
|
528
|
-
# Write the module content to a .py file
|
|
529
|
-
with open(f"{feature_module_name}.py", "w") as module_file:
|
|
530
|
-
module_file.write(feature_file)
|
|
531
|
-
|
|
532
|
-
process_data = dumps(udf_info, recurse=True)
|
|
533
525
|
# Run the UDFDispatcher in another process to avoid needing
|
|
534
526
|
# if __name__ == '__main__': in user scripts
|
|
535
527
|
datachain_exec_path = os.environ.get("DATACHAIN_EXEC_PATH", "datachain")
|
|
536
528
|
|
|
537
529
|
envs = dict(os.environ)
|
|
538
530
|
envs.update({"PYTHONPATH": os.getcwd()})
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
531
|
+
with self.process_feature_module():
|
|
532
|
+
process_data = dumps(udf_info, recurse=True)
|
|
533
|
+
result = subprocess.run( # noqa: S603
|
|
534
|
+
[datachain_exec_path, "--internal-run-udf"],
|
|
542
535
|
input=process_data,
|
|
543
536
|
check=False,
|
|
544
537
|
env=envs,
|
|
545
538
|
)
|
|
546
539
|
if result.returncode != 0:
|
|
547
540
|
raise RuntimeError("UDF Execution Failed!")
|
|
548
|
-
|
|
549
|
-
os.unlink(f"{feature_module_name}.py")
|
|
541
|
+
|
|
550
542
|
else:
|
|
551
543
|
# Otherwise process single-threaded (faster for smaller UDFs)
|
|
552
544
|
# Optionally instantiate the UDF instance if a class is provided.
|
|
@@ -600,6 +592,7 @@ class UDF(Step, ABC):
|
|
|
600
592
|
self.catalog.warehouse.close()
|
|
601
593
|
raise
|
|
602
594
|
|
|
595
|
+
@contextlib.contextmanager
|
|
603
596
|
def process_feature_module(self):
|
|
604
597
|
# Generate a random name for the feature module
|
|
605
598
|
feature_module_name = "tmp" + _random_string(10)
|
|
@@ -611,10 +604,14 @@ class UDF(Step, ABC):
|
|
|
611
604
|
for name, obj in inspect.getmembers(sys.modules["__main__"], _imports)
|
|
612
605
|
if not (name.startswith("__") and name.endswith("__"))
|
|
613
606
|
]
|
|
607
|
+
main_module = sys.modules["__main__"]
|
|
608
|
+
|
|
614
609
|
# Get the feature classes from the main module
|
|
615
|
-
feature_classes =
|
|
616
|
-
|
|
617
|
-
|
|
610
|
+
feature_classes = {
|
|
611
|
+
name: obj
|
|
612
|
+
for name, obj in main_module.__dict__.items()
|
|
613
|
+
if _feature_predicate(obj)
|
|
614
|
+
}
|
|
618
615
|
# Get the source code of the feature classes
|
|
619
616
|
feature_sources = [source.getsource(cls) for _, cls in feature_classes.items()]
|
|
620
617
|
# Set the module name for the feature classes to the generated name
|
|
@@ -626,7 +623,18 @@ class UDF(Step, ABC):
|
|
|
626
623
|
# Combine the import lines and feature sources
|
|
627
624
|
feature_file = "".join(import_lines) + "\n".join(feature_sources)
|
|
628
625
|
|
|
629
|
-
|
|
626
|
+
# Write the module content to a .py file
|
|
627
|
+
with open(f"{feature_module_name}.py", "w") as module_file:
|
|
628
|
+
module_file.write(feature_file)
|
|
629
|
+
|
|
630
|
+
try:
|
|
631
|
+
yield feature_module_name
|
|
632
|
+
finally:
|
|
633
|
+
for cls in feature_classes.values():
|
|
634
|
+
cls.__module__ = main_module.__name__
|
|
635
|
+
os.unlink(f"{feature_module_name}.py")
|
|
636
|
+
# Remove the dynamic module from sys.modules
|
|
637
|
+
del sys.modules[feature_module_name]
|
|
630
638
|
|
|
631
639
|
def create_partitions_table(self, query: Select) -> "Table":
|
|
632
640
|
"""
|
|
@@ -685,8 +693,7 @@ class UDF(Step, ABC):
|
|
|
685
693
|
)
|
|
686
694
|
|
|
687
695
|
query, tables = self.process_input_query(query)
|
|
688
|
-
for t in tables
|
|
689
|
-
temp_tables.append(t.name)
|
|
696
|
+
temp_tables.extend(t.name for t in tables)
|
|
690
697
|
udf_table = self.create_udf_table(_query)
|
|
691
698
|
temp_tables.append(udf_table.name)
|
|
692
699
|
self.populate_udf_table(udf_table, query)
|
|
@@ -1120,6 +1127,12 @@ class DatasetQuery:
|
|
|
1120
1127
|
indexing_feature_schema: Optional[dict] = None,
|
|
1121
1128
|
indexing_column_types: Optional[dict[str, Any]] = None,
|
|
1122
1129
|
):
|
|
1130
|
+
if client_config is None:
|
|
1131
|
+
client_config = {}
|
|
1132
|
+
|
|
1133
|
+
if anon:
|
|
1134
|
+
client_config["anon"] = True
|
|
1135
|
+
|
|
1123
1136
|
self.steps: list[Step] = []
|
|
1124
1137
|
self.catalog = catalog or get_catalog(client_config=client_config)
|
|
1125
1138
|
self._chunk_index: Optional[int] = None
|
|
@@ -1134,22 +1147,14 @@ class DatasetQuery:
|
|
|
1134
1147
|
self.column_types: Optional[dict[str, Any]] = None
|
|
1135
1148
|
self.session = Session.get(session, catalog=catalog)
|
|
1136
1149
|
|
|
1137
|
-
if client_config is None:
|
|
1138
|
-
client_config = {}
|
|
1139
|
-
|
|
1140
|
-
if anon:
|
|
1141
|
-
client_config["anon"] = True
|
|
1142
|
-
|
|
1143
1150
|
if path:
|
|
1144
|
-
self.starting_step = IndexingStep(
|
|
1145
|
-
path, self.catalog, {"client_config": client_config}, recursive
|
|
1146
|
-
)
|
|
1151
|
+
self.starting_step = IndexingStep(path, self.catalog, {}, recursive)
|
|
1147
1152
|
self.feature_schema = indexing_feature_schema
|
|
1148
1153
|
self.column_types = indexing_column_types
|
|
1149
1154
|
elif name:
|
|
1150
1155
|
ds = self.catalog.get_dataset(name)
|
|
1151
1156
|
self.version = version or ds.latest_version
|
|
1152
|
-
self.feature_schema = ds.feature_schema
|
|
1157
|
+
self.feature_schema = ds.get_version(self.version).feature_schema
|
|
1153
1158
|
self.column_types = copy(ds.schema)
|
|
1154
1159
|
if "id" in self.column_types:
|
|
1155
1160
|
self.column_types.pop("id")
|
|
@@ -1348,8 +1353,7 @@ class DatasetQuery:
|
|
|
1348
1353
|
MapperCls = OrderedMapper if query._order_by_clauses else AsyncMapper # noqa: N806
|
|
1349
1354
|
with contextlib.closing(row_iter()) as rows:
|
|
1350
1355
|
mapper = MapperCls(get_params, rows, workers=workers)
|
|
1351
|
-
|
|
1352
|
-
yield params
|
|
1356
|
+
yield from mapper.iterate()
|
|
1353
1357
|
finally:
|
|
1354
1358
|
self.cleanup()
|
|
1355
1359
|
|
|
@@ -1386,82 +1390,6 @@ class DatasetQuery:
|
|
|
1386
1390
|
records = self.to_records()
|
|
1387
1391
|
return pd.DataFrame.from_records(records)
|
|
1388
1392
|
|
|
1389
|
-
@classmethod
|
|
1390
|
-
def from_dataframe(
|
|
1391
|
-
cls,
|
|
1392
|
-
df: Union["DataFrameXchg", "pd.DataFrame"],
|
|
1393
|
-
name: str = "",
|
|
1394
|
-
version: Optional[int] = None,
|
|
1395
|
-
catalog: Optional["Catalog"] = None,
|
|
1396
|
-
session: Optional[Session] = None,
|
|
1397
|
-
) -> "Self":
|
|
1398
|
-
from datachain.utils import dtype_mapper
|
|
1399
|
-
|
|
1400
|
-
catalog = catalog or get_catalog()
|
|
1401
|
-
assert catalog is not None
|
|
1402
|
-
session = Session.get(session, catalog=catalog)
|
|
1403
|
-
assert session is not None
|
|
1404
|
-
|
|
1405
|
-
try:
|
|
1406
|
-
if name and version and catalog.get_dataset(name).has_version(version):
|
|
1407
|
-
raise RuntimeError(f"Dataset {name} already has version {version}")
|
|
1408
|
-
except DatasetNotFoundError:
|
|
1409
|
-
pass
|
|
1410
|
-
|
|
1411
|
-
if not name and version:
|
|
1412
|
-
raise RuntimeError("Cannot set version for temporary datasets")
|
|
1413
|
-
|
|
1414
|
-
import pandas as pd # noqa: F401
|
|
1415
|
-
from pandas.api.interchange import from_dataframe
|
|
1416
|
-
|
|
1417
|
-
# This is not optimal for dataframes other than pd.DataFrame, as it may copy
|
|
1418
|
-
# all the data to a new dataframe.
|
|
1419
|
-
pd_df = from_dataframe(df)
|
|
1420
|
-
|
|
1421
|
-
dtype: dict[str, type[SQLType]] = {
|
|
1422
|
-
str(pd_df.columns[i]): dtype_mapper(pd_df.iloc[:, i])
|
|
1423
|
-
for i in range(len(pd_df.columns))
|
|
1424
|
-
}
|
|
1425
|
-
|
|
1426
|
-
name = name or session.generate_temp_dataset_name()
|
|
1427
|
-
dataset = catalog.create_dataset(
|
|
1428
|
-
name,
|
|
1429
|
-
version=version,
|
|
1430
|
-
columns=[Column(name, typ) for name, typ in dtype.items()],
|
|
1431
|
-
)
|
|
1432
|
-
version = version or dataset.latest_version
|
|
1433
|
-
|
|
1434
|
-
dr = catalog.warehouse.dataset_rows(dataset)
|
|
1435
|
-
pd_df.to_sql(
|
|
1436
|
-
dr.table.name,
|
|
1437
|
-
catalog.warehouse.db.engine,
|
|
1438
|
-
if_exists="append",
|
|
1439
|
-
index=False,
|
|
1440
|
-
chunksize=10_000,
|
|
1441
|
-
dtype=dtype,
|
|
1442
|
-
)
|
|
1443
|
-
|
|
1444
|
-
catalog.metastore.update_dataset_status(
|
|
1445
|
-
dataset, DatasetStatus.COMPLETE, version=version
|
|
1446
|
-
)
|
|
1447
|
-
catalog.update_dataset_version_with_warehouse_info(dataset, version)
|
|
1448
|
-
return cls(name=name, version=version, catalog=catalog, session=session)
|
|
1449
|
-
|
|
1450
|
-
from_pandas = from_dataframe
|
|
1451
|
-
|
|
1452
|
-
@classmethod
|
|
1453
|
-
def from_parquet(
|
|
1454
|
-
cls,
|
|
1455
|
-
uri: str,
|
|
1456
|
-
*args,
|
|
1457
|
-
**kwargs,
|
|
1458
|
-
) -> "Self":
|
|
1459
|
-
import pandas as pd
|
|
1460
|
-
|
|
1461
|
-
pd_df = pd.read_parquet(uri, dtype_backend="pyarrow")
|
|
1462
|
-
|
|
1463
|
-
return cls.from_dataframe(pd_df, *args, **kwargs)
|
|
1464
|
-
|
|
1465
1393
|
def shuffle(self) -> "Self":
|
|
1466
1394
|
# ToDo: implement shaffle based on seed and/or generating random column
|
|
1467
1395
|
return self.order_by(C.random)
|
|
@@ -1853,6 +1781,7 @@ def _get_output_fd_for_write() -> Union[str, int]:
|
|
|
1853
1781
|
class ExecutionResult:
|
|
1854
1782
|
preview: list[dict] = attrs.field(factory=list)
|
|
1855
1783
|
dataset: Optional[tuple[str, int]] = None
|
|
1784
|
+
metrics: dict[str, Any] = attrs.field(factory=dict)
|
|
1856
1785
|
|
|
1857
1786
|
|
|
1858
1787
|
def _send_result(dataset_query: DatasetQuery) -> None:
|
|
@@ -1886,7 +1815,7 @@ def _send_result(dataset_query: DatasetQuery) -> None:
|
|
|
1886
1815
|
dataset = dataset_query.name, dataset_query.version
|
|
1887
1816
|
|
|
1888
1817
|
preview = preview_query.to_records()
|
|
1889
|
-
result = ExecutionResult(preview, dataset)
|
|
1818
|
+
result = ExecutionResult(preview, dataset, metrics)
|
|
1890
1819
|
data = attrs.asdict(result)
|
|
1891
1820
|
|
|
1892
1821
|
with open(_get_output_fd_for_write(), mode="w") as f:
|
datachain/query/dispatch.py
CHANGED
|
@@ -257,7 +257,7 @@ class UDFDispatcher:
|
|
|
257
257
|
|
|
258
258
|
if self.buffer_size < n_workers:
|
|
259
259
|
raise RuntimeError(
|
|
260
|
-
|
|
260
|
+
"Parallel run error: buffer size is smaller than "
|
|
261
261
|
f"number of workers: {self.buffer_size} < {n_workers}"
|
|
262
262
|
)
|
|
263
263
|
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
from typing import Optional, Union
|
|
2
|
+
|
|
3
|
+
metrics: dict[str, Union[str, int, float, bool, None]] = {}
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def set(key: str, value: Union[str, int, float, bool, None]) -> None: # noqa: PYI041
|
|
7
|
+
"""Set a metric value."""
|
|
8
|
+
if not isinstance(key, str):
|
|
9
|
+
raise TypeError("Key must be a string")
|
|
10
|
+
if not key:
|
|
11
|
+
raise ValueError("Key must not be empty")
|
|
12
|
+
if not isinstance(value, (str, int, float, bool, type(None))):
|
|
13
|
+
raise TypeError("Value must be a string, int, float or bool")
|
|
14
|
+
metrics[key] = value
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get(key: str) -> Optional[Union[str, int, float, bool]]:
|
|
18
|
+
"""Get a metric value."""
|
|
19
|
+
return metrics[key]
|
datachain/query/schema.py
CHANGED
|
@@ -18,20 +18,30 @@ if TYPE_CHECKING:
|
|
|
18
18
|
from datachain.dataset import RowDict
|
|
19
19
|
|
|
20
20
|
|
|
21
|
+
DEFAULT_DELIMITER = "__"
|
|
22
|
+
|
|
23
|
+
|
|
21
24
|
class ColumnMeta(type):
|
|
25
|
+
@staticmethod
|
|
26
|
+
def to_db_name(name: str) -> str:
|
|
27
|
+
return name.replace(".", DEFAULT_DELIMITER)
|
|
28
|
+
|
|
22
29
|
def __getattr__(cls, name: str):
|
|
23
|
-
return cls(name)
|
|
30
|
+
return cls(ColumnMeta.to_db_name(name))
|
|
24
31
|
|
|
25
32
|
|
|
26
33
|
class Column(sa.ColumnClause, metaclass=ColumnMeta):
|
|
27
34
|
inherit_cache: Optional[bool] = True
|
|
28
35
|
|
|
29
36
|
def __init__(self, text, type_=None, is_literal=False, _selectable=None):
|
|
30
|
-
self.name = text
|
|
37
|
+
self.name = ColumnMeta.to_db_name(text)
|
|
31
38
|
super().__init__(
|
|
32
|
-
|
|
39
|
+
self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
|
|
33
40
|
)
|
|
34
41
|
|
|
42
|
+
def __getattr__(self, name: str):
|
|
43
|
+
return Column(self.name + DEFAULT_DELIMITER + name)
|
|
44
|
+
|
|
35
45
|
def glob(self, glob_str):
|
|
36
46
|
return self.op("GLOB")(glob_str)
|
|
37
47
|
|
datachain/sql/__init__.py
CHANGED
datachain/utils.py
CHANGED
|
@@ -18,9 +18,6 @@ from dateutil.parser import isoparse
|
|
|
18
18
|
|
|
19
19
|
if TYPE_CHECKING:
|
|
20
20
|
import pandas as pd
|
|
21
|
-
import pyarrow as pa
|
|
22
|
-
|
|
23
|
-
from datachain.sql.types import SQLType
|
|
24
21
|
|
|
25
22
|
NUL = b"\0"
|
|
26
23
|
TIME_ZERO = datetime.fromtimestamp(0, tz=timezone.utc)
|
|
@@ -78,7 +75,7 @@ class DataChainDir:
|
|
|
78
75
|
if create:
|
|
79
76
|
instance.init()
|
|
80
77
|
else:
|
|
81
|
-
NotADirectoryError(root)
|
|
78
|
+
raise NotADirectoryError(root)
|
|
82
79
|
return instance
|
|
83
80
|
|
|
84
81
|
|
|
@@ -363,121 +360,3 @@ class JSONSerialize(json.JSONEncoder):
|
|
|
363
360
|
return str(obj)
|
|
364
361
|
|
|
365
362
|
return super().default(obj)
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
def dtype_mapper(col: Union["pd.Index", "pd.Series"]) -> type["SQLType"]: # noqa: PLR0911
|
|
369
|
-
from pandas import ArrowDtype
|
|
370
|
-
from pandas.api.types import infer_dtype
|
|
371
|
-
|
|
372
|
-
from datachain.sql.types import (
|
|
373
|
-
Binary,
|
|
374
|
-
Boolean,
|
|
375
|
-
DateTime,
|
|
376
|
-
Float,
|
|
377
|
-
Float32,
|
|
378
|
-
Float64,
|
|
379
|
-
Int,
|
|
380
|
-
Int32,
|
|
381
|
-
Int64,
|
|
382
|
-
String,
|
|
383
|
-
UInt64,
|
|
384
|
-
)
|
|
385
|
-
|
|
386
|
-
if isinstance(col.dtype, ArrowDtype):
|
|
387
|
-
return arrow_type_mapper(col.dtype.pyarrow_dtype)
|
|
388
|
-
|
|
389
|
-
col_type = infer_dtype(col, skipna=True)
|
|
390
|
-
|
|
391
|
-
if col_type in ("datetime", "datetime64"):
|
|
392
|
-
return DateTime
|
|
393
|
-
if col_type == "bytes":
|
|
394
|
-
return Binary
|
|
395
|
-
if col_type == "floating":
|
|
396
|
-
if col.dtype == "float32":
|
|
397
|
-
return Float32
|
|
398
|
-
if col.dtype == "float64":
|
|
399
|
-
return Float64
|
|
400
|
-
return Float
|
|
401
|
-
if col_type == "integer":
|
|
402
|
-
if col.dtype.name.lower() in ("int8", "int16", "int32"):
|
|
403
|
-
return Int32
|
|
404
|
-
if col.dtype.name.lower() == "int64":
|
|
405
|
-
return Int64
|
|
406
|
-
if col.dtype.name.lower().startswith("uint"):
|
|
407
|
-
return UInt64
|
|
408
|
-
return Int
|
|
409
|
-
if col_type == "boolean":
|
|
410
|
-
return Boolean
|
|
411
|
-
if col_type == "date":
|
|
412
|
-
return DateTime
|
|
413
|
-
if col_type in (
|
|
414
|
-
"complex",
|
|
415
|
-
"time",
|
|
416
|
-
"timedelta",
|
|
417
|
-
"timedelta64",
|
|
418
|
-
"period",
|
|
419
|
-
"interval",
|
|
420
|
-
):
|
|
421
|
-
raise ValueError(f"{col_type!r} datatypes not supported")
|
|
422
|
-
return String
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
def arrow_type_mapper(col_type: "pa.DataType") -> type["SQLType"]: # noqa: PLR0911,C901
|
|
426
|
-
try:
|
|
427
|
-
import pyarrow as pa
|
|
428
|
-
except ImportError as exc:
|
|
429
|
-
raise ImportError(
|
|
430
|
-
"Missing required dependency pyarrow for inferring types"
|
|
431
|
-
) from exc
|
|
432
|
-
|
|
433
|
-
from datachain.sql.types import (
|
|
434
|
-
JSON,
|
|
435
|
-
Array,
|
|
436
|
-
Binary,
|
|
437
|
-
Boolean,
|
|
438
|
-
DateTime,
|
|
439
|
-
Float,
|
|
440
|
-
Float32,
|
|
441
|
-
Float64,
|
|
442
|
-
Int,
|
|
443
|
-
Int32,
|
|
444
|
-
Int64,
|
|
445
|
-
String,
|
|
446
|
-
UInt64,
|
|
447
|
-
)
|
|
448
|
-
|
|
449
|
-
if pa.types.is_timestamp(col_type):
|
|
450
|
-
return DateTime
|
|
451
|
-
if pa.types.is_binary(col_type):
|
|
452
|
-
return Binary
|
|
453
|
-
if pa.types.is_floating(col_type):
|
|
454
|
-
if pa.types.is_float32(col_type):
|
|
455
|
-
return Float32
|
|
456
|
-
if pa.types.is_float64(col_type):
|
|
457
|
-
return Float64
|
|
458
|
-
return Float
|
|
459
|
-
if pa.types.is_integer(col_type):
|
|
460
|
-
if (
|
|
461
|
-
pa.types.is_int8(col_type)
|
|
462
|
-
or pa.types.is_int16(col_type)
|
|
463
|
-
or pa.types.is_int32(col_type)
|
|
464
|
-
):
|
|
465
|
-
return Int32
|
|
466
|
-
if pa.types.is_int64(col_type):
|
|
467
|
-
return Int64
|
|
468
|
-
if pa.types.is_unsigned_integer(col_type):
|
|
469
|
-
return UInt64
|
|
470
|
-
return Int
|
|
471
|
-
if pa.types.is_boolean(col_type):
|
|
472
|
-
return Boolean
|
|
473
|
-
if pa.types.is_date(col_type):
|
|
474
|
-
return DateTime
|
|
475
|
-
if pa.types.is_string(col_type):
|
|
476
|
-
return String
|
|
477
|
-
if pa.types.is_list(col_type):
|
|
478
|
-
return Array(arrow_type_mapper(col_type.value_type)) # type: ignore[return-value]
|
|
479
|
-
if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
|
|
480
|
-
return JSON
|
|
481
|
-
if isinstance(col_type, pa.lib.DictionaryType):
|
|
482
|
-
return arrow_type_mapper(col_type.value_type) # type: ignore[return-value]
|
|
483
|
-
raise ValueError(f"{col_type!r} datatypes not supported")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -44,12 +44,19 @@ Requires-Dist: torch >=2.1.0 ; extra == 'cv'
|
|
|
44
44
|
Requires-Dist: torchvision ; extra == 'cv'
|
|
45
45
|
Requires-Dist: transformers >=4.36.0 ; extra == 'cv'
|
|
46
46
|
Provides-Extra: dev
|
|
47
|
-
Requires-Dist: datachain[tests] ; extra == 'dev'
|
|
48
|
-
Requires-Dist: mypy ==1.10.
|
|
47
|
+
Requires-Dist: datachain[docs,tests] ; extra == 'dev'
|
|
48
|
+
Requires-Dist: mypy ==1.10.1 ; extra == 'dev'
|
|
49
49
|
Requires-Dist: types-python-dateutil ; extra == 'dev'
|
|
50
50
|
Requires-Dist: types-PyYAML ; extra == 'dev'
|
|
51
51
|
Requires-Dist: types-requests ; extra == 'dev'
|
|
52
52
|
Requires-Dist: types-ujson ; extra == 'dev'
|
|
53
|
+
Provides-Extra: docs
|
|
54
|
+
Requires-Dist: mkdocs >=1.5.2 ; extra == 'docs'
|
|
55
|
+
Requires-Dist: mkdocs-gen-files >=0.5.0 ; extra == 'docs'
|
|
56
|
+
Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
|
|
57
|
+
Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
|
|
58
|
+
Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
|
|
59
|
+
Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
|
|
53
60
|
Provides-Extra: remote
|
|
54
61
|
Requires-Dist: datachain[pandas] ; extra == 'remote'
|
|
55
62
|
Requires-Dist: lz4 ; extra == 'remote'
|
|
@@ -1,82 +1,83 @@
|
|
|
1
1
|
datachain/__init__.py,sha256=9a0qX6tqyA9KC3ahLmGarqlRTZJXhM7HijAWpfUaOnQ,102
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
|
-
datachain/_version.py,sha256=
|
|
4
|
-
datachain/asyn.py,sha256=
|
|
3
|
+
datachain/_version.py,sha256=H-qsvrxCpdhaQzyddR-yajEqI71hPxLa4KxzpP3uS1g,411
|
|
4
|
+
datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
|
|
5
5
|
datachain/cache.py,sha256=FaPWrqWznPffmskTb1pdPkt2jAMMf__9FC2zEnP0vDU,4022
|
|
6
|
-
datachain/cli.py,sha256=
|
|
6
|
+
datachain/cli.py,sha256=FLKRimIq917Dq0EmG3yLzMTqDaMA0vyCRUREOobUspY,32256
|
|
7
7
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
8
8
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
9
|
-
datachain/dataset.py,sha256=
|
|
9
|
+
datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
|
|
10
10
|
datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
|
|
11
|
-
datachain/listing.py,sha256
|
|
12
|
-
datachain/node.py,sha256=
|
|
11
|
+
datachain/listing.py,sha256=1arE_9gpjhHqGQCpQZj_mLoocrZWRNDHJ-bkPc08NQs,8247
|
|
12
|
+
datachain/node.py,sha256=fHe7k5ajI2g2qnzsG-_NQR_T-QdBYctVeEa8c8dsu_Y,5703
|
|
13
13
|
datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
|
|
14
14
|
datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
|
|
15
|
-
datachain/progress.py,sha256=
|
|
15
|
+
datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
|
|
16
16
|
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
18
|
-
datachain/utils.py,sha256=
|
|
19
|
-
datachain/catalog/__init__.py,sha256=
|
|
20
|
-
datachain/catalog/catalog.py,sha256=
|
|
18
|
+
datachain/utils.py,sha256=DV-_OON2OomEbxuQuK1lE_2qNTf28QByNcNcEhYsilE,10202
|
|
19
|
+
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
20
|
+
datachain/catalog/catalog.py,sha256=5WkICtTYCN5xSMGDd5djLnEBw8kkcDf-IpFYf7kfeuQ,78654
|
|
21
21
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
22
|
-
datachain/catalog/loader.py,sha256=
|
|
22
|
+
datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
|
|
23
23
|
datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
|
|
24
24
|
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
|
|
25
|
-
datachain/client/azure.py,sha256=
|
|
25
|
+
datachain/client/azure.py,sha256=rxvF5erntGD32Y3DYK_TUCsyV2ALfuWWTnE8IWGwKEo,2542
|
|
26
26
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
27
|
-
datachain/client/fsspec.py,sha256=
|
|
27
|
+
datachain/client/fsspec.py,sha256=F1Iyyw0iTrp2wQTFeignGtaHpm5Rg_cvbKaIzBX5aSc,13390
|
|
28
28
|
datachain/client/gcs.py,sha256=ucX8e6JrqlFY-f80zkv084vxnKdtxpO32QJ-RG8Nv1s,4454
|
|
29
|
-
datachain/client/local.py,sha256=
|
|
29
|
+
datachain/client/local.py,sha256=NQVkLTJQ-a7Udavqbh_4uT-IejfZQYn10j22owz9sis,5150
|
|
30
30
|
datachain/client/s3.py,sha256=TmW4f7VUM5CMZjSmgyFQFKeMUGrXt2SLoLEbLOUleiU,6296
|
|
31
|
-
datachain/data_storage/__init__.py,sha256=
|
|
31
|
+
datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
|
|
32
32
|
datachain/data_storage/db_engine.py,sha256=mxOoWP4ntBMgLeTAk4dlEeIJArAz4x_tFrHytcAfLpo,3341
|
|
33
33
|
datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
|
|
34
34
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
35
|
-
datachain/data_storage/metastore.py,sha256=
|
|
36
|
-
datachain/data_storage/schema.py,sha256=
|
|
35
|
+
datachain/data_storage/metastore.py,sha256=y-4fYvuOPnWeYxAvqhDnw6CdlTvQiurg0Gg4TaG9LR0,54074
|
|
36
|
+
datachain/data_storage/schema.py,sha256=FrhmeZ_btT1CfVisa4ScabS11ixZ3xn3d_whvVsBtDA,8700
|
|
37
37
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
38
38
|
datachain/data_storage/sqlite.py,sha256=eHTiJ0VIxU-chnhKNTN14EsaSnw5LAaxTLi9aMCZpl4,24978
|
|
39
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
39
|
+
datachain/data_storage/warehouse.py,sha256=tL2mYoXVZe-coKLTRXEJ0sMdEr2BD0GwgIWip5PP5CM,33300
|
|
40
40
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
+
datachain/lib/arrow.py,sha256=7lAas8hSh3vL7S7s2KOlkYn4viQpfVbM_FQ_hLCh5oc,2593
|
|
41
42
|
datachain/lib/cached_stream.py,sha256=BQI6gpJ2y7_-jqQo_0VB9ntbkOVISvj9wlDwGDQbqw8,3537
|
|
42
43
|
datachain/lib/claude.py,sha256=iAauA1zNVNONpLzUo1t0QN5PZ5Ot6cZkfib7Ka_c638,1969
|
|
43
|
-
datachain/lib/dc.py,sha256=
|
|
44
|
-
datachain/lib/feature.py,sha256=
|
|
45
|
-
datachain/lib/feature_registry.py,sha256=
|
|
46
|
-
datachain/lib/feature_utils.py,sha256=
|
|
47
|
-
datachain/lib/file.py,sha256=
|
|
44
|
+
datachain/lib/dc.py,sha256=szYQC4FOoYDMlSEDAPWZ25z4Nn-WeoaKiqKwwXbOJws,35355
|
|
45
|
+
datachain/lib/feature.py,sha256=KiPiMrU8ec-bJuUs70Xh4jytZdzKk9puQNQnx03K-po,12057
|
|
46
|
+
datachain/lib/feature_registry.py,sha256=YQsLYChNkYK6p2MpcVfAyBybtfN5EMiOJ8LIYakjmeQ,1602
|
|
47
|
+
datachain/lib/feature_utils.py,sha256=LIK233IWGWFhuav5Rm8de0xIOSnuwA1ubk6OYrxrfN0,4712
|
|
48
|
+
datachain/lib/file.py,sha256=K0jH8Q5Xle2TiVDTCzmopku_7Lh-IVufV_mgtaCNHYI,8744
|
|
48
49
|
datachain/lib/gpt4_vision.py,sha256=idyXVZVWzltstGaVIu5RYE5UNbdqcPEjIWy81O1MwkM,2922
|
|
49
50
|
datachain/lib/hf_image_to_text.py,sha256=HiPSWzJRDT-vnz9DXJbJBNCMNl9wmpxiSS3PbbVz8SE,3310
|
|
50
51
|
datachain/lib/hf_pipeline.py,sha256=f0AH_XCziOF1OKN3d1w1swTBLaeajMJ8xgdsX37i5-o,2287
|
|
51
|
-
datachain/lib/image.py,sha256=
|
|
52
|
+
datachain/lib/image.py,sha256=l2lgUR3YQzjpBmTJewzUtL5zJsLDQH32lbbaLu9WvWA,3631
|
|
52
53
|
datachain/lib/image_transform.py,sha256=NXWtnVOcofWBgl_YMxb4ABpaT7JTBMx7tLKvErH1IC4,3024
|
|
53
54
|
datachain/lib/iptc_exif_xmp.py,sha256=xrbxFeY-wRP6T5JsUgE3EXfTxKvZVymRaRD_VIfxD0A,2236
|
|
54
|
-
datachain/lib/meta_formats.py,sha256
|
|
55
|
-
datachain/lib/parquet.py,sha256=_MbRBzcgLLLegjKZNGF9Rm9IkYRSy0IqOksVjL1nntg,917
|
|
55
|
+
datachain/lib/meta_formats.py,sha256=wIVVLRLp45Zk4vjZRd_P1UtD24vpDCb-vILWtcsACwk,6630
|
|
56
56
|
datachain/lib/pytorch.py,sha256=oU16XXAyAmiiabe1IoQoID00-u3uZ5GhCN48uAl6WDs,5421
|
|
57
57
|
datachain/lib/reader.py,sha256=rPXXNoTUdm6PQwkAlaU-nOBreP_q4ett_EjFStrA_W0,1727
|
|
58
|
-
datachain/lib/settings.py,sha256=
|
|
59
|
-
datachain/lib/signal_schema.py,sha256=
|
|
58
|
+
datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
|
|
59
|
+
datachain/lib/signal_schema.py,sha256=KaH194dAH8Zt8FtlNAgdVqcZlJc42y7RbcB37ldPPAY,11688
|
|
60
60
|
datachain/lib/text.py,sha256=EEZrYohADi5rAGg3aLLRwtvyAV9js_yWAGhr2C3QbwI,2424
|
|
61
|
-
datachain/lib/udf.py,sha256=
|
|
62
|
-
datachain/lib/udf_signature.py,sha256=
|
|
61
|
+
datachain/lib/udf.py,sha256=kPc_6fQ4DzbiYiXvbps7QPlJWTu9MSCS8eUfGqOhjG4,6124
|
|
62
|
+
datachain/lib/udf_signature.py,sha256=DAWMQ0dvFkKabpY5MV5K2q9YmOSTKfiV8KuUBs_6kMg,7258
|
|
63
63
|
datachain/lib/unstructured.py,sha256=9Y6rAelXdYqkNbPaqz6DhXjhS8d6qXcP0ieIsWkzvkk,1143
|
|
64
64
|
datachain/lib/utils.py,sha256=YQKzuW096SGe7QwHwdyS47k_9l2Rh73b-wBqt1-niw4,213
|
|
65
65
|
datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
66
66
|
datachain/lib/webdataset.py,sha256=JouI5WORgkl-am_DwQwWqO8RI1UwgbUPWsauZZj2Fmc,8221
|
|
67
|
-
datachain/lib/webdataset_laion.py,sha256=
|
|
67
|
+
datachain/lib/webdataset_laion.py,sha256=HAtSCbVvEQqzKkoRamRxDKaQALSB3QmJRU2yWRFNxwY,2147
|
|
68
68
|
datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
|
|
69
69
|
datachain/query/batch.py,sha256=sOMxXbaNii7lVyFIEZ2noqbhy_S8qtZ-WWxrka72shc,3474
|
|
70
|
-
datachain/query/builtins.py,sha256=
|
|
71
|
-
datachain/query/dataset.py,sha256=
|
|
72
|
-
datachain/query/dispatch.py,sha256=
|
|
70
|
+
datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
|
|
71
|
+
datachain/query/dataset.py,sha256=c0ZoNEjAMmn0BdSnRm8XRWEsbaMH3xa_jd6FBJQDY1o,64576
|
|
72
|
+
datachain/query/dispatch.py,sha256=fEk1qalxAb5JJhN-iq0Mg9MyWve4XoN1Q7uvrX4mJY4,13106
|
|
73
|
+
datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
|
|
73
74
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
74
|
-
datachain/query/schema.py,sha256=
|
|
75
|
+
datachain/query/schema.py,sha256=tWlUiu9eiS5y8BTQaPI2raGclt0YzcO3DoUN1OkwnrE,7946
|
|
75
76
|
datachain/query/session.py,sha256=e4_vv4RqAjU-g3KK0avgLd9MEsmJBzRVEj1w8v7fP1k,3663
|
|
76
77
|
datachain/query/udf.py,sha256=0WkBPW5ymZbOGMimSXpVWVc8whjTuYfRrnxPWNHabSk,7127
|
|
77
78
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
78
79
|
datachain/remote/studio.py,sha256=bZb85WjtqMNFBoRuPbH-TEGpAyz0afROR7E9UgIef_Y,7438
|
|
79
|
-
datachain/sql/__init__.py,sha256=
|
|
80
|
+
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
80
81
|
datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
|
|
81
82
|
datachain/sql/types.py,sha256=BzUm0nCcMPASvdqpQouX5bdVcK3G3DBfeeNhau7X_hA,10234
|
|
82
83
|
datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
|
|
@@ -92,9 +93,9 @@ datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7
|
|
|
92
93
|
datachain/sql/sqlite/base.py,sha256=XVxn4pB-N4pPfiby5uVvfH7feNzRKlBNzsc5eyKPvhI,10965
|
|
93
94
|
datachain/sql/sqlite/types.py,sha256=oP93nLfTBaYnN0z_4Dsv-HZm8j9rrUf1esMM-z3JLbg,1754
|
|
94
95
|
datachain/sql/sqlite/vector.py,sha256=stBeEW6fbVbILmAtV4khjXdJIGT13HkRWJeCoqIOk50,315
|
|
95
|
-
datachain-0.
|
|
96
|
-
datachain-0.
|
|
97
|
-
datachain-0.
|
|
98
|
-
datachain-0.
|
|
99
|
-
datachain-0.
|
|
100
|
-
datachain-0.
|
|
96
|
+
datachain-0.2.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
97
|
+
datachain-0.2.0.dist-info/METADATA,sha256=iMX8hWEMXu-4MtXlD_SVwW3ija6bOLqSbeQvHoiMNfQ,14344
|
|
98
|
+
datachain-0.2.0.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
|
|
99
|
+
datachain-0.2.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
100
|
+
datachain-0.2.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
101
|
+
datachain-0.2.0.dist-info/RECORD,,
|