datachain 0.3.17__py3-none-any.whl → 0.3.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/cache.py +14 -55
- datachain/catalog/catalog.py +8 -18
- datachain/cli.py +7 -1
- datachain/client/fsspec.py +29 -63
- datachain/client/local.py +2 -3
- datachain/lib/arrow.py +2 -1
- datachain/lib/dc.py +4 -0
- datachain/lib/file.py +41 -23
- datachain/lib/listing.py +2 -0
- datachain/listing.py +4 -4
- datachain/node.py +6 -6
- datachain/nodes_fetcher.py +12 -5
- datachain/nodes_thread_pool.py +1 -1
- datachain/progress.py +2 -12
- datachain/query/dataset.py +6 -18
- datachain/query/dispatch.py +2 -15
- datachain/query/schema.py +25 -24
- datachain/query/udf.py +0 -106
- datachain/sql/types.py +4 -2
- datachain/telemetry.py +37 -0
- datachain/utils.py +11 -0
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/METADATA +5 -3
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/RECORD +27 -26
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/LICENSE +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/WHEEL +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.17.dist-info → datachain-0.3.18.dist-info}/top_level.txt +0 -0
datachain/nodes_thread_pool.py
CHANGED
|
@@ -20,7 +20,7 @@ class NodeChunk:
|
|
|
20
20
|
def next_downloadable(self):
|
|
21
21
|
node = next(self.nodes, None)
|
|
22
22
|
while node and (
|
|
23
|
-
not node.is_downloadable or self.cache.contains(node.
|
|
23
|
+
not node.is_downloadable or self.cache.contains(node.to_file(self.storage))
|
|
24
24
|
):
|
|
25
25
|
node = next(self.nodes, None)
|
|
26
26
|
return node
|
datachain/progress.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
|
1
1
|
"""Manages progress bars."""
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
-
import os
|
|
5
|
-
import re
|
|
6
4
|
import sys
|
|
7
5
|
from threading import RLock
|
|
8
6
|
from typing import Any, ClassVar
|
|
@@ -10,20 +8,12 @@ from typing import Any, ClassVar
|
|
|
10
8
|
from fsspec.callbacks import TqdmCallback
|
|
11
9
|
from tqdm import tqdm
|
|
12
10
|
|
|
11
|
+
from datachain.utils import env2bool
|
|
12
|
+
|
|
13
13
|
logger = logging.getLogger(__name__)
|
|
14
14
|
tqdm.set_lock(RLock())
|
|
15
15
|
|
|
16
16
|
|
|
17
|
-
def env2bool(var, undefined=False):
|
|
18
|
-
"""
|
|
19
|
-
undefined: return value if env var is unset
|
|
20
|
-
"""
|
|
21
|
-
var = os.getenv(var, None)
|
|
22
|
-
if var is None:
|
|
23
|
-
return undefined
|
|
24
|
-
return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
|
|
25
|
-
|
|
26
|
-
|
|
27
17
|
class Tqdm(tqdm):
|
|
28
18
|
"""
|
|
29
19
|
maximum-compatibility tqdm-based progressbars
|
datachain/query/dataset.py
CHANGED
|
@@ -53,7 +53,7 @@ from datachain.utils import (
|
|
|
53
53
|
|
|
54
54
|
from .schema import C, UDFParamSpec, normalize_param
|
|
55
55
|
from .session import Session
|
|
56
|
-
from .udf import UDFBase
|
|
56
|
+
from .udf import UDFBase
|
|
57
57
|
|
|
58
58
|
if TYPE_CHECKING:
|
|
59
59
|
from sqlalchemy.sql.elements import ClauseElement
|
|
@@ -364,7 +364,7 @@ def get_generated_callback(is_generator: bool = False) -> Callback:
|
|
|
364
364
|
|
|
365
365
|
@frozen
|
|
366
366
|
class UDFStep(Step, ABC):
|
|
367
|
-
udf:
|
|
367
|
+
udf: UDFBase
|
|
368
368
|
catalog: "Catalog"
|
|
369
369
|
partition_by: Optional[PartitionByType] = None
|
|
370
370
|
parallel: Optional[int] = None
|
|
@@ -470,12 +470,6 @@ class UDFStep(Step, ABC):
|
|
|
470
470
|
|
|
471
471
|
else:
|
|
472
472
|
# Otherwise process single-threaded (faster for smaller UDFs)
|
|
473
|
-
# Optionally instantiate the UDF instance if a class is provided.
|
|
474
|
-
if isinstance(self.udf, UDFFactory):
|
|
475
|
-
udf: UDFBase = self.udf()
|
|
476
|
-
else:
|
|
477
|
-
udf = self.udf
|
|
478
|
-
|
|
479
473
|
warehouse = self.catalog.warehouse
|
|
480
474
|
|
|
481
475
|
with contextlib.closing(
|
|
@@ -485,7 +479,7 @@ class UDFStep(Step, ABC):
|
|
|
485
479
|
processed_cb = get_processed_callback()
|
|
486
480
|
generated_cb = get_generated_callback(self.is_generator)
|
|
487
481
|
try:
|
|
488
|
-
udf_results = udf.run(
|
|
482
|
+
udf_results = self.udf.run(
|
|
489
483
|
udf_fields,
|
|
490
484
|
udf_inputs,
|
|
491
485
|
self.catalog,
|
|
@@ -498,7 +492,7 @@ class UDFStep(Step, ABC):
|
|
|
498
492
|
warehouse,
|
|
499
493
|
udf_table,
|
|
500
494
|
udf_results,
|
|
501
|
-
udf,
|
|
495
|
+
self.udf,
|
|
502
496
|
cb=generated_cb,
|
|
503
497
|
)
|
|
504
498
|
finally:
|
|
@@ -1471,7 +1465,7 @@ class DatasetQuery:
|
|
|
1471
1465
|
@detach
|
|
1472
1466
|
def add_signals(
|
|
1473
1467
|
self,
|
|
1474
|
-
udf:
|
|
1468
|
+
udf: UDFBase,
|
|
1475
1469
|
parallel: Optional[int] = None,
|
|
1476
1470
|
workers: Union[bool, int] = False,
|
|
1477
1471
|
min_task_size: Optional[int] = None,
|
|
@@ -1492,9 +1486,6 @@ class DatasetQuery:
|
|
|
1492
1486
|
at least that minimum number of rows to each distributed worker, mostly useful
|
|
1493
1487
|
if there are a very large number of small tasks to process.
|
|
1494
1488
|
"""
|
|
1495
|
-
if isinstance(udf, UDFClassWrapper): # type: ignore[unreachable]
|
|
1496
|
-
# This is a bare decorated class, "instantiate" it now.
|
|
1497
|
-
udf = udf() # type: ignore[unreachable]
|
|
1498
1489
|
query = self.clone()
|
|
1499
1490
|
query.steps.append(
|
|
1500
1491
|
UDFSignal(
|
|
@@ -1518,16 +1509,13 @@ class DatasetQuery:
|
|
|
1518
1509
|
@detach
|
|
1519
1510
|
def generate(
|
|
1520
1511
|
self,
|
|
1521
|
-
udf:
|
|
1512
|
+
udf: UDFBase,
|
|
1522
1513
|
parallel: Optional[int] = None,
|
|
1523
1514
|
workers: Union[bool, int] = False,
|
|
1524
1515
|
min_task_size: Optional[int] = None,
|
|
1525
1516
|
partition_by: Optional[PartitionByType] = None,
|
|
1526
1517
|
cache: bool = False,
|
|
1527
1518
|
) -> "Self":
|
|
1528
|
-
if isinstance(udf, UDFClassWrapper): # type: ignore[unreachable]
|
|
1529
|
-
# This is a bare decorated class, "instantiate" it now.
|
|
1530
|
-
udf = udf() # type: ignore[unreachable]
|
|
1531
1519
|
query = self.clone()
|
|
1532
1520
|
steps = query.steps
|
|
1533
1521
|
steps.append(
|
datachain/query/dispatch.py
CHANGED
|
@@ -27,7 +27,7 @@ from datachain.query.queue import (
|
|
|
27
27
|
put_into_queue,
|
|
28
28
|
unmarshal,
|
|
29
29
|
)
|
|
30
|
-
from datachain.query.udf import UDFBase,
|
|
30
|
+
from datachain.query.udf import UDFBase, UDFResult
|
|
31
31
|
from datachain.utils import batched_it
|
|
32
32
|
|
|
33
33
|
DEFAULT_BATCH_SIZE = 10000
|
|
@@ -156,8 +156,6 @@ class UDFDispatcher:
|
|
|
156
156
|
|
|
157
157
|
@property
|
|
158
158
|
def batch_size(self):
|
|
159
|
-
if not self.udf:
|
|
160
|
-
self.udf = self.udf_factory()
|
|
161
159
|
if self._batch_size is None:
|
|
162
160
|
if hasattr(self.udf, "properties") and hasattr(
|
|
163
161
|
self.udf.properties, "batch"
|
|
@@ -181,18 +179,7 @@ class UDFDispatcher:
|
|
|
181
179
|
self.catalog = Catalog(
|
|
182
180
|
id_generator, metastore, warehouse, **self.catalog_init_params
|
|
183
181
|
)
|
|
184
|
-
udf = loads(self.udf_data)
|
|
185
|
-
# isinstance cannot be used here, as cloudpickle packages the entire class
|
|
186
|
-
# definition, and so these two types are not considered exactly equal,
|
|
187
|
-
# even if they have the same import path.
|
|
188
|
-
if full_module_type_path(type(udf)) != full_module_type_path(UDFFactory):
|
|
189
|
-
self.udf = udf
|
|
190
|
-
else:
|
|
191
|
-
self.udf = None
|
|
192
|
-
self.udf_factory = udf
|
|
193
|
-
if not self.udf:
|
|
194
|
-
self.udf = self.udf_factory()
|
|
195
|
-
|
|
182
|
+
self.udf = loads(self.udf_data)
|
|
196
183
|
return UDFWorker(
|
|
197
184
|
self.catalog,
|
|
198
185
|
self.udf,
|
datachain/query/schema.py
CHANGED
|
@@ -9,6 +9,7 @@ import attrs
|
|
|
9
9
|
import sqlalchemy as sa
|
|
10
10
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
11
11
|
|
|
12
|
+
from datachain.lib.file import File
|
|
12
13
|
from datachain.sql.types import JSON, Boolean, DateTime, Int64, SQLType, String
|
|
13
14
|
|
|
14
15
|
if TYPE_CHECKING:
|
|
@@ -97,11 +98,11 @@ class Object(UDFParameter):
|
|
|
97
98
|
cb: Callback = DEFAULT_CALLBACK,
|
|
98
99
|
**kwargs,
|
|
99
100
|
) -> Any:
|
|
100
|
-
|
|
101
|
-
|
|
101
|
+
file = File._from_row(file_signals(row))
|
|
102
|
+
client = catalog.get_client(file.source)
|
|
102
103
|
if cache:
|
|
103
|
-
client.download(
|
|
104
|
-
with client.open_object(
|
|
104
|
+
client.download(file, callback=cb)
|
|
105
|
+
with client.open_object(file, use_cache=cache, cb=cb) as f:
|
|
105
106
|
return self.reader(f)
|
|
106
107
|
|
|
107
108
|
async def get_value_async(
|
|
@@ -114,12 +115,12 @@ class Object(UDFParameter):
|
|
|
114
115
|
cb: Callback = DEFAULT_CALLBACK,
|
|
115
116
|
**kwargs,
|
|
116
117
|
) -> Any:
|
|
117
|
-
|
|
118
|
-
|
|
118
|
+
file = File._from_row(file_signals(row))
|
|
119
|
+
client = catalog.get_client(file.source)
|
|
119
120
|
if cache:
|
|
120
|
-
await client._download(
|
|
121
|
+
await client._download(file, callback=cb)
|
|
121
122
|
obj = await mapper.to_thread(
|
|
122
|
-
functools.partial(client.open_object,
|
|
123
|
+
functools.partial(client.open_object, file, use_cache=cache, cb=cb)
|
|
123
124
|
)
|
|
124
125
|
with obj:
|
|
125
126
|
return await mapper.to_thread(self.reader, obj)
|
|
@@ -140,11 +141,11 @@ class Stream(UDFParameter):
|
|
|
140
141
|
cb: Callback = DEFAULT_CALLBACK,
|
|
141
142
|
**kwargs,
|
|
142
143
|
) -> Any:
|
|
143
|
-
|
|
144
|
-
|
|
144
|
+
file = File._from_row(file_signals(row))
|
|
145
|
+
client = catalog.get_client(file.source)
|
|
145
146
|
if cache:
|
|
146
|
-
client.download(
|
|
147
|
-
return client.open_object(
|
|
147
|
+
client.download(file, callback=cb)
|
|
148
|
+
return client.open_object(file, use_cache=cache, cb=cb)
|
|
148
149
|
|
|
149
150
|
async def get_value_async(
|
|
150
151
|
self,
|
|
@@ -156,12 +157,12 @@ class Stream(UDFParameter):
|
|
|
156
157
|
cb: Callback = DEFAULT_CALLBACK,
|
|
157
158
|
**kwargs,
|
|
158
159
|
) -> Any:
|
|
159
|
-
|
|
160
|
-
|
|
160
|
+
file = File._from_row(file_signals(row))
|
|
161
|
+
client = catalog.get_client(file.source)
|
|
161
162
|
if cache:
|
|
162
|
-
await client._download(
|
|
163
|
+
await client._download(file, callback=cb)
|
|
163
164
|
return await mapper.to_thread(
|
|
164
|
-
functools.partial(client.open_object,
|
|
165
|
+
functools.partial(client.open_object, file, use_cache=cache, cb=cb)
|
|
165
166
|
)
|
|
166
167
|
|
|
167
168
|
|
|
@@ -189,10 +190,10 @@ class LocalFilename(UDFParameter):
|
|
|
189
190
|
# If the glob pattern is specified and the row filename
|
|
190
191
|
# does not match it, then return None
|
|
191
192
|
return None
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
client.download(
|
|
195
|
-
return client.cache.get_path(
|
|
193
|
+
file = File._from_row(file_signals(row))
|
|
194
|
+
client = catalog.get_client(file.source)
|
|
195
|
+
client.download(file, callback=cb)
|
|
196
|
+
return client.cache.get_path(file)
|
|
196
197
|
|
|
197
198
|
async def get_value_async(
|
|
198
199
|
self,
|
|
@@ -208,10 +209,10 @@ class LocalFilename(UDFParameter):
|
|
|
208
209
|
# If the glob pattern is specified and the row filename
|
|
209
210
|
# does not match it, then return None
|
|
210
211
|
return None
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
await client._download(
|
|
214
|
-
return client.cache.get_path(
|
|
212
|
+
file = File._from_row(file_signals(row))
|
|
213
|
+
client = catalog.get_client(file.source)
|
|
214
|
+
await client._download(file, callback=cb)
|
|
215
|
+
return client.cache.get_path(file)
|
|
215
216
|
|
|
216
217
|
|
|
217
218
|
UDFParamSpec = Union[str, Column, UDFParameter]
|
datachain/query/udf.py
CHANGED
|
@@ -1,13 +1,9 @@
|
|
|
1
1
|
import typing
|
|
2
2
|
from collections.abc import Iterable, Iterator, Sequence
|
|
3
3
|
from dataclasses import dataclass
|
|
4
|
-
from functools import WRAPPER_ASSIGNMENTS
|
|
5
4
|
from typing import (
|
|
6
5
|
TYPE_CHECKING,
|
|
7
6
|
Any,
|
|
8
|
-
Callable,
|
|
9
|
-
Optional,
|
|
10
|
-
Union,
|
|
11
7
|
)
|
|
12
8
|
|
|
13
9
|
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
|
|
@@ -128,105 +124,3 @@ class UDFBase:
|
|
|
128
124
|
for row_id, signals in zip(row_ids, results)
|
|
129
125
|
if signals is not None # skip rows with no output
|
|
130
126
|
]
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
class UDFClassWrapper:
|
|
134
|
-
"""
|
|
135
|
-
A wrapper for class-based (stateful) UDFs.
|
|
136
|
-
"""
|
|
137
|
-
|
|
138
|
-
def __init__(
|
|
139
|
-
self,
|
|
140
|
-
udf_class: type,
|
|
141
|
-
properties: UDFProperties,
|
|
142
|
-
method: Optional[str] = None,
|
|
143
|
-
):
|
|
144
|
-
self.udf_class = udf_class
|
|
145
|
-
self.udf_method = method
|
|
146
|
-
self.properties = properties
|
|
147
|
-
self.output = properties.output
|
|
148
|
-
|
|
149
|
-
def __call__(self, *args, **kwargs) -> "UDFFactory":
|
|
150
|
-
return UDFFactory(
|
|
151
|
-
self.udf_class,
|
|
152
|
-
args,
|
|
153
|
-
kwargs,
|
|
154
|
-
self.properties,
|
|
155
|
-
self.udf_method,
|
|
156
|
-
)
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
class UDFWrapper(UDFBase):
|
|
160
|
-
"""A wrapper class for function UDFs to be used in custom signal generation."""
|
|
161
|
-
|
|
162
|
-
def __init__(
|
|
163
|
-
self,
|
|
164
|
-
func: Callable,
|
|
165
|
-
properties: UDFProperties,
|
|
166
|
-
):
|
|
167
|
-
self.func = func
|
|
168
|
-
super().__init__(properties)
|
|
169
|
-
# This emulates the behavior of functools.wraps for a class decorator
|
|
170
|
-
for attr in WRAPPER_ASSIGNMENTS:
|
|
171
|
-
if hasattr(func, attr):
|
|
172
|
-
setattr(self, attr, getattr(func, attr))
|
|
173
|
-
|
|
174
|
-
def run_once(
|
|
175
|
-
self,
|
|
176
|
-
catalog: "Catalog",
|
|
177
|
-
arg: "UDFInput",
|
|
178
|
-
is_generator: bool = False,
|
|
179
|
-
cache: bool = False,
|
|
180
|
-
cb: Callback = DEFAULT_CALLBACK,
|
|
181
|
-
) -> Iterable[UDFResult]:
|
|
182
|
-
if isinstance(arg, UDFInputBatch):
|
|
183
|
-
udf_inputs = [
|
|
184
|
-
self.bind_parameters(catalog, row, cache=cache, cb=cb)
|
|
185
|
-
for row in arg.rows
|
|
186
|
-
]
|
|
187
|
-
udf_outputs = self.func(udf_inputs)
|
|
188
|
-
return self._process_results(arg.rows, udf_outputs, is_generator)
|
|
189
|
-
if isinstance(arg, RowDict):
|
|
190
|
-
udf_inputs = self.bind_parameters(catalog, arg, cache=cache, cb=cb)
|
|
191
|
-
udf_outputs = self.func(*udf_inputs)
|
|
192
|
-
if not is_generator:
|
|
193
|
-
# udf_outputs is generator already if is_generator=True
|
|
194
|
-
udf_outputs = [udf_outputs]
|
|
195
|
-
return self._process_results([arg], udf_outputs, is_generator)
|
|
196
|
-
raise ValueError(f"Unexpected UDF argument: {arg}")
|
|
197
|
-
|
|
198
|
-
# This emulates the behavior of functools.wraps for a class decorator
|
|
199
|
-
def __repr__(self):
|
|
200
|
-
return repr(self.func)
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
class UDFFactory:
|
|
204
|
-
"""
|
|
205
|
-
A wrapper for late instantiation of UDF classes, primarily for use in parallelized
|
|
206
|
-
execution.
|
|
207
|
-
"""
|
|
208
|
-
|
|
209
|
-
def __init__(
|
|
210
|
-
self,
|
|
211
|
-
udf_class: type,
|
|
212
|
-
args,
|
|
213
|
-
kwargs,
|
|
214
|
-
properties: UDFProperties,
|
|
215
|
-
method: Optional[str] = None,
|
|
216
|
-
):
|
|
217
|
-
self.udf_class = udf_class
|
|
218
|
-
self.udf_method = method
|
|
219
|
-
self.args = args
|
|
220
|
-
self.kwargs = kwargs
|
|
221
|
-
self.properties = properties
|
|
222
|
-
self.output = properties.output
|
|
223
|
-
|
|
224
|
-
def __call__(self) -> UDFWrapper:
|
|
225
|
-
udf_func = self.udf_class(*self.args, **self.kwargs)
|
|
226
|
-
if self.udf_method:
|
|
227
|
-
udf_func = getattr(udf_func, self.udf_method)
|
|
228
|
-
|
|
229
|
-
return UDFWrapper(udf_func, self.properties)
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
UDFType = Union[UDFBase, UDFFactory]
|
datachain/sql/types.py
CHANGED
|
@@ -12,11 +12,11 @@ for sqlite we can use `sqlite.register_converter`
|
|
|
12
12
|
( https://docs.python.org/3/library/sqlite3.html#sqlite3.register_converter )
|
|
13
13
|
"""
|
|
14
14
|
|
|
15
|
-
import json
|
|
16
15
|
from datetime import datetime
|
|
17
16
|
from types import MappingProxyType
|
|
18
17
|
from typing import Any, Union
|
|
19
18
|
|
|
19
|
+
import orjson
|
|
20
20
|
import sqlalchemy as sa
|
|
21
21
|
from sqlalchemy import TypeDecorator, types
|
|
22
22
|
|
|
@@ -312,7 +312,7 @@ class Array(SQLType):
|
|
|
312
312
|
def on_read_convert(self, value, dialect):
|
|
313
313
|
r = read_converter(dialect).array(value, self.item_type, dialect)
|
|
314
314
|
if isinstance(self.item_type, JSON):
|
|
315
|
-
r = [
|
|
315
|
+
r = [orjson.loads(item) if isinstance(item, str) else item for item in r]
|
|
316
316
|
return r
|
|
317
317
|
|
|
318
318
|
|
|
@@ -420,6 +420,8 @@ class TypeReadConverter:
|
|
|
420
420
|
return [item_type.on_read_convert(x, dialect) for x in value]
|
|
421
421
|
|
|
422
422
|
def json(self, value):
|
|
423
|
+
if isinstance(value, str):
|
|
424
|
+
return orjson.loads(value)
|
|
423
425
|
return value
|
|
424
426
|
|
|
425
427
|
def datetime(self, value):
|
datachain/telemetry.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
|
|
5
|
+
from iterative_telemetry import IterativeTelemetryLogger
|
|
6
|
+
|
|
7
|
+
from datachain.utils import env2bool
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def is_enabled():
|
|
13
|
+
"""
|
|
14
|
+
Determine if telemetry is enabled based on environment variables and configuration.
|
|
15
|
+
"""
|
|
16
|
+
# Disable telemetry if running in test mode
|
|
17
|
+
if env2bool("DATACHAIN_TEST"):
|
|
18
|
+
return False
|
|
19
|
+
|
|
20
|
+
# Check if telemetry is disabled by environment variable
|
|
21
|
+
disabled = bool(os.getenv("DATACHAIN_NO_ANALYTICS"))
|
|
22
|
+
if disabled:
|
|
23
|
+
logger.debug("Telemetry is disabled by environment variable.")
|
|
24
|
+
return False
|
|
25
|
+
|
|
26
|
+
logger.debug("Telemetry is enabled.")
|
|
27
|
+
return True
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Try to get the version of the datachain package
|
|
31
|
+
try:
|
|
32
|
+
__version__ = version("datachain")
|
|
33
|
+
except PackageNotFoundError:
|
|
34
|
+
__version__ = "unknown"
|
|
35
|
+
|
|
36
|
+
# Initialize telemetry logger
|
|
37
|
+
telemetry = IterativeTelemetryLogger("datachain", __version__, is_enabled)
|
datachain/utils.py
CHANGED
|
@@ -4,6 +4,7 @@ import json
|
|
|
4
4
|
import os
|
|
5
5
|
import os.path as osp
|
|
6
6
|
import random
|
|
7
|
+
import re
|
|
7
8
|
import stat
|
|
8
9
|
import sys
|
|
9
10
|
import time
|
|
@@ -410,3 +411,13 @@ def get_datachain_executable() -> list[str]:
|
|
|
410
411
|
def uses_glob(path: str) -> bool:
|
|
411
412
|
"""Checks if some URI path has glob syntax in it"""
|
|
412
413
|
return glob.has_magic(os.path.basename(os.path.normpath(path)))
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def env2bool(var, undefined=False):
|
|
417
|
+
"""
|
|
418
|
+
undefined: return value if env var is unset
|
|
419
|
+
"""
|
|
420
|
+
var = os.getenv(var, None)
|
|
421
|
+
if var is None:
|
|
422
|
+
return undefined
|
|
423
|
+
return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.17
|
|
3
|
+
Version: 0.3.18
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -43,6 +43,7 @@ Requires-Dist: Pillow <11,>=10.0.0
|
|
|
43
43
|
Requires-Dist: msgpack <2,>=1.0.4
|
|
44
44
|
Requires-Dist: psutil
|
|
45
45
|
Requires-Dist: huggingface-hub
|
|
46
|
+
Requires-Dist: iterative-telemetry >=0.0.9
|
|
46
47
|
Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
|
|
47
48
|
Provides-Extra: dev
|
|
48
49
|
Requires-Dist: datachain[docs,tests] ; extra == 'dev'
|
|
@@ -63,9 +64,10 @@ Requires-Dist: datachain[tests] ; extra == 'examples'
|
|
|
63
64
|
Requires-Dist: numpy <2,>=1 ; extra == 'examples'
|
|
64
65
|
Requires-Dist: defusedxml ; extra == 'examples'
|
|
65
66
|
Requires-Dist: accelerate ; extra == 'examples'
|
|
66
|
-
Requires-Dist: unstructured[pdf] ; extra == 'examples'
|
|
67
|
+
Requires-Dist: unstructured[embed-huggingface,pdf] ; extra == 'examples'
|
|
67
68
|
Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
|
|
68
69
|
Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
|
|
70
|
+
Requires-Dist: onnx ==1.16.1 ; extra == 'examples'
|
|
69
71
|
Provides-Extra: hf
|
|
70
72
|
Requires-Dist: numba >=0.60.0 ; extra == 'hf'
|
|
71
73
|
Requires-Dist: datasets[audio,vision] >=2.21.0 ; extra == 'hf'
|
|
@@ -78,7 +80,7 @@ Requires-Dist: pytest <9,>=8 ; extra == 'tests'
|
|
|
78
80
|
Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
|
|
79
81
|
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
|
|
80
82
|
Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
|
|
81
|
-
Requires-Dist: pytest-servers[all] >=0.5.
|
|
83
|
+
Requires-Dist: pytest-servers[all] >=0.5.7 ; extra == 'tests'
|
|
82
84
|
Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
|
|
83
85
|
Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
|
|
84
86
|
Requires-Dist: virtualenv ; extra == 'tests'
|
|
@@ -1,32 +1,33 @@
|
|
|
1
1
|
datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
|
|
4
|
-
datachain/cache.py,sha256=
|
|
5
|
-
datachain/cli.py,sha256=
|
|
4
|
+
datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
|
|
5
|
+
datachain/cli.py,sha256=tRuUvlFey5zYE0UVkGylqGiG5t89gUBo2SJ_yPsvu1I,30129
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
8
|
datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
|
|
9
9
|
datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
|
|
10
10
|
datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
|
|
11
|
-
datachain/listing.py,sha256=
|
|
12
|
-
datachain/node.py,sha256
|
|
13
|
-
datachain/nodes_fetcher.py,sha256=
|
|
14
|
-
datachain/nodes_thread_pool.py,sha256=
|
|
15
|
-
datachain/progress.py,sha256=
|
|
11
|
+
datachain/listing.py,sha256=TkMmBzCiru26x4RaZiagWJTmTGbiy6yGrAsSJMr8cFE,8213
|
|
12
|
+
datachain/node.py,sha256=ThE6Ue4BqpaBvrkFFJW_ljLxchixUX2aWz3l_nbwY54,5195
|
|
13
|
+
datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
|
|
14
|
+
datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
|
|
15
|
+
datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
|
|
16
16
|
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
18
|
-
datachain/
|
|
18
|
+
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
|
|
19
|
+
datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
|
|
19
20
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
20
|
-
datachain/catalog/catalog.py,sha256=
|
|
21
|
+
datachain/catalog/catalog.py,sha256=tICInYEeCRJow9hNSFnlA50hCOjFPN7fyGgoN5shcf8,67985
|
|
21
22
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
22
23
|
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
|
|
23
24
|
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
|
|
24
25
|
datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
|
|
25
26
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
26
|
-
datachain/client/fsspec.py,sha256=
|
|
27
|
+
datachain/client/fsspec.py,sha256=CO5LfxlZF58UAywLfMYeZRXDLIzcJepnQyPZfZk0Ies,12236
|
|
27
28
|
datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
|
|
28
29
|
datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
|
|
29
|
-
datachain/client/local.py,sha256=
|
|
30
|
+
datachain/client/local.py,sha256=5OT3yf9QHi0If_dlqKYIYs-if-3oWhfAztMvsSa3YRA,4969
|
|
30
31
|
datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
|
|
31
32
|
datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
|
|
32
33
|
datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
|
|
@@ -38,15 +39,15 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
|
|
|
38
39
|
datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
|
|
39
40
|
datachain/data_storage/warehouse.py,sha256=Vwhu_OfcNAoTtg1BHui80VCzlPeTUjZQL0QWziu8awY,32186
|
|
40
41
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
-
datachain/lib/arrow.py,sha256=
|
|
42
|
+
datachain/lib/arrow.py,sha256=uYn9RQwJy4MsMkhu18_6cgtVO3HkniBcB1NdFmkwtvo,7292
|
|
42
43
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
43
44
|
datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
|
|
44
45
|
datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
|
|
45
|
-
datachain/lib/dc.py,sha256=
|
|
46
|
-
datachain/lib/file.py,sha256=
|
|
46
|
+
datachain/lib/dc.py,sha256=oc9tPf5G9X3DmVCPaTuuwp8LlLogoYuEHpOkq_W7h6Y,68984
|
|
47
|
+
datachain/lib/file.py,sha256=flKGvmrotXWZqQQafaZQEeSQlqUVTSVWB7JIkEsr0MM,14255
|
|
47
48
|
datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
|
|
48
49
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
49
|
-
datachain/lib/listing.py,sha256=
|
|
50
|
+
datachain/lib/listing.py,sha256=cHPN5-Fq8yb0gP6DARImhmZWxykDDNqhhJujDxEp53A,4104
|
|
50
51
|
datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
|
|
51
52
|
datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
|
|
52
53
|
datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
|
|
@@ -69,19 +70,19 @@ datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xd
|
|
|
69
70
|
datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
|
|
70
71
|
datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
|
|
71
72
|
datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
|
|
72
|
-
datachain/query/dataset.py,sha256=
|
|
73
|
-
datachain/query/dispatch.py,sha256=
|
|
73
|
+
datachain/query/dataset.py,sha256=k2jU0uZ86i9vr3On-o7GzHrubK5bCJjZEvz9P8extmw,54347
|
|
74
|
+
datachain/query/dispatch.py,sha256=CFAc09O6UllcyUSSEY1GUlEMPzeO8RYhXinNN4HBl9M,12405
|
|
74
75
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
75
76
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
76
77
|
datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
|
|
77
|
-
datachain/query/schema.py,sha256=
|
|
78
|
+
datachain/query/schema.py,sha256=I8zLWJuWl5N332ni9mAzDYtcxMJupVPgWkSDe8spNEk,8019
|
|
78
79
|
datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
|
|
79
|
-
datachain/query/udf.py,sha256=
|
|
80
|
+
datachain/query/udf.py,sha256=HB2hbEuiGA4ch9P2mh9iLA5Jj9mRj-4JFy9VfjTLJ8U,3622
|
|
80
81
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
81
82
|
datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
|
|
82
83
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
83
84
|
datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
|
|
84
|
-
datachain/sql/types.py,sha256=
|
|
85
|
+
datachain/sql/types.py,sha256=3aXpoxkmCYbw0Dlta5J1enwS8_FuvjfSqyrNZO-dWj4,13383
|
|
85
86
|
datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
|
|
86
87
|
datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
|
|
87
88
|
datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
|
|
@@ -96,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
|
|
|
96
97
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
97
98
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
98
99
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
99
|
-
datachain-0.3.
|
|
100
|
-
datachain-0.3.
|
|
101
|
-
datachain-0.3.
|
|
102
|
-
datachain-0.3.
|
|
103
|
-
datachain-0.3.
|
|
104
|
-
datachain-0.3.
|
|
100
|
+
datachain-0.3.18.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
101
|
+
datachain-0.3.18.dist-info/METADATA,sha256=_LpwSHtaSTA-rz4rG9nHIbO2mLlrlI4mCnlxKx8vePo,17185
|
|
102
|
+
datachain-0.3.18.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
103
|
+
datachain-0.3.18.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
104
|
+
datachain-0.3.18.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
105
|
+
datachain-0.3.18.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|