datachain 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/cache.py +14 -55
- datachain/catalog/catalog.py +21 -55
- datachain/cli.py +7 -26
- datachain/client/fsspec.py +29 -63
- datachain/client/local.py +2 -3
- datachain/data_storage/metastore.py +7 -66
- datachain/data_storage/sqlite.py +5 -2
- datachain/data_storage/warehouse.py +0 -22
- datachain/lib/arrow.py +2 -1
- datachain/lib/dc.py +5 -2
- datachain/lib/file.py +41 -23
- datachain/lib/listing.py +3 -0
- datachain/lib/tar.py +2 -1
- datachain/listing.py +4 -4
- datachain/node.py +23 -9
- datachain/nodes_fetcher.py +12 -5
- datachain/nodes_thread_pool.py +1 -1
- datachain/progress.py +2 -12
- datachain/query/__init__.py +0 -2
- datachain/query/dataset.py +26 -144
- datachain/query/dispatch.py +2 -15
- datachain/query/schema.py +36 -24
- datachain/query/udf.py +2 -148
- datachain/sql/types.py +4 -2
- datachain/telemetry.py +37 -0
- datachain/utils.py +11 -40
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/METADATA +5 -3
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/RECORD +32 -32
- datachain/query/builtins.py +0 -96
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/LICENSE +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/WHEEL +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.16.dist-info → datachain-0.3.18.dist-info}/top_level.txt +0 -0
datachain/telemetry.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import os
|
|
3
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
4
|
+
|
|
5
|
+
from iterative_telemetry import IterativeTelemetryLogger
|
|
6
|
+
|
|
7
|
+
from datachain.utils import env2bool
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def is_enabled():
    """
    Report whether telemetry may be sent from the current process.

    Returns False when the DATACHAIN_TEST environment variable is truthy
    (as interpreted by env2bool) or when DATACHAIN_NO_ANALYTICS is set to
    any non-empty value; returns True otherwise.
    """
    # Test runs never report telemetry
    if env2bool("DATACHAIN_TEST"):
        return False

    # Any non-empty value of the opt-out variable switches telemetry off
    opted_out = bool(os.getenv("DATACHAIN_NO_ANALYTICS"))
    if not opted_out:
        logger.debug("Telemetry is enabled.")
        return True

    logger.debug("Telemetry is disabled by environment variable.")
    return False
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Try to get the version of the datachain package
|
|
31
|
+
try:
|
|
32
|
+
__version__ = version("datachain")
|
|
33
|
+
except PackageNotFoundError:
|
|
34
|
+
__version__ = "unknown"
|
|
35
|
+
|
|
36
|
+
# Initialize telemetry logger
|
|
37
|
+
telemetry = IterativeTelemetryLogger("datachain", __version__, is_enabled)
|
datachain/utils.py
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import glob
|
|
2
|
-
import importlib.util
|
|
3
2
|
import io
|
|
4
3
|
import json
|
|
5
4
|
import os
|
|
6
5
|
import os.path as osp
|
|
7
6
|
import random
|
|
7
|
+
import re
|
|
8
8
|
import stat
|
|
9
9
|
import sys
|
|
10
10
|
import time
|
|
@@ -198,45 +198,6 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
|
|
|
198
198
|
return variables
|
|
199
199
|
|
|
200
200
|
|
|
201
|
-
def import_object(object_spec):
|
|
202
|
-
filename, identifier = object_spec.rsplit(":", 1)
|
|
203
|
-
filename = filename.strip()
|
|
204
|
-
identifier = identifier.strip()
|
|
205
|
-
|
|
206
|
-
if not identifier.isidentifier() or not filename.endswith(".py"):
|
|
207
|
-
raise ValueError(f"Invalid object spec: {object_spec}")
|
|
208
|
-
|
|
209
|
-
modname = os.path.abspath(filename)
|
|
210
|
-
if modname in sys.modules:
|
|
211
|
-
module = sys.modules[modname]
|
|
212
|
-
else:
|
|
213
|
-
# Use importlib to find and load the module from the given filename
|
|
214
|
-
spec = importlib.util.spec_from_file_location(modname, filename)
|
|
215
|
-
module = importlib.util.module_from_spec(spec)
|
|
216
|
-
sys.modules[modname] = module
|
|
217
|
-
spec.loader.exec_module(module)
|
|
218
|
-
|
|
219
|
-
return getattr(module, identifier)
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
def parse_params_string(params: str):
|
|
223
|
-
"""
|
|
224
|
-
Parse a string containing UDF class constructor parameters in the form
|
|
225
|
-
`a, b, key=val` into *args and **kwargs.
|
|
226
|
-
"""
|
|
227
|
-
args = []
|
|
228
|
-
kwargs = {}
|
|
229
|
-
for part in params.split():
|
|
230
|
-
if "=" in part:
|
|
231
|
-
key, val = part.split("=")
|
|
232
|
-
kwargs[key] = val
|
|
233
|
-
else:
|
|
234
|
-
args.append(part)
|
|
235
|
-
if any((args, kwargs)):
|
|
236
|
-
return args, kwargs
|
|
237
|
-
return None, None
|
|
238
|
-
|
|
239
|
-
|
|
240
201
|
_T_co = TypeVar("_T_co", covariant=True)
|
|
241
202
|
|
|
242
203
|
|
|
@@ -450,3 +411,13 @@ def get_datachain_executable() -> list[str]:
|
|
|
450
411
|
def uses_glob(path: str) -> bool:
|
|
451
412
|
"""Checks if some URI path has glob syntax in it"""
|
|
452
413
|
return glob.has_magic(os.path.basename(os.path.normpath(path)))
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def env2bool(var, undefined=False):
    """
    Interpret the environment variable named `var` as a boolean flag.

    var: name of the environment variable to read
    undefined: return value if env var is unset

    Returns True only when the value, ignoring case and surrounding
    whitespace, is exactly one of: 1, y, yes, true. Any other value
    (including the empty string) yields False.
    """
    value = os.getenv(var)
    if value is None:
        return undefined
    # Match the whole value: an unanchored search would treat any string that
    # merely contains "1" or "y" (e.g. "nay", "10") as true.
    return bool(re.fullmatch(r"1|y|yes|true", value.strip(), flags=re.IGNORECASE))
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: datachain
|
|
3
|
-
Version: 0.3.16
|
|
3
|
+
Version: 0.3.18
|
|
4
4
|
Summary: Wrangle unstructured AI data at scale
|
|
5
5
|
Author-email: Dmitry Petrov <support@dvc.org>
|
|
6
6
|
License: Apache-2.0
|
|
@@ -43,6 +43,7 @@ Requires-Dist: Pillow <11,>=10.0.0
|
|
|
43
43
|
Requires-Dist: msgpack <2,>=1.0.4
|
|
44
44
|
Requires-Dist: psutil
|
|
45
45
|
Requires-Dist: huggingface-hub
|
|
46
|
+
Requires-Dist: iterative-telemetry >=0.0.9
|
|
46
47
|
Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
|
|
47
48
|
Provides-Extra: dev
|
|
48
49
|
Requires-Dist: datachain[docs,tests] ; extra == 'dev'
|
|
@@ -63,9 +64,10 @@ Requires-Dist: datachain[tests] ; extra == 'examples'
|
|
|
63
64
|
Requires-Dist: numpy <2,>=1 ; extra == 'examples'
|
|
64
65
|
Requires-Dist: defusedxml ; extra == 'examples'
|
|
65
66
|
Requires-Dist: accelerate ; extra == 'examples'
|
|
66
|
-
Requires-Dist: unstructured[pdf] ; extra == 'examples'
|
|
67
|
+
Requires-Dist: unstructured[embed-huggingface,pdf] ; extra == 'examples'
|
|
67
68
|
Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
|
|
68
69
|
Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
|
|
70
|
+
Requires-Dist: onnx ==1.16.1 ; extra == 'examples'
|
|
69
71
|
Provides-Extra: hf
|
|
70
72
|
Requires-Dist: numba >=0.60.0 ; extra == 'hf'
|
|
71
73
|
Requires-Dist: datasets[audio,vision] >=2.21.0 ; extra == 'hf'
|
|
@@ -78,7 +80,7 @@ Requires-Dist: pytest <9,>=8 ; extra == 'tests'
|
|
|
78
80
|
Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
|
|
79
81
|
Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
|
|
80
82
|
Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
|
|
81
|
-
Requires-Dist: pytest-servers[all] >=0.5.
|
|
83
|
+
Requires-Dist: pytest-servers[all] >=0.5.7 ; extra == 'tests'
|
|
82
84
|
Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
|
|
83
85
|
Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
|
|
84
86
|
Requires-Dist: virtualenv ; extra == 'tests'
|
|
@@ -1,59 +1,60 @@
|
|
|
1
1
|
datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
|
|
2
2
|
datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
|
|
3
3
|
datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
|
|
4
|
-
datachain/cache.py,sha256=
|
|
5
|
-
datachain/cli.py,sha256=
|
|
4
|
+
datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
|
|
5
|
+
datachain/cli.py,sha256=tRuUvlFey5zYE0UVkGylqGiG5t89gUBo2SJ_yPsvu1I,30129
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
8
|
datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
|
|
9
9
|
datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
|
|
10
10
|
datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
|
|
11
|
-
datachain/listing.py,sha256=
|
|
12
|
-
datachain/node.py,sha256=
|
|
13
|
-
datachain/nodes_fetcher.py,sha256=
|
|
14
|
-
datachain/nodes_thread_pool.py,sha256=
|
|
15
|
-
datachain/progress.py,sha256=
|
|
11
|
+
datachain/listing.py,sha256=TkMmBzCiru26x4RaZiagWJTmTGbiy6yGrAsSJMr8cFE,8213
|
|
12
|
+
datachain/node.py,sha256=ThE6Ue4BqpaBvrkFFJW_ljLxchixUX2aWz3l_nbwY54,5195
|
|
13
|
+
datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
|
|
14
|
+
datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
|
|
15
|
+
datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
|
|
16
16
|
datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
18
|
-
datachain/
|
|
18
|
+
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
|
|
19
|
+
datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
|
|
19
20
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
20
|
-
datachain/catalog/catalog.py,sha256=
|
|
21
|
+
datachain/catalog/catalog.py,sha256=tICInYEeCRJow9hNSFnlA50hCOjFPN7fyGgoN5shcf8,67985
|
|
21
22
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
22
23
|
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
|
|
23
24
|
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
|
|
24
25
|
datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
|
|
25
26
|
datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
|
|
26
|
-
datachain/client/fsspec.py,sha256=
|
|
27
|
+
datachain/client/fsspec.py,sha256=CO5LfxlZF58UAywLfMYeZRXDLIzcJepnQyPZfZk0Ies,12236
|
|
27
28
|
datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
|
|
28
29
|
datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
|
|
29
|
-
datachain/client/local.py,sha256=
|
|
30
|
+
datachain/client/local.py,sha256=5OT3yf9QHi0If_dlqKYIYs-if-3oWhfAztMvsSa3YRA,4969
|
|
30
31
|
datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
|
|
31
32
|
datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
|
|
32
33
|
datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
|
|
33
34
|
datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
|
|
34
35
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
35
|
-
datachain/data_storage/metastore.py,sha256=
|
|
36
|
+
datachain/data_storage/metastore.py,sha256=BePe3bVxo-Zuuccok8TLRo4cMHVnAIa8hfZMadbxzqM,52649
|
|
36
37
|
datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
|
|
37
38
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
38
|
-
datachain/data_storage/sqlite.py,sha256=
|
|
39
|
-
datachain/data_storage/warehouse.py,sha256=
|
|
39
|
+
datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
|
|
40
|
+
datachain/data_storage/warehouse.py,sha256=Vwhu_OfcNAoTtg1BHui80VCzlPeTUjZQL0QWziu8awY,32186
|
|
40
41
|
datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
41
|
-
datachain/lib/arrow.py,sha256=
|
|
42
|
+
datachain/lib/arrow.py,sha256=uYn9RQwJy4MsMkhu18_6cgtVO3HkniBcB1NdFmkwtvo,7292
|
|
42
43
|
datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
|
|
43
44
|
datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
|
|
44
45
|
datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
|
|
45
|
-
datachain/lib/dc.py,sha256=
|
|
46
|
-
datachain/lib/file.py,sha256=
|
|
46
|
+
datachain/lib/dc.py,sha256=oc9tPf5G9X3DmVCPaTuuwp8LlLogoYuEHpOkq_W7h6Y,68984
|
|
47
|
+
datachain/lib/file.py,sha256=flKGvmrotXWZqQQafaZQEeSQlqUVTSVWB7JIkEsr0MM,14255
|
|
47
48
|
datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
|
|
48
49
|
datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
|
|
49
|
-
datachain/lib/listing.py,sha256=
|
|
50
|
+
datachain/lib/listing.py,sha256=cHPN5-Fq8yb0gP6DARImhmZWxykDDNqhhJujDxEp53A,4104
|
|
50
51
|
datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
|
|
51
52
|
datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
|
|
52
53
|
datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
|
|
53
54
|
datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
|
|
54
55
|
datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
|
|
55
56
|
datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
|
|
56
|
-
datachain/lib/tar.py,sha256=
|
|
57
|
+
datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
|
|
57
58
|
datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
|
|
58
59
|
datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
|
|
59
60
|
datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
|
|
@@ -67,22 +68,21 @@ datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLi
|
|
|
67
68
|
datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
|
|
68
69
|
datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
|
|
69
70
|
datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
|
|
70
|
-
datachain/query/__init__.py,sha256=
|
|
71
|
+
datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
|
|
71
72
|
datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
|
|
72
|
-
datachain/query/
|
|
73
|
-
datachain/query/
|
|
74
|
-
datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
|
|
73
|
+
datachain/query/dataset.py,sha256=k2jU0uZ86i9vr3On-o7GzHrubK5bCJjZEvz9P8extmw,54347
|
|
74
|
+
datachain/query/dispatch.py,sha256=CFAc09O6UllcyUSSEY1GUlEMPzeO8RYhXinNN4HBl9M,12405
|
|
75
75
|
datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
|
|
76
76
|
datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
|
|
77
77
|
datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
|
|
78
|
-
datachain/query/schema.py,sha256=
|
|
78
|
+
datachain/query/schema.py,sha256=I8zLWJuWl5N332ni9mAzDYtcxMJupVPgWkSDe8spNEk,8019
|
|
79
79
|
datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
|
|
80
|
-
datachain/query/udf.py,sha256=
|
|
80
|
+
datachain/query/udf.py,sha256=HB2hbEuiGA4ch9P2mh9iLA5Jj9mRj-4JFy9VfjTLJ8U,3622
|
|
81
81
|
datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
82
82
|
datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
|
|
83
83
|
datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
|
|
84
84
|
datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
|
|
85
|
-
datachain/sql/types.py,sha256=
|
|
85
|
+
datachain/sql/types.py,sha256=3aXpoxkmCYbw0Dlta5J1enwS8_FuvjfSqyrNZO-dWj4,13383
|
|
86
86
|
datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
|
|
87
87
|
datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
|
|
88
88
|
datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
|
|
@@ -97,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
|
|
|
97
97
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
98
98
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
99
99
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
100
|
-
datachain-0.3.
|
|
101
|
-
datachain-0.3.
|
|
102
|
-
datachain-0.3.
|
|
103
|
-
datachain-0.3.
|
|
104
|
-
datachain-0.3.
|
|
105
|
-
datachain-0.3.
|
|
100
|
+
datachain-0.3.18.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
101
|
+
datachain-0.3.18.dist-info/METADATA,sha256=_LpwSHtaSTA-rz4rG9nHIbO2mLlrlI4mCnlxKx8vePo,17185
|
|
102
|
+
datachain-0.3.18.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
103
|
+
datachain-0.3.18.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
104
|
+
datachain-0.3.18.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
105
|
+
datachain-0.3.18.dist-info/RECORD,,
|
datachain/query/builtins.py
DELETED
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
import hashlib
|
|
2
|
-
import tarfile
|
|
3
|
-
from functools import partial
|
|
4
|
-
|
|
5
|
-
from datachain.sql.types import String
|
|
6
|
-
|
|
7
|
-
from .schema import C, DatasetRow, Object
|
|
8
|
-
from .udf import udf
|
|
9
|
-
|
|
10
|
-
md5 = partial(hashlib.md5, usedforsecurity=False)
|
|
11
|
-
|
|
12
|
-
__all__ = ["checksum", "index_tar"]
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def load_tar(raw):
|
|
16
|
-
with tarfile.open(fileobj=raw, mode="r:") as tar:
|
|
17
|
-
return tar.getmembers()
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
@udf(
|
|
21
|
-
(
|
|
22
|
-
C.source,
|
|
23
|
-
C.path,
|
|
24
|
-
C.size,
|
|
25
|
-
C.is_latest,
|
|
26
|
-
C.last_modified,
|
|
27
|
-
C.version,
|
|
28
|
-
C.etag,
|
|
29
|
-
Object(load_tar),
|
|
30
|
-
),
|
|
31
|
-
DatasetRow.schema,
|
|
32
|
-
)
|
|
33
|
-
def index_tar(
|
|
34
|
-
source,
|
|
35
|
-
parent_path,
|
|
36
|
-
size,
|
|
37
|
-
is_latest,
|
|
38
|
-
last_modified,
|
|
39
|
-
version,
|
|
40
|
-
etag,
|
|
41
|
-
tar_entries,
|
|
42
|
-
):
|
|
43
|
-
# generate original tar files as well, along with subobjects
|
|
44
|
-
yield DatasetRow.create(
|
|
45
|
-
source=source,
|
|
46
|
-
path=parent_path,
|
|
47
|
-
size=size,
|
|
48
|
-
is_latest=bool(is_latest),
|
|
49
|
-
last_modified=last_modified,
|
|
50
|
-
version=version,
|
|
51
|
-
etag=etag,
|
|
52
|
-
)
|
|
53
|
-
|
|
54
|
-
for info in tar_entries:
|
|
55
|
-
if info.isfile():
|
|
56
|
-
full_path = f"{parent_path}/{info.name}"
|
|
57
|
-
yield DatasetRow.create(
|
|
58
|
-
source=source,
|
|
59
|
-
path=full_path,
|
|
60
|
-
size=info.size,
|
|
61
|
-
location={
|
|
62
|
-
"vtype": "tar",
|
|
63
|
-
"offset": info.offset_data,
|
|
64
|
-
"size": info.size,
|
|
65
|
-
"parent": {
|
|
66
|
-
"source": source,
|
|
67
|
-
"path": parent_path,
|
|
68
|
-
"version": version,
|
|
69
|
-
"size": size,
|
|
70
|
-
"etag": etag,
|
|
71
|
-
"location": None,
|
|
72
|
-
},
|
|
73
|
-
},
|
|
74
|
-
)
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
BUFSIZE = 2**18
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
def file_digest(fileobj):
|
|
81
|
-
"""Calculate the digest of a file-like object."""
|
|
82
|
-
buf = bytearray(BUFSIZE) # Reusable buffer to reduce allocations.
|
|
83
|
-
view = memoryview(buf)
|
|
84
|
-
digestobj = md5()
|
|
85
|
-
# From 3.11's hashlib.filedigest()
|
|
86
|
-
while True:
|
|
87
|
-
size = fileobj.readinto(buf)
|
|
88
|
-
if size == 0:
|
|
89
|
-
break # EOF
|
|
90
|
-
digestobj.update(view[:size])
|
|
91
|
-
return digestobj.hexdigest()
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
@udf(params=[Object(file_digest)], output={"checksum": String})
|
|
95
|
-
def checksum(digest):
|
|
96
|
-
return (digest,)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|