datachain 0.3.16__py3-none-any.whl → 0.3.18__py3-none-any.whl

This diff shows the changes between two publicly released versions of a package, exactly as those versions appear in the public registry they were published to. It is provided for informational purposes only.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/telemetry.py ADDED
@@ -0,0 +1,37 @@
1
+ import logging
2
+ import os
3
+ from importlib.metadata import PackageNotFoundError, version
4
+
5
+ from iterative_telemetry import IterativeTelemetryLogger
6
+
7
+ from datachain.utils import env2bool
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def is_enabled():
13
+ """
14
+ Determine if telemetry is enabled based on environment variables and configuration.
15
+ """
16
+ # Disable telemetry if running in test mode
17
+ if env2bool("DATACHAIN_TEST"):
18
+ return False
19
+
20
+ # Check if telemetry is disabled by environment variable
21
+ disabled = bool(os.getenv("DATACHAIN_NO_ANALYTICS"))
22
+ if disabled:
23
+ logger.debug("Telemetry is disabled by environment variable.")
24
+ return False
25
+
26
+ logger.debug("Telemetry is enabled.")
27
+ return True
28
+
29
+
30
+ # Try to get the version of the datachain package
31
+ try:
32
+ __version__ = version("datachain")
33
+ except PackageNotFoundError:
34
+ __version__ = "unknown"
35
+
36
+ # Initialize telemetry logger
37
+ telemetry = IterativeTelemetryLogger("datachain", __version__, is_enabled)
datachain/utils.py CHANGED
@@ -1,10 +1,10 @@
1
1
  import glob
2
- import importlib.util
3
2
  import io
4
3
  import json
5
4
  import os
6
5
  import os.path as osp
7
6
  import random
7
+ import re
8
8
  import stat
9
9
  import sys
10
10
  import time
@@ -198,45 +198,6 @@ def get_envs_by_prefix(prefix: str) -> dict[str, str]:
198
198
  return variables
199
199
 
200
200
 
201
- def import_object(object_spec):
202
- filename, identifier = object_spec.rsplit(":", 1)
203
- filename = filename.strip()
204
- identifier = identifier.strip()
205
-
206
- if not identifier.isidentifier() or not filename.endswith(".py"):
207
- raise ValueError(f"Invalid object spec: {object_spec}")
208
-
209
- modname = os.path.abspath(filename)
210
- if modname in sys.modules:
211
- module = sys.modules[modname]
212
- else:
213
- # Use importlib to find and load the module from the given filename
214
- spec = importlib.util.spec_from_file_location(modname, filename)
215
- module = importlib.util.module_from_spec(spec)
216
- sys.modules[modname] = module
217
- spec.loader.exec_module(module)
218
-
219
- return getattr(module, identifier)
220
-
221
-
222
- def parse_params_string(params: str):
223
- """
224
- Parse a string containing UDF class constructor parameters in the form
225
- `a, b, key=val` into *args and **kwargs.
226
- """
227
- args = []
228
- kwargs = {}
229
- for part in params.split():
230
- if "=" in part:
231
- key, val = part.split("=")
232
- kwargs[key] = val
233
- else:
234
- args.append(part)
235
- if any((args, kwargs)):
236
- return args, kwargs
237
- return None, None
238
-
239
-
240
201
  _T_co = TypeVar("_T_co", covariant=True)
241
202
 
242
203
 
@@ -450,3 +411,13 @@ def get_datachain_executable() -> list[str]:
450
411
  def uses_glob(path: str) -> bool:
451
412
  """Checks if some URI path has glob syntax in it"""
452
413
  return glob.has_magic(os.path.basename(os.path.normpath(path)))
414
+
415
+
416
+ def env2bool(var, undefined=False):
417
+ """
418
+ undefined: return value if env var is unset
419
+ """
420
+ var = os.getenv(var, None)
421
+ if var is None:
422
+ return undefined
423
+ return bool(re.search("1|y|yes|true", var, flags=re.IGNORECASE))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.16
3
+ Version: 0.3.18
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -43,6 +43,7 @@ Requires-Dist: Pillow <11,>=10.0.0
43
43
  Requires-Dist: msgpack <2,>=1.0.4
44
44
  Requires-Dist: psutil
45
45
  Requires-Dist: huggingface-hub
46
+ Requires-Dist: iterative-telemetry >=0.0.9
46
47
  Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
47
48
  Provides-Extra: dev
48
49
  Requires-Dist: datachain[docs,tests] ; extra == 'dev'
@@ -63,9 +64,10 @@ Requires-Dist: datachain[tests] ; extra == 'examples'
63
64
  Requires-Dist: numpy <2,>=1 ; extra == 'examples'
64
65
  Requires-Dist: defusedxml ; extra == 'examples'
65
66
  Requires-Dist: accelerate ; extra == 'examples'
66
- Requires-Dist: unstructured[pdf] ; extra == 'examples'
67
+ Requires-Dist: unstructured[embed-huggingface,pdf] ; extra == 'examples'
67
68
  Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
68
69
  Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
70
+ Requires-Dist: onnx ==1.16.1 ; extra == 'examples'
69
71
  Provides-Extra: hf
70
72
  Requires-Dist: numba >=0.60.0 ; extra == 'hf'
71
73
  Requires-Dist: datasets[audio,vision] >=2.21.0 ; extra == 'hf'
@@ -78,7 +80,7 @@ Requires-Dist: pytest <9,>=8 ; extra == 'tests'
78
80
  Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
79
81
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
80
82
  Requires-Dist: pytest-mock >=3.12.0 ; extra == 'tests'
81
- Requires-Dist: pytest-servers[all] >=0.5.5 ; extra == 'tests'
83
+ Requires-Dist: pytest-servers[all] >=0.5.7 ; extra == 'tests'
82
84
  Requires-Dist: pytest-benchmark[histogram] ; extra == 'tests'
83
85
  Requires-Dist: pytest-xdist >=3.3.1 ; extra == 'tests'
84
86
  Requires-Dist: virtualenv ; extra == 'tests'
@@ -1,59 +1,60 @@
1
1
  datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=Lg3Ck1PQLjQziMx9KU4atzbEnJXTE0924WMYkhgWtGU,8247
4
- datachain/cache.py,sha256=WP-ktH_bRn3w2g1JOOQ7rCPsZyR4OM6K1Kb7yZsSSns,4056
5
- datachain/cli.py,sha256=alMjnoBUBLvBSMBR51N09rA_aUEdHJwyxSRogF7VbbA,30891
4
+ datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
5
+ datachain/cli.py,sha256=tRuUvlFey5zYE0UVkGylqGiG5t89gUBo2SJ_yPsvu1I,30129
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
8
  datachain/dataset.py,sha256=sHnsmKfMg2bK88gZH1izk8jlbmJDEhQpyOemdaPQVFo,14761
9
9
  datachain/error.py,sha256=OnZ8OaBtDdTZPy8XQiy29SAjqdQArQeorYbP5ju7ldc,1199
10
10
  datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
11
- datachain/listing.py,sha256=vfjOlcb98A7xkGGKWEYON6l7lfrOqNv6kldmdVnlJn4,8178
12
- datachain/node.py,sha256=2pF3Y9oYzElfiUBcw2LIv7LNNt--V4E-K021zjv0b0I,4748
13
- datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
14
- datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
15
- datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
11
+ datachain/listing.py,sha256=TkMmBzCiru26x4RaZiagWJTmTGbiy6yGrAsSJMr8cFE,8213
12
+ datachain/node.py,sha256=ThE6Ue4BqpaBvrkFFJW_ljLxchixUX2aWz3l_nbwY54,5195
13
+ datachain/nodes_fetcher.py,sha256=F-73-h19HHNGtHFBGKk7p3mc0ALm4a9zGnzhtuUjnp4,1107
14
+ datachain/nodes_thread_pool.py,sha256=uPo-xl8zG5m9YgODjPFBpbcqqHjI-dcxH87yAbj_qco,3192
15
+ datachain/progress.py,sha256=5KotcvvzAUL_RF0GEj4JY0IB1lyImnmHxe89YkT1XO4,4330
16
16
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
18
- datachain/utils.py,sha256=Z9-lPNvrrAh_VWpzVBJ7L5-Oy_Oo1V0ZW7G0MVDyPK4,13065
18
+ datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
19
+ datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
19
20
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
20
- datachain/catalog/catalog.py,sha256=kPg5ILeCWSjXCj3ewUZY6kzj36HTEqajB3mJDkbs-Vo,69023
21
+ datachain/catalog/catalog.py,sha256=tICInYEeCRJow9hNSFnlA50hCOjFPN7fyGgoN5shcf8,67985
21
22
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
22
23
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
23
24
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
24
25
  datachain/client/azure.py,sha256=ffxs26zm6KLAL1aUWJm-vtzuZP3LSNha7UDGXynMBKo,2234
25
26
  datachain/client/fileslice.py,sha256=bT7TYco1Qe3bqoc8aUkUZcPdPofJDHlryL5BsTn9xsY,3021
26
- datachain/client/fsspec.py,sha256=0i4EJIwdx_UNZlbSsUeohWjgVg4B5xoGxTYZKwXS22U,13459
27
+ datachain/client/fsspec.py,sha256=CO5LfxlZF58UAywLfMYeZRXDLIzcJepnQyPZfZk0Ies,12236
27
28
  datachain/client/gcs.py,sha256=cnTIr5GS6dbYOEYfqehhyQu3dr6XNjPHSg5U3FkivUk,4124
28
29
  datachain/client/hf.py,sha256=k24bpa6FEKNQn9zhoNC9kCigDwFSqobLsCnN_Nuzwh4,922
29
- datachain/client/local.py,sha256=LTyISV4oNSOPUdsai5eNZYCGXNCn8rNGuAI0bdgbtnU,5006
30
+ datachain/client/local.py,sha256=5OT3yf9QHi0If_dlqKYIYs-if-3oWhfAztMvsSa3YRA,4969
30
31
  datachain/client/s3.py,sha256=CVHBUZ1Ic2Q3370nl-Bbe69phuWjFlrVv9dTJKBpRT0,6019
31
32
  datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZv32Y8,398
32
33
  datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
33
34
  datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
34
35
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
35
- datachain/data_storage/metastore.py,sha256=cHN0xmbUvChyayHHZm3Vqxr87jFqojPSlGBqhTPStlE,54519
36
+ datachain/data_storage/metastore.py,sha256=BePe3bVxo-Zuuccok8TLRo4cMHVnAIa8hfZMadbxzqM,52649
36
37
  datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
37
38
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
38
- datachain/data_storage/sqlite.py,sha256=3OehNpYb4WJYt4RhPxZrQn9UL1yiHX7Fp1W53o-Y1NA,28788
39
- datachain/data_storage/warehouse.py,sha256=g_yWXpw5iC-VYi8gH0ctDlwO3Mo6AT-32j3Nw6TFgqw,32857
39
+ datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
40
+ datachain/data_storage/warehouse.py,sha256=Vwhu_OfcNAoTtg1BHui80VCzlPeTUjZQL0QWziu8awY,32186
40
41
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- datachain/lib/arrow.py,sha256=voY9KuJ2uhPxw_DS6rIjwfKjWXi84T3LFJ7kGFcDQuk,7272
42
+ datachain/lib/arrow.py,sha256=uYn9RQwJy4MsMkhu18_6cgtVO3HkniBcB1NdFmkwtvo,7292
42
43
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
43
44
  datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
44
45
  datachain/lib/dataset_info.py,sha256=srPPhI2UHf6hFPBecyFEVw2SS5aPisIIMsvGgKqi7ss,2366
45
- datachain/lib/dc.py,sha256=HERJNR4TISbaAtSLARV72INgKPfQRItyd1l28P-GtzU,68871
46
- datachain/lib/file.py,sha256=elQLorLbIkusuQSVfiuC_KrGSZI8cGm-iT8fHmckJlo,13774
46
+ datachain/lib/dc.py,sha256=oc9tPf5G9X3DmVCPaTuuwp8LlLogoYuEHpOkq_W7h6Y,68984
47
+ datachain/lib/file.py,sha256=flKGvmrotXWZqQQafaZQEeSQlqUVTSVWB7JIkEsr0MM,14255
47
48
  datachain/lib/hf.py,sha256=cPnmLuprr0pYABH7KqA5FARQ1JGlywdDwD3yDzVAm4k,5920
48
49
  datachain/lib/image.py,sha256=AMXYwQsmarZjRbPCZY3M1jDsM2WAB_b3cTY4uOIuXNU,2675
49
- datachain/lib/listing.py,sha256=e4O1gs3rKJ0eGwb0hSEfD-l9U7x-f-TYqYGF7Ni-x38,3973
50
+ datachain/lib/listing.py,sha256=cHPN5-Fq8yb0gP6DARImhmZWxykDDNqhhJujDxEp53A,4104
50
51
  datachain/lib/listing_info.py,sha256=36NZ-tXY5Y118wurkajuWWbcE8UCjkRwZlacDtN9F3g,954
51
52
  datachain/lib/meta_formats.py,sha256=3f-0vpMTesagS9iMd3y9-u9r-7g0eqYsxmK4fVfNWlw,6635
52
53
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
53
54
  datachain/lib/pytorch.py,sha256=8LNyFaBrx8zws--MEsFg5g3pb8oLnaQAUlgGvtjKxX4,5960
54
55
  datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
55
56
  datachain/lib/signal_schema.py,sha256=iqgubjCBRiUJB30miv05qFX4uU04dA_Pzi3DCUsHZGs,24177
56
- datachain/lib/tar.py,sha256=d7FpYyxbHCL1twRt_Oe9QoPbZa2Tn5lj7iWP0HvvRn0,999
57
+ datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
57
58
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
58
59
  datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
59
60
  datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -67,22 +68,21 @@ datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLi
67
68
  datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
68
69
  datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
69
70
  datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
70
- datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
71
+ datachain/query/__init__.py,sha256=0NBOZVgIDpCcj1Ci883dQ9A0iiwe03xzmotkOCFbxYc,293
71
72
  datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
72
- datachain/query/builtins.py,sha256=U6yHPF9bzxqK5iwyqCqbJxo8ggBVx9FtuXxRrQQ0SNM,2244
73
- datachain/query/dataset.py,sha256=tBmAlcz6orJbKWkcvGVE4wom-EWInFaXHJYMSpVZnhA,58892
74
- datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
73
+ datachain/query/dataset.py,sha256=k2jU0uZ86i9vr3On-o7GzHrubK5bCJjZEvz9P8extmw,54347
74
+ datachain/query/dispatch.py,sha256=CFAc09O6UllcyUSSEY1GUlEMPzeO8RYhXinNN4HBl9M,12405
75
75
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
76
76
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
77
77
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
78
- datachain/query/schema.py,sha256=ytlkA1xFAUOia25u8d6pxvxBSRl3uivLuOe2eHaw-qc,7550
78
+ datachain/query/schema.py,sha256=I8zLWJuWl5N332ni9mAzDYtcxMJupVPgWkSDe8spNEk,8019
79
79
  datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
80
- datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
80
+ datachain/query/udf.py,sha256=HB2hbEuiGA4ch9P2mh9iLA5Jj9mRj-4JFy9VfjTLJ8U,3622
81
81
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
82
82
  datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
83
83
  datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
84
84
  datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
85
- datachain/sql/types.py,sha256=1ofJjgzKTxFLl1WaMSI9pLvdHGZ1U24I0z5i-gChqDI,13305
85
+ datachain/sql/types.py,sha256=3aXpoxkmCYbw0Dlta5J1enwS8_FuvjfSqyrNZO-dWj4,13383
86
86
  datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
87
87
  datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
88
88
  datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
@@ -97,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
97
97
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
98
98
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
99
99
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
100
- datachain-0.3.16.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
- datachain-0.3.16.dist-info/METADATA,sha256=EjMy4f4OVbwVttlWRzzXRLr-uAEAGNMPMmge96_CI2o,17073
102
- datachain-0.3.16.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
103
- datachain-0.3.16.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
- datachain-0.3.16.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
- datachain-0.3.16.dist-info/RECORD,,
100
+ datachain-0.3.18.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
+ datachain-0.3.18.dist-info/METADATA,sha256=_LpwSHtaSTA-rz4rG9nHIbO2mLlrlI4mCnlxKx8vePo,17185
102
+ datachain-0.3.18.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
103
+ datachain-0.3.18.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
+ datachain-0.3.18.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
+ datachain-0.3.18.dist-info/RECORD,,
@@ -1,96 +0,0 @@
1
- import hashlib
2
- import tarfile
3
- from functools import partial
4
-
5
- from datachain.sql.types import String
6
-
7
- from .schema import C, DatasetRow, Object
8
- from .udf import udf
9
-
10
- md5 = partial(hashlib.md5, usedforsecurity=False)
11
-
12
- __all__ = ["checksum", "index_tar"]
13
-
14
-
15
- def load_tar(raw):
16
- with tarfile.open(fileobj=raw, mode="r:") as tar:
17
- return tar.getmembers()
18
-
19
-
20
- @udf(
21
- (
22
- C.source,
23
- C.path,
24
- C.size,
25
- C.is_latest,
26
- C.last_modified,
27
- C.version,
28
- C.etag,
29
- Object(load_tar),
30
- ),
31
- DatasetRow.schema,
32
- )
33
- def index_tar(
34
- source,
35
- parent_path,
36
- size,
37
- is_latest,
38
- last_modified,
39
- version,
40
- etag,
41
- tar_entries,
42
- ):
43
- # generate original tar files as well, along with subobjects
44
- yield DatasetRow.create(
45
- source=source,
46
- path=parent_path,
47
- size=size,
48
- is_latest=bool(is_latest),
49
- last_modified=last_modified,
50
- version=version,
51
- etag=etag,
52
- )
53
-
54
- for info in tar_entries:
55
- if info.isfile():
56
- full_path = f"{parent_path}/{info.name}"
57
- yield DatasetRow.create(
58
- source=source,
59
- path=full_path,
60
- size=info.size,
61
- location={
62
- "vtype": "tar",
63
- "offset": info.offset_data,
64
- "size": info.size,
65
- "parent": {
66
- "source": source,
67
- "path": parent_path,
68
- "version": version,
69
- "size": size,
70
- "etag": etag,
71
- "location": None,
72
- },
73
- },
74
- )
75
-
76
-
77
- BUFSIZE = 2**18
78
-
79
-
80
- def file_digest(fileobj):
81
- """Calculate the digest of a file-like object."""
82
- buf = bytearray(BUFSIZE) # Reusable buffer to reduce allocations.
83
- view = memoryview(buf)
84
- digestobj = md5()
85
- # From 3.11's hashlib.filedigest()
86
- while True:
87
- size = fileobj.readinto(buf)
88
- if size == 0:
89
- break # EOF
90
- digestobj.update(view[:size])
91
- return digestobj.hexdigest()
92
-
93
-
94
- @udf(params=[Object(file_digest)], output={"checksum": String})
95
- def checksum(digest):
96
- return (digest,)