datachain 0.18.9__py3-none-any.whl → 0.18.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -17,6 +17,8 @@ from datachain.client import Client
17
17
  from datachain.data_storage.schema import convert_rows_custom_column_types
18
18
  from datachain.data_storage.serializer import Serializable
19
19
  from datachain.dataset import DatasetRecord, StorageURI
20
+ from datachain.lib.file import File
21
+ from datachain.lib.signal_schema import SignalSchema
20
22
  from datachain.node import DirType, DirTypeGroup, Node, NodeWithPath, get_path
21
23
  from datachain.query.batch import RowsOutput
22
24
  from datachain.query.utils import get_query_id_column
@@ -35,7 +37,6 @@ if TYPE_CHECKING:
35
37
  from datachain.data_storage import schema
36
38
  from datachain.data_storage.db_engine import DatabaseEngine
37
39
  from datachain.data_storage.schema import DataTable
38
- from datachain.lib.file import File
39
40
 
40
41
 
41
42
  logger = logging.getLogger("datachain")
@@ -370,14 +371,18 @@ class AbstractWarehouse(ABC, Serializable):
370
371
  if not (self.db.has_table(self.dataset_table_name(dataset.name, version))):
371
372
  return None, None
372
373
 
374
+ file_signals = list(
375
+ SignalSchema.deserialize(dataset.feature_schema).get_signals(File)
376
+ )
377
+
373
378
  dr = self.dataset_rows(dataset, version)
374
379
  table = dr.get_table()
375
380
  expressions: tuple[_ColumnsClauseArgument[Any], ...] = (
376
381
  sa.func.count(table.c.sys__id),
377
382
  )
378
- size_columns = [
379
- c for c in table.columns if c.name == "size" or c.name.endswith("__size")
380
- ]
383
+ size_column_names = [s.replace(".", "__") + "__size" for s in file_signals]
384
+ size_columns = [c for c in table.columns if c.name in size_column_names]
385
+
381
386
  if size_columns:
382
387
  expressions = (*expressions, sa.func.sum(sum(size_columns)))
383
388
  query = sa.select(*expressions)
datachain/lib/dc/json.py CHANGED
@@ -1,18 +1,13 @@
1
1
  import os
2
2
  import os.path
3
3
  import re
4
- from typing import (
5
- TYPE_CHECKING,
6
- Optional,
7
- Union,
8
- )
4
+ from typing import TYPE_CHECKING, Optional, Union
9
5
 
6
+ import cloudpickle
7
+
8
+ from datachain.lib import meta_formats
10
9
  from datachain.lib.data_model import DataType
11
- from datachain.lib.file import (
12
- File,
13
- FileType,
14
- )
15
- from datachain.lib.meta_formats import read_meta
10
+ from datachain.lib.file import File, FileType
16
11
 
17
12
  if TYPE_CHECKING:
18
13
  from typing_extensions import ParamSpec
@@ -76,7 +71,7 @@ def read_json(
76
71
  column = format
77
72
  chain = read_storage(uri=path, type=type, **kwargs)
78
73
  signal_dict = {
79
- column: read_meta(
74
+ column: meta_formats.read_meta(
80
75
  schema_from=schema_from,
81
76
  format=format,
82
77
  spec=spec,
@@ -88,4 +83,7 @@ def read_json(
88
83
  }
89
84
  # disable prefetch if nrows is set
90
85
  settings = {"prefetch": 0} if nrows else {}
86
+
87
+ cloudpickle.register_pickle_by_value(meta_formats)
88
+
91
89
  return chain.settings(**settings).gen(**signal_dict) # type: ignore[misc, arg-type]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.18.9
3
+ Version: 0.18.10
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -51,7 +51,7 @@ datachain/data_storage/metastore.py,sha256=1PaRTQbL7kjcU1BVjiLjXJLrrLzQtUvpqLmm0
51
51
  datachain/data_storage/schema.py,sha256=asZYz1cg_WKfe2Q-k5W51E2z2CzHU5B4QEDZDMFr8yo,9346
52
52
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
53
53
  datachain/data_storage/sqlite.py,sha256=bwZAB_NUMT2WMv5tPQnnLFA0P-PiQtxzSaQ1q6xDxOU,24590
54
- datachain/data_storage/warehouse.py,sha256=RkdX1cunfmpDkRYRdOGNy0kLw7RekIokVl3Dd0i-hrA,31534
54
+ datachain/data_storage/warehouse.py,sha256=imPm4R2V7TkqgGNSO2FGnKu03axU9UVLMfdUPfpwgHE,31747
55
55
  datachain/diff/__init__.py,sha256=-OFZzgOplqO84iWgGY7kfe60NXaWR9JRIh9T-uJboAM,9668
56
56
  datachain/fs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
57
57
  datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
@@ -102,7 +102,7 @@ datachain/lib/dc/database.py,sha256=g5M6NjYR1T0vKte-abV-3Ejnm-HqxTIMir5cRi_SziE,
102
102
  datachain/lib/dc/datachain.py,sha256=5rR_QqG4vesq-x545ZTSFJDSb6Oc5CW4-ziQYD6DpW4,80993
103
103
  datachain/lib/dc/datasets.py,sha256=G65leCuo_3bItmvjoV1wK0pzj7a2IQqe3xRsflpF3xM,10794
104
104
  datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
105
- datachain/lib/dc/json.py,sha256=ZUThPDAaP2gBFIL5vsQTwKBcuN_dhvC_O44wdDv0jEc,2683
105
+ datachain/lib/dc/json.py,sha256=dNijfJ-H92vU3soyR7X1IiDrWhm6yZIGG3bSnZkPdAE,2733
106
106
  datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
107
107
  datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
108
108
  datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
@@ -153,9 +153,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
153
153
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
154
154
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
155
155
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
156
- datachain-0.18.9.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
157
- datachain-0.18.9.dist-info/METADATA,sha256=0BhJEeQiYf41Rg7DLgJ-WtiUu9cpwwUtVwo__lPaMAw,11319
158
- datachain-0.18.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
159
- datachain-0.18.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
160
- datachain-0.18.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
161
- datachain-0.18.9.dist-info/RECORD,,
156
+ datachain-0.18.10.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
157
+ datachain-0.18.10.dist-info/METADATA,sha256=Vjkb16V4J8lNJphVuqD2DZ_V_7BLIf8YPRlvJNtsLaM,11320
158
+ datachain-0.18.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
159
+ datachain-0.18.10.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
160
+ datachain-0.18.10.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
161
+ datachain-0.18.10.dist-info/RECORD,,