datachain-0.18.0-py3-none-any.whl → datachain-0.18.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -674,7 +674,7 @@ class AbstractDBMetastore(AbstractMetastore):
674
674
  dv = self._datasets_versions
675
675
  self.db.execute(
676
676
  self._datasets_versions_update()
677
- .where(dv.c.dataset_id == dataset.id and dv.c.version == version)
677
+ .where(dv.c.dataset_id == dataset.id, dv.c.version == version)
678
678
  .values(values),
679
679
  conn=conn,
680
680
  ) # type: ignore [attr-defined]
datachain/func/array.py CHANGED
@@ -178,6 +178,126 @@ def contains(arr: Union[str, Sequence, Func], elem: Any) -> Func:
178
178
  return Func("contains", inner=inner, cols=cols, args=args, result_type=int)
179
179
 
180
180
 
181
+ def slice(
182
+ arr: Union[str, Sequence, Func],
183
+ offset: int,
184
+ length: Optional[int] = None,
185
+ ) -> Func:
186
+ """
187
+ Returns a slice of the array.
188
+
189
+ Args:
190
+ arr (str | Sequence | Func): Array to check for the element.
191
+ If a string is provided, it is assumed to be the name of the array column.
192
+ If a sequence is provided, it is assumed to be an array of values.
193
+ If a Func is provided, it is assumed to be a function returning an array.
194
+ offset (int): Offset to start the slice from.
195
+ length (int, optional): Length of the slice. If not provided, the slice will
196
+ continue to the end of the array.
197
+
198
+ Returns:
199
+ Func: A Func object that represents the slice function. Result of the
200
+ function will be a slice of the array starting from the offset
201
+ and with the given length.
202
+
203
+ Example:
204
+ ```py
205
+ dc.mutate(
206
+ contains1=func.array.slice("signal.values", 3),
207
+ contains2=func.array.slice([1, 2, 3, 4, 5], 1, 3),
208
+ )
209
+ ```
210
+ """
211
+
212
+ def inner(arg):
213
+ if length is not None:
214
+ return array.slice(arg, offset, length)
215
+ return array.slice(arg, offset)
216
+
217
+ def element_type(el):
218
+ if isinstance(el, list):
219
+ try:
220
+ return list[element_type(el[0])]
221
+ except IndexError:
222
+ # if the array is empty, return list[str] as default type
223
+ return list[str]
224
+ return type(el)
225
+
226
+ def type_from_args(arr, *_):
227
+ if isinstance(arr, list):
228
+ try:
229
+ return list[element_type(arr[0])]
230
+ except IndexError:
231
+ # if the array is empty, return list[str] as default type
232
+ return list[str]
233
+ return None
234
+
235
+ if isinstance(arr, (str, Func)):
236
+ cols = [arr]
237
+ args = None
238
+ else:
239
+ cols = None
240
+ args = [arr]
241
+
242
+ return Func(
243
+ "slice",
244
+ inner=inner,
245
+ cols=cols,
246
+ args=args,
247
+ from_array=True,
248
+ is_array=True,
249
+ type_from_args=type_from_args,
250
+ )
251
+
252
+
253
+ def join(
254
+ arr: Union[str, Sequence, Func],
255
+ sep: str = "",
256
+ ) -> Func:
257
+ """
258
+ Returns a string that is the concatenation of the elements of the array,
259
+
260
+ Args:
261
+ arr (str | Sequence | Func): Array to check for the element.
262
+ If a string is provided, it is assumed to be the name of the array column.
263
+ If a sequence is provided, it is assumed to be an array of values.
264
+ If a Func is provided, it is assumed to be a function returning an array.
265
+ sep (str): Separator to use for the concatenation. Default is an empty string.
266
+
267
+ Returns:
268
+ Func: A Func object that represents the join function. Result of the
269
+ function will be a string that is the concatenation of the elements
270
+ of the array, separated by the given separator.
271
+
272
+ Example:
273
+ ```py
274
+ dc.mutate(
275
+ contains1=func.array.join("signal.values", ":"),
276
+ contains2=func.array.join(["1", "2", "3", "4", "5"], "/"),
277
+ )
278
+ ```
279
+ """
280
+
281
+ def inner(arg):
282
+ return array.join(arg, sep)
283
+
284
+ if isinstance(arr, (str, Func)):
285
+ cols = [arr]
286
+ args = None
287
+ else:
288
+ cols = None
289
+ args = [arr]
290
+
291
+ return Func(
292
+ "join",
293
+ inner=inner,
294
+ cols=cols,
295
+ args=args,
296
+ from_array=True,
297
+ result_type=str,
298
+ )
299
+
300
+
181
301
  def get_element(arg: Union[str, Sequence, Func], index: int) -> Func:
182
302
  """
183
303
  Returns the element at the given index from the array.
datachain/func/func.py CHANGED
@@ -108,18 +108,20 @@ class Func(Function):
108
108
  )
109
109
 
110
110
  if self.from_array:
111
- if get_origin(col_type) is list:
112
- col_args = get_args(col_type)
113
- if len(col_args) != 1:
114
- raise DataChainColumnError(
115
- str(self),
116
- "Array column must have a single type argument",
117
- )
118
- return col_args[0]
119
- raise DataChainColumnError(
120
- str(self),
121
- "Array column must be of type list",
122
- )
111
+ if get_origin(col_type) is not list:
112
+ raise DataChainColumnError(
113
+ str(self),
114
+ "Array column must be of type list",
115
+ )
116
+ if self.is_array:
117
+ return col_type
118
+ col_args = get_args(col_type)
119
+ if len(col_args) != 1:
120
+ raise DataChainColumnError(
121
+ str(self),
122
+ "Array column must have a single type argument",
123
+ )
124
+ return col_args[0]
123
125
 
124
126
  return list[col_type] if self.is_array else col_type # type: ignore[valid-type]
125
127
 
datachain/lib/file.py CHANGED
@@ -237,7 +237,7 @@ class File(DataModel):
237
237
  @field_validator("path", mode="before")
238
238
  @classmethod
239
239
  def validate_path(cls, path):
240
- return Path(path).as_posix()
240
+ return Path(path).as_posix() if path else ""
241
241
 
242
242
  def model_dump_custom(self):
243
243
  res = self.model_dump()
@@ -195,5 +195,11 @@ class Session:
195
195
  Session.GLOBAL_SESSION_CTX.__exit__(None, None, None)
196
196
 
197
197
  for obj in gc.get_objects(): # Get all tracked objects
198
- if isinstance(obj, Session): # Cleanup temp dataset for session variables.
199
- obj.__exit__(None, None, None)
198
+ try:
199
+ if isinstance(obj, Session):
200
+ # Cleanup temp dataset for session variables.
201
+ obj.__exit__(None, None, None)
202
+ except ReferenceError:
203
+ continue # Object has been finalized already
204
+ except Exception as e: # noqa: BLE001
205
+ logger.error(f"Exception while cleaning up session: {e}") # noqa: G004
@@ -1,6 +1,6 @@
1
1
  from sqlalchemy.sql.functions import GenericFunction
2
2
 
3
- from datachain.sql.types import Boolean, Float, Int64
3
+ from datachain.sql.types import Boolean, Float, Int64, String
4
4
  from datachain.sql.utils import compiler_not_implemented
5
5
 
6
6
 
@@ -48,6 +48,27 @@ class contains(GenericFunction): # noqa: N801
48
48
  inherit_cache = True
49
49
 
50
50
 
51
+ class slice(GenericFunction): # noqa: N801
52
+ """
53
+ Returns a slice of the array.
54
+ """
55
+
56
+ package = "array"
57
+ name = "slice"
58
+ inherit_cache = True
59
+
60
+
61
+ class join(GenericFunction): # noqa: N801
62
+ """
63
+ Returns the concatenation of the array elements.
64
+ """
65
+
66
+ type = String()
67
+ package = "array"
68
+ name = "join"
69
+ inherit_cache = True
70
+
71
+
51
72
  class get_element(GenericFunction): # noqa: N801
52
73
  """
53
74
  Returns the element at the given index in the array.
@@ -88,6 +88,8 @@ def setup():
88
88
  compiles(sql_path.file_ext, "sqlite")(compile_path_file_ext)
89
89
  compiles(array.length, "sqlite")(compile_array_length)
90
90
  compiles(array.contains, "sqlite")(compile_array_contains)
91
+ compiles(array.slice, "sqlite")(compile_array_slice)
92
+ compiles(array.join, "sqlite")(compile_array_join)
91
93
  compiles(array.get_element, "sqlite")(compile_array_get_element)
92
94
  compiles(string.length, "sqlite")(compile_string_length)
93
95
  compiles(string.split, "sqlite")(compile_string_split)
@@ -275,6 +277,15 @@ def register_user_defined_sql_functions() -> None:
275
277
  conn.create_function(
276
278
  "json_array_get_element", 2, py_json_array_get_element, deterministic=True
277
279
  )
280
+ conn.create_function(
281
+ "json_array_slice", 2, py_json_array_slice, deterministic=True
282
+ )
283
+ conn.create_function(
284
+ "json_array_slice", 3, py_json_array_slice, deterministic=True
285
+ )
286
+ conn.create_function(
287
+ "json_array_join", 2, py_json_array_join, deterministic=True
288
+ )
278
289
 
279
290
  _registered_function_creators["array_functions"] = create_array_functions
280
291
 
@@ -454,6 +465,20 @@ def py_json_array_get_element(val, idx):
454
465
  return None
455
466
 
456
467
 
468
+ def py_json_array_slice(val, offset: int, length: Optional[int] = None):
469
+ arr = orjson.loads(val)
470
+ try:
471
+ return orjson.dumps(
472
+ list(arr[offset : offset + length] if length is not None else arr[offset:])
473
+ ).decode("utf-8")
474
+ except IndexError:
475
+ return None
476
+
477
+
478
+ def py_json_array_join(val, sep: str):
479
+ return sep.join(orjson.loads(val))
480
+
481
+
457
482
  def compile_array_get_element(element, compiler, **kwargs):
458
483
  return compiler.process(
459
484
  func.json_array_get_element(*element.clauses.clauses), **kwargs
@@ -470,6 +495,14 @@ def compile_array_contains(element, compiler, **kwargs):
470
495
  )
471
496
 
472
497
 
498
+ def compile_array_slice(element, compiler, **kwargs):
499
+ return compiler.process(func.json_array_slice(*element.clauses.clauses), **kwargs)
500
+
501
+
502
+ def compile_array_join(element, compiler, **kwargs):
503
+ return compiler.process(func.json_array_join(*element.clauses.clauses), **kwargs)
504
+
505
+
473
506
  def compile_string_length(element, compiler, **kwargs):
474
507
  return compiler.process(func.length(*element.clauses.clauses), **kwargs)
475
508
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.18.0
3
+ Version: 0.18.2
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -47,7 +47,7 @@ datachain/client/s3.py,sha256=YCtDhKVO_jGsMPeyqe3xk5QsF5lqMabqkt0tPFWUHOM,7286
47
47
  datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
48
48
  datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
49
49
  datachain/data_storage/job.py,sha256=9r0OGwh22bHNIvLHqg8_-eJSP1YYB-BN5HOla5TdCxw,402
50
- datachain/data_storage/metastore.py,sha256=vo2ab-U_-BKfeFYTmvpbCoMyMZEVxrVqM9DjjTg4UPk,38309
50
+ datachain/data_storage/metastore.py,sha256=PGSDRwEckKR759CRgBGFC9aK_shBGLoFcHRuXJliqdc,38306
51
51
  datachain/data_storage/schema.py,sha256=asZYz1cg_WKfe2Q-k5W51E2z2CzHU5B4QEDZDMFr8yo,9346
52
52
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
53
53
  datachain/data_storage/sqlite.py,sha256=bwZAB_NUMT2WMv5tPQnnLFA0P-PiQtxzSaQ1q6xDxOU,24590
@@ -58,10 +58,10 @@ datachain/fs/reference.py,sha256=A8McpXF0CqbXPqanXuvpKu50YLB3a2ZXA3YAPxtBXSM,914
58
58
  datachain/fs/utils.py,sha256=s-FkTOCGBk-b6TT3toQH51s9608pofoFjUSTc1yy7oE,825
59
59
  datachain/func/__init__.py,sha256=CjNLHfJkepdXdRZ6HjJBjNSIjOeFMuMkwPDaPUrM75g,1270
60
60
  datachain/func/aggregate.py,sha256=UfxENlw56Qv3UEkj2sZ-JZHmr9q8Rnic9io9_63gF-E,10942
61
- datachain/func/array.py,sha256=OmmjdK5AQyBXa_7NXFUPY_m3lFlRK4Um4J9NCtYwvak,8394
61
+ datachain/func/array.py,sha256=xWiXYc2OgmndRoNtvczvkbpa9qNQalzFS0S3408jbSA,11872
62
62
  datachain/func/base.py,sha256=wA0sBQAVyN9LPxoo7Ox83peS0zUVnyuKxukwAcjGLfY,534
63
63
  datachain/func/conditional.py,sha256=HkNamQr9dLyIMDEbIeO6CZR0emQoDqeaWrZ1fECod4M,8062
64
- datachain/func/func.py,sha256=jzEvnc2iN0BAly-uzxhaoMntL_xF4j94DrFuRi2ADSw,17321
64
+ datachain/func/func.py,sha256=11z24FnhM_U6KQca0er9OSkd4xOTzO3W0tRs0yFg2mM,17375
65
65
  datachain/func/numeric.py,sha256=gMe1Ks0dqQKHkjcpvj7I5S-neECzQ_gltPQLNoaWOyo,5632
66
66
  datachain/func/path.py,sha256=mqN_mfkwv44z2II7DMTp_fGGw95hmTCNls_TOFNpr4k,3155
67
67
  datachain/func/random.py,sha256=pENOLj9rSmWfGCnOsUIaCsVC5486zQb66qfQvXaz9Z4,452
@@ -72,7 +72,7 @@ datachain/lib/arrow.py,sha256=mFO_6wRqzpEzBhXf7Xn1aeLUvaiHcC6XQ-8as9sbcgY,10253
72
72
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
73
73
  datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
74
74
  datachain/lib/dataset_info.py,sha256=d-jz6zeDU5DEgYtyeSF5nK0MU-40FV5km_iOCh4pXzo,3179
75
- datachain/lib/file.py,sha256=HLQXS_WULm7Y-fkHMy0WpibVAcrkLPRS6CrZy6rwFe0,30450
75
+ datachain/lib/file.py,sha256=0oFm1MWU7AatXplxRj-6Xbjjb6A_AvM_awwk9mYb0hc,30466
76
76
  datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
77
77
  datachain/lib/image.py,sha256=butvUY_33PVEYPKX2nVCPeJjJVcBaptZwsE9REQsTS8,3247
78
78
  datachain/lib/listing.py,sha256=lFG1Ms6jwm_uqlOcsBUpkmyeSO9TZdtNd820PEpAHP4,7077
@@ -127,7 +127,7 @@ datachain/query/metrics.py,sha256=DOK5HdNVaRugYPjl8qnBONvTkwjMloLqAr7Mi3TjCO0,85
127
127
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
128
128
  datachain/query/queue.py,sha256=v0UeK4ilmdiRoJ5OdjB5qpnHTYDxRP4vhVp5Iw_toaI,3512
129
129
  datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
130
- datachain/query/session.py,sha256=3nyOvPmLiA86IdHc3BL6Dt_himtHVvaDz_I1h3hZ_gI,6512
130
+ datachain/query/session.py,sha256=6_ydvPasurmc5tR11dzFj51DpUAo4NxXP9p4ltoTauc,6792
131
131
  datachain/query/udf.py,sha256=e753bDJzTNjGFQn1WGTvOAWSwjDbrFI1-_DDWkWN2ls,1343
132
132
  datachain/query/utils.py,sha256=HaSDNH_XGvp_NIcXjcB7j4vJRPi4_tbztDWclYelHY4,1208
133
133
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -140,22 +140,22 @@ datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESm
140
140
  datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
141
141
  datachain/sql/functions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
142
142
  datachain/sql/functions/aggregate.py,sha256=3AQdA8YHPFdtCEfwZKQXTT8SlQWdG9gD5PBtGN3Odqs,944
143
- datachain/sql/functions/array.py,sha256=LkVDq1Iu3pF2vz9opjxcS0oSvEepoLNYVXcmnOzsmY0,1679
143
+ datachain/sql/functions/array.py,sha256=eRWpDRItwIG87-AU7jb8WuiR-MGuhklVxWwR7t97GvY,2050
144
144
  datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
145
145
  datachain/sql/functions/numeric.py,sha256=BK2KCiPSgM2IveCq-9M_PG3CtPBlztaS9TTn1LGzyLs,1250
146
146
  datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0mg,1294
147
147
  datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
148
148
  datachain/sql/functions/string.py,sha256=E-T9OIzUR-GKaLgjZsEtg5CJrY_sLf1lt1awTvY7w2w,1426
149
149
  datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
150
- datachain/sql/sqlite/base.py,sha256=SktpdtyZmxG9ip_UX_0WL3YxP9o66CYTeMfriRrZzaE,20281
150
+ datachain/sql/sqlite/base.py,sha256=ldyWoMBvrYH789GK8xE4wbBNYxz4B-g4BmQAK6_KjR0,21398
151
151
  datachain/sql/sqlite/types.py,sha256=cH6oge2E_YWFy22wY-txPJH8gxoQFSpCthtZR8PZjpo,1849
152
152
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
153
153
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
154
154
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
155
155
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
156
- datachain-0.18.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
157
- datachain-0.18.0.dist-info/METADATA,sha256=seFHQYDt0EnbQiTRz-SixSCKTMFmF9p94Bd0E4lAyvY,11331
158
- datachain-0.18.0.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
159
- datachain-0.18.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
160
- datachain-0.18.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
161
- datachain-0.18.0.dist-info/RECORD,,
156
+ datachain-0.18.2.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
157
+ datachain-0.18.2.dist-info/METADATA,sha256=85UZ7jLkbhT_UI7oaFBD5m1NZ1dL_qn4VKfXCQxTLiY,11331
158
+ datachain-0.18.2.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
159
+ datachain-0.18.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
160
+ datachain-0.18.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
161
+ datachain-0.18.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (80.7.1)
2
+ Generator: setuptools (80.8.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5