datachain 0.32.0__py3-none-any.whl → 0.32.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/__init__.py CHANGED
@@ -37,7 +37,7 @@ from datachain.lib.file import (
37
37
  VideoFrame,
38
38
  )
39
39
  from datachain.lib.model_store import ModelStore
40
- from datachain.lib.namespaces import delete as delete_namespace
40
+ from datachain.lib.namespaces import delete_namespace
41
41
  from datachain.lib.projects import create as create_project
42
42
  from datachain.lib.udf import Aggregator, Generator, Mapper
43
43
  from datachain.lib.utils import AbstractUDF, DataChainError
@@ -1,8 +1,14 @@
1
1
  import inspect
2
+ import sys
2
3
  from datetime import datetime
3
4
  from enum import Enum
4
5
  from typing import Annotated, Literal, Union, get_args, get_origin
5
6
 
7
+ if sys.version_info >= (3, 10):
8
+ from types import UnionType
9
+ else:
10
+ UnionType = None
11
+
6
12
  from pydantic import BaseModel
7
13
  from typing_extensions import Literal as LiteralEx
8
14
 
@@ -34,6 +40,13 @@ PYTHON_TO_SQL = {
34
40
  }
35
41
 
36
42
 
43
+ def _is_union(orig) -> bool:
44
+ if orig == Union:
45
+ return True
46
+ # some code is unreachable in python<3.10
47
+ return UnionType is not None and orig is UnionType # type: ignore[unreachable]
48
+
49
+
37
50
  def python_to_sql(typ): # noqa: PLR0911
38
51
  if inspect.isclass(typ):
39
52
  if issubclass(typ, SQLType):
@@ -69,9 +82,10 @@ def python_to_sql(typ): # noqa: PLR0911
69
82
  if inspect.isclass(orig) and issubclass(dict, orig):
70
83
  return JSON
71
84
 
72
- if orig == Union:
85
+ if _is_union(orig):
73
86
  if len(args) == 2 and (type(None) in args):
74
- return python_to_sql(args[0])
87
+ non_none_arg = args[0] if args[0] is not type(None) else args[1]
88
+ return python_to_sql(non_none_arg)
75
89
 
76
90
  if _is_union_str_literal(orig, args):
77
91
  return String
@@ -95,7 +109,7 @@ def list_of_args_to_type(args) -> SQLType:
95
109
 
96
110
 
97
111
  def _is_json_inside_union(orig, args) -> bool:
98
- if orig == Union and len(args) >= 2:
112
+ if _is_union(orig) and len(args) >= 2:
99
113
  # List in JSON: Union[dict, list[dict]]
100
114
  args_no_nones = [arg for arg in args if arg != type(None)] # noqa: E721
101
115
  if len(args_no_nones) == 2:
@@ -112,6 +126,6 @@ def _is_json_inside_union(orig, args) -> bool:
112
126
 
113
127
 
114
128
  def _is_union_str_literal(orig, args) -> bool:
115
- if orig != Union:
129
+ if not _is_union(orig):
116
130
  return False
117
131
  return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)
@@ -77,7 +77,7 @@ def ls(session: Optional[Session] = None) -> list[Namespace]:
77
77
  return Session.get(session).catalog.metastore.list_namespaces()
78
78
 
79
79
 
80
- def delete(name: str, session: Optional[Session]) -> None:
80
+ def delete_namespace(name: str, session: Optional[Session]) -> None:
81
81
  """
82
82
  Removes a namespace by name.
83
83
 
@@ -88,14 +88,13 @@ def delete(name: str, session: Optional[Session]) -> None:
88
88
  as these cannot be removed.
89
89
 
90
90
  Parameters:
91
- name : The name of the namespace.
92
- session : Session to use for getting project.
91
+ name: The name of the namespace.
92
+ session: Session to use for getting project.
93
93
 
94
94
  Example:
95
95
  ```py
96
96
  import datachain as dc
97
- from datachain.lib.namespace import delete as delete_namespace
98
- delete_namespace("dev")
97
+ dc.delete_namespace("dev")
99
98
  ```
100
99
  """
101
100
  session = Session.get(session)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datachain
3
- Version: 0.32.0
3
+ Version: 0.32.1
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License-Expression: Apache-2.0
@@ -210,45 +210,33 @@ datasets that evolve over time and may occasionally have processing errors.
210
210
  .. code:: py
211
211
 
212
212
  import datachain as dc
213
- from datachain import C, File
214
213
 
215
- def process_file(file: File):
216
- """Process a file, which may occasionally fail."""
214
+ def process_file(file: dc.File) -> tuple[str, str, str]:
215
+ """Analyze a file, may occasionally fail."""
217
216
  try:
218
217
  # Your processing logic here
219
218
  content = file.read_text()
220
- result = analyze_content(content)
221
- return {
222
- "content": content,
223
- "result": result,
224
- "error": None # No error
225
- }
219
+ result = content.upper()
220
+ return content, result, "" # No error
226
221
  except Exception as e:
227
222
  # Return an error that will trigger reprocessing next time
228
- return {
229
- "content": None,
230
- "result": None,
231
- "error": str(e) # Error field will trigger retry
232
- }
223
+ return "", "", str(e) # Error field will trigger retry
233
224
 
234
225
  # Process files efficiently with delta and retry
226
+ # Run it many times, keep adding files, to see delta and retry in action
235
227
  chain = (
236
228
  dc.read_storage(
237
229
  "data/",
238
230
  update=True,
239
231
  delta=True, # Process only new/changed files
240
232
  delta_on="file.path", # Identify files by path
241
- retry_on="error" # Field that indicates errors
233
+ delta_retry="error", # Process files with error again
242
234
  )
243
- .map(processed_result=process_file)
244
- .mutate(
245
- content=C("processed_result.content"),
246
- result=C("processed_result.result"),
247
- error=C("processed_result.error")
248
- )
249
- .save(name="processed_data")
235
+ .map(process_file, output=("content", "result", "error"))
236
+ .save("processed-data")
250
237
  )
251
238
 
239
+
252
240
  Example: LLM based text-file evaluation
253
241
  ---------------------------------------
254
242
 
@@ -1,4 +1,4 @@
1
- datachain/__init__.py,sha256=5DFB1P58c35C_WBMrhmaynsP1WwCukC-9gTJIaPy0E8,1832
1
+ datachain/__init__.py,sha256=BRqfLPoBRRycnndaxyba-i4ZrZCJl0As2pwV9RiNBr8,1822
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
4
4
  datachain/cache.py,sha256=ESVRaCJXEThMIfGEFVHx6wJPOZA7FYk9V6WxjyuqUBY,3626
@@ -82,7 +82,7 @@ datachain/lib/listing.py,sha256=U-2stsTEwEsq4Y80dqGfktGzkmB5-ZntnL1_rzXlH0k,7089
82
82
  datachain/lib/listing_info.py,sha256=9ua40Hw0aiQByUw3oAEeNzMavJYfW0Uhe8YdCTK-m_g,1110
83
83
  datachain/lib/meta_formats.py,sha256=zdyg6XLk3QIsSk3I7s0Ez5kaCJSlE3uq7JiGxf7UwtU,6348
84
84
  datachain/lib/model_store.py,sha256=A0pSVQ7uaZ9RvANapzirF8Cqq9N6ysosPpMSkzdRPkU,3226
85
- datachain/lib/namespaces.py,sha256=ij67QHnRJhC8uquR21aD8u1Um2jfxnBX8PecuOQZpYw,3828
85
+ datachain/lib/namespaces.py,sha256=IMfEgs680C3XpaLAiWIPhy8Wry4jT8iyRslcKV2cFOM,3768
86
86
  datachain/lib/projects.py,sha256=_YeU9PPcH_pC8-sbX-47XtWSdl1ltVKnALY8azWLJkM,4112
87
87
  datachain/lib/pytorch.py,sha256=S-st2SAczYut13KMf6eSqP_OQ8otWI5TRmzhK5fN3k0,7828
88
88
  datachain/lib/settings.py,sha256=xBQEPZfgaYKhHIFLd0u5CBTYDcJS8ZHCm47x7GJErFU,7666
@@ -97,7 +97,7 @@ datachain/lib/webdataset.py,sha256=CkW8FfGigNx6wo2EEK4KMjhEE8FamRHWGs2HZuH7jDY,7
97
97
  datachain/lib/webdataset_laion.py,sha256=xvT6m_r5y0KbOx14BUe7UC5mOgrktJq53Mh-H0EVlUE,2525
98
98
  datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
99
99
  datachain/lib/convert/flatten.py,sha256=_5rjGFnN6t1KCX5ftL5rG7tiiNat7j0SdNqajO15KUY,1539
100
- datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djMP3t8CHJLo,3188
100
+ datachain/lib/convert/python_to_sql.py,sha256=ACIHtiPujlG9DRChSlxifcMJCls1PvrB273w_cgR6OQ,3584
101
101
  datachain/lib/convert/sql_to_python.py,sha256=Gxc4FylWC_Pvvuawuc2MKZIiuAWI7wje8pyeN1MxRrU,670
102
102
  datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
103
103
  datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUOzHUGPoyZXAB0,4360
@@ -161,9 +161,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
161
161
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
162
162
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
163
163
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
164
- datachain-0.32.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
165
- datachain-0.32.0.dist-info/METADATA,sha256=gLsH5khLc-z_s0MKTt3H6qk_UXDAaWuHy2lk1yytgBw,13898
166
- datachain-0.32.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
167
- datachain-0.32.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
168
- datachain-0.32.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
169
- datachain-0.32.0.dist-info/RECORD,,
164
+ datachain-0.32.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
165
+ datachain-0.32.1.dist-info/METADATA,sha256=4T7E05Y-6wELDtaFv4De-D3dVQPk3Fm_-B3WqKkLvQQ,13607
166
+ datachain-0.32.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
167
+ datachain-0.32.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
168
+ datachain-0.32.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
169
+ datachain-0.32.1.dist-info/RECORD,,