datachain 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -1627,8 +1627,17 @@ class Catalog:
         version = self.get_dataset(dataset_name).get_version(dataset_version)
 
         file_signals_values = {}
+        file_schemas = {}
+        # TODO: To remove after we properly fix deserialization
+        for signal, type_name in version.feature_schema.items():
+            from datachain.lib.model_store import ModelStore
 
-        schema = SignalSchema.deserialize(version.feature_schema)
+            type_name_parsed, v = ModelStore.parse_name_version(type_name)
+            fr = ModelStore.get(type_name_parsed, v)
+            if fr and issubclass(fr, File):
+                file_schemas[signal] = type_name
+
+        schema = SignalSchema.deserialize(file_schemas)
         for file_signals in schema.get_signals(File):
             prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
             file_signals_values[file_signals] = {
@@ -67,7 +67,11 @@ def convert_rows_custom_column_types(
     for row in rows:
         row_list = list(row)
         for idx, t in custom_columns_types:
-            row_list[idx] = t.on_read_convert(row_list[idx], dialect)
+            row_list[idx] = (
+                t.default_value(dialect)
+                if row_list[idx] is None
+                else t.on_read_convert(row_list[idx], dialect)
+            )
 
         yield tuple(row_list)
 
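With this change, a NULL read from the database is replaced by the column type's `default_value` for the active dialect instead of being passed to `on_read_convert`. A minimal sketch of the same branching, using a stand-in converter class rather than datachain's real `SQLType` types (the `FloatType` name is illustrative):

.. code:: py

    class FloatType:
        # Stand-in for an SQLType-like converter; mirrors the float default
        # introduced in this release (NaN instead of None).
        def default_value(self, dialect):
            return float("nan")

        def on_read_convert(self, value, dialect):
            return float(value)

    def convert_value(t, value, dialect=None):
        # Same None-vs-convert branching as convert_rows_custom_column_types above.
        return t.default_value(dialect) if value is None else t.on_read_convert(value, dialect)

    print(convert_value(FloatType(), None))   # nan
    print(convert_value(FloatType(), "1.5"))  # 1.5
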
@@ -136,7 +140,15 @@ class DataTable:
         self.column_types: dict[str, SQLType] = column_types or {}
 
     @staticmethod
-    def copy_column(column: sa.Column):
+    def copy_column(
+        column: sa.Column,
+        primary_key: Optional[bool] = None,
+        index: Optional[bool] = None,
+        nullable: Optional[bool] = None,
+        default: Optional[Any] = None,
+        server_default: Optional[Any] = None,
+        unique: Optional[bool] = None,
+    ) -> sa.Column:
         """
         Copy a sqlalchemy Column object intended for use as a signal column.
 
@@ -150,12 +162,14 @@ class DataTable:
         return sa.Column(
             column.name,
             column.type,
-            primary_key=column.primary_key,
-            index=column.index,
-            nullable=column.nullable,
-            default=column.default,
-            server_default=column.server_default,
-            unique=column.unique,
+            primary_key=primary_key if primary_key is not None else column.primary_key,
+            index=index if index is not None else column.index,
+            nullable=nullable if nullable is not None else column.nullable,
+            default=default if default is not None else column.default,
+            server_default=(
+                server_default if server_default is not None else column.server_default
+            ),
+            unique=unique if unique is not None else column.unique,
         )
 
     @classmethod
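
Judging from the new signature, any keyword left as `None` falls back to the corresponding attribute of the source column, so callers can now override a single attribute while copying. A sketch of such a call, assuming `DataTable` is the class from `datachain/data_storage/schema.py` shown above (the module path and the column itself are illustrative):

.. code:: py

    import sqlalchemy as sa

    from datachain.data_storage.schema import DataTable  # module path assumed

    src = sa.Column("score", sa.Float, nullable=False)

    # Relax nullability only; primary_key, index, default, server_default
    # and unique still mirror the source column.
    copy = DataTable.copy_column(src, nullable=True)
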
@@ -122,6 +122,11 @@ class SQLiteDatabaseEngine(DatabaseEngine):
         engine = sqlalchemy.create_engine(
             "sqlite+pysqlite:///", creator=lambda: db, future=True
         )
+        # ensure we run SA on_connect init (e.g. it registers regexp function),
+        # also makes sure that it's consistent. Otherwise in some cases it
+        # seems we are getting different results if engine object is used in a
+        # different thread first and engine is not used in the Main thread.
+        engine.connect().close()
 
         db.isolation_level = None  # Use autocommit mode
         db.execute("PRAGMA foreign_keys = ON")
datachain/lib/dc.py CHANGED
@@ -508,7 +508,7 @@ class DataChain(DatasetQuery):
 
     def print_json_schema(  # type: ignore[override]
         self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "DataChain":
+    ) -> "Self":
         """Print JSON data model and save it. It returns the chain itself.
 
         Parameters:
@@ -533,7 +533,7 @@ class DataChain(DatasetQuery):
 
     def print_jsonl_schema(  # type: ignore[override]
         self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "DataChain":
+    ) -> "Self":
         """Print JSON data model and save it. It returns the chain itself.
 
         Parameters:
@@ -549,7 +549,7 @@ class DataChain(DatasetQuery):
 
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None
-    ) -> "DataChain":
+    ) -> "Self":
         """Save to a Dataset. It returns the chain itself.
 
         Parameters:
@@ -785,7 +785,7 @@ class DataChain(DatasetQuery):
             descending (bool): Whether to sort in descending order or not.
         """
         if descending:
-            args = tuple([sqlalchemy.desc(a) for a in args])
+            args = tuple(sqlalchemy.desc(a) for a in args)
 
         return super().order_by(*args)
 
@@ -1206,14 +1206,14 @@ class DataChain(DatasetQuery):
         """
         headers, max_length = self._effective_signals_schema.get_headers_with_length()
         if flatten or max_length < 2:
-            df = pd.DataFrame.from_records(self.to_records())
+            columns = []
             if headers:
-                df.columns = [".".join(filter(None, header)) for header in headers]
-            return df
+                columns = [".".join(filter(None, header)) for header in headers]
+            return pd.DataFrame.from_records(self.to_records(), columns=columns)
 
-        transposed_result = list(map(list, zip(*self.results())))
-        data = {tuple(n): val for n, val in zip(headers, transposed_result)}
-        return pd.DataFrame(data)
+        return pd.DataFrame(
+            self.results(), columns=pd.MultiIndex.from_tuples(map(tuple, headers))
+        )
 
     def show(
         self,
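
For non-flattened output, the rewritten branch builds the frame directly from the query results and turns the nested signal headers into a pandas MultiIndex, so nested fields stay addressable by level. A small illustration with made-up headers and a single row:

.. code:: py

    import pandas as pd

    # Illustrative headers: one tuple of signal-name parts per column.
    headers = [("file", "path"), ("file", "size"), ("meta", "id")]
    rows = [("cats/cat.1009.jpg", 53000, "1009")]

    # Same construction as the new to_pandas() branch above.
    df = pd.DataFrame(rows, columns=pd.MultiIndex.from_tuples(map(tuple, headers)))
    print(df["file"]["path"][0])  # cats/cat.1009.jpg
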
@@ -1232,6 +1232,12 @@ class DataChain(DatasetQuery):
         """
         dc = self.limit(limit) if limit > 0 else self
         df = dc.to_pandas(flatten)
+
+        if df.empty:
+            print("Empty result")
+            print(f"Columns: {list(df.columns)}")
+            return
+
         if transpose:
             df = df.T
 
@@ -1270,7 +1276,7 @@ class DataChain(DatasetQuery):
         source: bool = True,
         nrows: Optional[int] = None,
         **kwargs,
-    ) -> "DataChain":
+    ) -> "Self":
         """Generate chain from list of tabular files.
 
         Parameters:
@@ -1390,7 +1396,8 @@ class DataChain(DatasetQuery):
             dc = DataChain.from_csv("s3://mybucket/dir")
             ```
         """
-        from pyarrow.csv import ParseOptions, ReadOptions
+        from pandas.io.parsers.readers import STR_NA_VALUES
+        from pyarrow.csv import ConvertOptions, ParseOptions, ReadOptions
         from pyarrow.dataset import CsvFileFormat
 
         chain = DataChain.from_storage(path, **kwargs)
@@ -1414,7 +1421,14 @@ class DataChain(DatasetQuery):
 
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
-        format = CsvFileFormat(parse_options=parse_options, read_options=read_options)
+        convert_options = ConvertOptions(
+            strings_can_be_null=True, null_values=STR_NA_VALUES
+        )
+        format = CsvFileFormat(
+            parse_options=parse_options,
+            read_options=read_options,
+            convert_options=convert_options,
+        )
         return chain.parse_tabular(
             output=output,
             object_name=object_name,
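
The added ConvertOptions make pyarrow treat pandas' default NA markers (empty string, "NA", "NaN", and so on) as nulls while reading CSVs, instead of loading them as literal strings. A standalone sketch of just that pyarrow behaviour, outside of datachain:

.. code:: py

    import io

    import pyarrow.csv as pa_csv
    from pandas.io.parsers.readers import STR_NA_VALUES

    data = io.BytesIO(b"id,score\n1,0.9\n2,NA\n3,\n")
    table = pa_csv.read_csv(
        data,
        convert_options=pa_csv.ConvertOptions(
            strings_can_be_null=True, null_values=STR_NA_VALUES
        ),
    )
    print(table["score"])  # 0.9 followed by nulls for the NA and empty cells
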
@@ -11,12 +11,16 @@ from collections.abc import Iterator
 from typing import Any, Callable
 
 import jmespath as jsp
-from pydantic import Field, ValidationError  # noqa: F401
+from pydantic import BaseModel, ConfigDict, Field, ValidationError  # noqa: F401
 
 from datachain.lib.data_model import DataModel  # noqa: F401
 from datachain.lib.file import File
 
 
+class UserModel(BaseModel):
+    model_config = ConfigDict(populate_by_name=True)
+
+
 def generate_uuid():
     return uuid.uuid4()  # Generates a random UUID.
 
@@ -72,6 +76,8 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
         data_type,
         "--class-name",
         model_name,
+        "--base-class",
+        "datachain.lib.meta_formats.UserModel",
     ]
     try:
         result = subprocess.run(  # noqa: S603
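
Passing `--base-class datachain.lib.meta_formats.UserModel` makes every model emitted by datamodel-codegen inherit from the `UserModel` defined above, so `populate_by_name=True` applies to generated fields that carry aliases (for example JSON keys such as "class" that are not valid Python identifiers). A hedged illustration of why the option matters; the `Inference` model below is hypothetical, not actual generator output:

.. code:: py

    from pydantic import BaseModel, ConfigDict, Field

    class UserModel(BaseModel):
        model_config = ConfigDict(populate_by_name=True)

    # Hypothetical shape of a model generated with the --base-class option above.
    class Inference(UserModel):
        class_: str = Field(..., alias="class")
        confidence: float

    # populate_by_name lets instances be built by field name as well as by alias.
    print(Inference(class_="cat", confidence=0.93))
    print(Inference(**{"class": "cat", "confidence": 0.93}))
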
@@ -87,7 +93,7 @@ def read_schema(source_file, data_type="csv", expr=None, model_name=None):
     except subprocess.CalledProcessError as e:
         model_output = f"An error occurred in datamodel-codegen: {e.stderr}"
     print(f"{model_output}")
-    print("\n" + "from datachain.lib.data_model import DataModel" + "\n")
+    print("from datachain.lib.data_model import DataModel")
     print("\n" + f"DataModel.register({model_name})" + "\n")
     print("\n" + f"spec={model_name}" + "\n")
     return model_output
datachain/node.py CHANGED
@@ -47,7 +47,7 @@ class DirTypeGroup:
 @attrs.define
 class Node:
     sys__id: int = 0
-    sys__rand: int = -1
+    sys__rand: int = 0
     vtype: str = ""
     dir_type: Optional[int] = None
     path: str = ""
datachain/query/schema.py CHANGED
@@ -45,6 +45,10 @@ class Column(sa.ColumnClause, metaclass=ColumnMeta):
         """Search for matches using glob pattern matching."""
         return self.op("GLOB")(glob_str)
 
+    def regexp(self, regexp_str):
+        """Search for matches using regexp pattern matching."""
+        return self.op("REGEXP")(regexp_str)
+
 
 class UDFParameter(ABC):
     @abstractmethod
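
Together with the SQLite engine change earlier in this diff (which ensures the REGEXP function gets registered on connect), this lets regular-expression matching be used in filters alongside the existing `glob`. A sketch, reusing the demo bucket from the README; the pattern itself is illustrative:

.. code:: py

    from datachain import Column, DataChain

    chain = (
        DataChain.from_storage("gs://datachain-demo/dogs-and-cats/")
        .filter(Column("file.path").regexp(r"cat\.\d+\.jpg$"))
    )
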
@@ -1,8 +1,10 @@
 from datachain.sql.types import (
+    DBDefaults,
     TypeConverter,
     TypeDefaults,
     TypeReadConverter,
     register_backend_types,
+    register_db_defaults,
     register_type_defaults,
     register_type_read_converters,
 )
@@ -18,5 +20,6 @@ def setup() -> None:
     register_backend_types("default", TypeConverter())
     register_type_read_converters("default", TypeReadConverter())
     register_type_defaults("default", TypeDefaults())
+    register_db_defaults("default", DBDefaults())
 
     setup_is_complete = True
@@ -22,8 +22,10 @@ from datachain.sql.sqlite.types import (
     register_type_converters,
 )
 from datachain.sql.types import (
+    DBDefaults,
     TypeDefaults,
     register_backend_types,
+    register_db_defaults,
     register_type_defaults,
     register_type_read_converters,
 )
@@ -66,6 +68,7 @@ def setup():
     register_backend_types("sqlite", SQLiteTypeConverter())
     register_type_read_converters("sqlite", SQLiteTypeReadConverter())
     register_type_defaults("sqlite", TypeDefaults())
+    register_db_defaults("sqlite", DBDefaults())
 
     compiles(sql_path.parent, "sqlite")(compile_path_parent)
     compiles(sql_path.name, "sqlite")(compile_path_name)
datachain/sql/types.py CHANGED
@@ -17,6 +17,7 @@ from datetime import datetime
 from types import MappingProxyType
 from typing import Any, Union
 
+import sqlalchemy as sa
 from sqlalchemy import TypeDecorator, types
 
 _registry: dict[str, "TypeConverter"] = {}
@@ -28,6 +29,9 @@ read_converter_registry = MappingProxyType(_read_converter_registry)
 _type_defaults_registry: dict[str, "TypeDefaults"] = {}
 type_defaults_registry = MappingProxyType(_type_defaults_registry)
 
+_db_defaults_registry: dict[str, "DBDefaults"] = {}
+db_defaults_registry = MappingProxyType(_db_defaults_registry)
+
 NullType = types.NullType
 
 
@@ -43,6 +47,10 @@ def register_type_defaults(dialect_name: str, td: "TypeDefaults"):
     _type_defaults_registry[dialect_name] = td
 
 
+def register_db_defaults(dialect_name: str, dbd: "DBDefaults"):
+    _db_defaults_registry[dialect_name] = dbd
+
+
 def converter(dialect) -> "TypeConverter":
     name = dialect.name
     try:
@@ -71,6 +79,14 @@ def type_defaults(dialect) -> "TypeDefaults":
         raise ValueError(f"No type defaults registered for dialect: {name!r}") from None
 
 
+def db_defaults(dialect) -> "DBDefaults":
+    name = dialect.name
+    try:
+        return db_defaults_registry[name]
+    except KeyError:
+        raise ValueError(f"No DB defaults registered for dialect: {name!r}") from None
+
+
 class SQLType(TypeDecorator):
     impl: type[types.TypeEngine[Any]] = types.TypeEngine
     cache_ok = True
@@ -97,6 +113,10 @@ class String(SQLType):
     def default_value(dialect):
         return type_defaults(dialect).string()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).string()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).string(value)
 
@@ -115,6 +135,10 @@ class Boolean(SQLType):
     def default_value(dialect):
         return type_defaults(dialect).boolean()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).boolean()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).boolean(value)
 
@@ -133,6 +157,10 @@ class Int(SQLType):
     def default_value(dialect):
         return type_defaults(dialect).int()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).int()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).int(value)
 
@@ -145,6 +173,10 @@ class Int32(Int):
     def default_value(dialect):
         return type_defaults(dialect).int32()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).int32()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).int32(value)
 
@@ -157,6 +189,10 @@ class Int64(Int):
     def default_value(dialect):
         return type_defaults(dialect).int64()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).int64()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).int64(value)
 
@@ -169,12 +205,16 @@ class UInt64(Int):
     def default_value(dialect):
         return type_defaults(dialect).uint64()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).uint64()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).uint64(value)
 
 
 class Float(SQLType):
-    impl = types.INTEGER
+    impl = types.FLOAT
 
     @property
     def python_type(self):
@@ -187,6 +227,10 @@ class Float(SQLType):
     def default_value(dialect):
         return type_defaults(dialect).float()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).float()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).float(value)
 
@@ -199,6 +243,10 @@ class Float32(Float):
     def default_value(dialect):
         return type_defaults(dialect).float32()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).float32()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).float32(value)
 
@@ -211,6 +259,10 @@ class Float64(Float):
     def default_value(dialect):
         return type_defaults(dialect).float64()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).float64()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).float64(value)
 
@@ -247,6 +299,10 @@ class Array(SQLType):
     def default_value(dialect):
         return type_defaults(dialect).array()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).array()
+
     def on_read_convert(self, value, dialect):
         r = read_converter(dialect).array(value, self.item_type, dialect)
         if isinstance(self.item_type, JSON):
@@ -268,6 +324,10 @@ class JSON(SQLType):
     def default_value(dialect):
         return type_defaults(dialect).json()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).json()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).json(value)
 
@@ -286,6 +346,10 @@ class DateTime(SQLType):
     def default_value(dialect):
         return type_defaults(dialect).datetime()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).datetime()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).datetime(value)
 
@@ -304,6 +368,10 @@ class Binary(SQLType):
     def default_value(dialect):
         return type_defaults(dialect).binary()
 
+    @staticmethod
+    def db_default_value(dialect):
+        return db_defaults(dialect).binary()
+
     def on_read_convert(self, value, dialect):
         return read_converter(dialect).binary(value)
 
@@ -328,13 +396,17 @@ class TypeReadConverter:
         return value
 
     def float(self, value):
+        if value is None:
+            return float("nan")
+        if isinstance(value, str) and value.lower() == "nan":
+            return float("nan")
         return value
 
     def float32(self, value):
-        return value
+        return self.float(value)
 
     def float64(self, value):
-        return value
+        return self.float(value)
 
     def array(self, value, item_type, dialect):
         if value is None or item_type is None:
@@ -347,10 +419,9 @@
     def datetime(self, value):
         return value
 
-    def uuid(self, value):
-        return value
-
     def binary(self, value):
+        if isinstance(value, str):
+            return value.encode()
         return value
 
 
@@ -415,13 +486,13 @@ class TypeDefaults:
         return None
 
     def float(self):
-        return None
+        return float("nan")
 
     def float32(self):
-        return None
+        return self.float()
 
     def float64(self):
-        return None
+        return self.float()
 
     def array(self):
         return None
@@ -432,11 +503,49 @@
     def datetime(self):
         return None
 
-    def uuid(self):
+    def binary(self):
         return None
 
+
+class DBDefaults:
+    def string(self):
+        return sa.text("''")
+
+    def boolean(self):
+        return sa.text("False")
+
+    def int(self):
+        return sa.text("0")
+
+    def int32(self):
+        return self.int()
+
+    def int64(self):
+        return self.int()
+
+    def uint64(self):
+        return self.int()
+
+    def float(self):
+        return sa.text("NaN")
+
+    def float32(self):
+        return self.float()
+
+    def float64(self):
+        return self.float()
+
+    def array(self):
+        return sa.text("'[]'")
+
+    def json(self):
+        return sa.text("'{}'")
+
+    def datetime(self):
+        return sa.text("'1970-01-01 00:00:00'")
+
     def binary(self):
-        return None
+        return sa.text("''")
 
 
 TYPES = [
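
The new `DBDefaults` registry mirrors the existing `TypeDefaults` one, but returns server-side default expressions (`sa.text(...)`) per dialect, exposed through the `db_default_value` staticmethods added to each type above. A sketch of how the pieces fit together, assuming the names are importable from `datachain.sql.types` exactly as the hunks define them:

.. code:: py

    from sqlalchemy.dialects import sqlite

    from datachain.sql.types import DBDefaults, String, register_db_defaults

    # setup() in the default and sqlite backends now performs this registration.
    register_db_defaults("sqlite", DBDefaults())

    dialect = sqlite.dialect()
    print(String.db_default_value(dialect))  # '' (an empty-string server default)
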
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.1
+Version: 0.3.2
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -55,6 +55,15 @@ Requires-Dist: mkdocs-material >=9.3.1 ; extra == 'docs'
 Requires-Dist: mkdocs-section-index >=0.3.6 ; extra == 'docs'
 Requires-Dist: mkdocstrings-python >=1.6.3 ; extra == 'docs'
 Requires-Dist: mkdocs-literate-nav >=0.6.1 ; extra == 'docs'
+Provides-Extra: examples
+Requires-Dist: datachain[tests] ; extra == 'examples'
+Requires-Dist: numpy <2,>=1 ; extra == 'examples'
+Requires-Dist: defusedxml ; extra == 'examples'
+Requires-Dist: accelerate ; extra == 'examples'
+Requires-Dist: unstructured[pdf] ; extra == 'examples'
+Requires-Dist: pdfplumber ==0.11.3 ; extra == 'examples'
+Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
+Requires-Dist: nltk ==3.8.1 ; extra == 'examples'
 Provides-Extra: remote
 Requires-Dist: lz4 ; extra == 'remote'
 Requires-Dist: msgpack <2,>=1.0.4 ; extra == 'remote'
@@ -100,102 +109,78 @@ Requires-Dist: usearch ; extra == 'vector'
 AI 🔗 DataChain
 ----------------
 
-DataChain is a data-frame library designed for AI-specific scenarios. It helps ML and
-AI engineers build a metadata layer on top of unstructured files and analyze data using
-this layer.
+DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
+It is made to organize your unstructured data into datasets and wrangle it at scale on
+your local machine.
 
-📂 **Raw Files Processing**
-Process raw files (images, video, text, PDFs) directly from storage (S3, GCP, Azure,
-Local), version and update datasets.
+Key Features
+============
 
-🌟 **Metadata layer.**
-Build a metadata layer on top of files using structured sources like CSV, Parquet,
-and JSON files.
+📂 **Storage as a Source of Truth.**
+  - Process unstructured data without redundant copies: S3, GCP, Azure, and local
+    file systems.
+  - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
+  - Join files and metadata together into persistent, versioned, columnar datasets.
 
-**Metadata enrichment.**
-Enhance the metadata layer with outputs from local ML model inferences and LLM calls.
+🐍 **Python-friendly data pipelines.**
+  - Operate on Python objects and object fields.
+  - Built-in parallelization and out-of-memory compute without a need in SQL or
+    Spark jobs.
 
-🛠️ **Data Transformation.**
-Transform metadata using traditional methods like filtering, grouping, joining, and
-others.
+🧠 **Data Enrichment and Processing.**
+  - Generate metadata columns using local AI models and LLM APIs.
+  - Filter, join, and group by AI metadata. Vector similarity search.
+  - Pass datasets to Pytorch and Tensorflow, or export back into storage.
 
-🐍 **User-friendly interface.**
-Operate efficiently with familiar Python objects and object fields, eliminating the
-need for SQL.
+🚀 **Efficiency.**
+  - Parallelization, out-of-memory workloads and data caching.
+  - Vectorized operations on Python object fields: sum, count, avg, etc.
+  - Vector search on embeddings.
 
 
+Quick Start
+-----------
+
 .. code:: console
 
    $ pip install datachain
 
 
-Data Structures
-===============
-
-DataChain introduces expressive data structures tailored for AI-specific workload:
-
-- **Dataset:** Preserves the file-references and meta-information. Takes care of Python
-  object serialization, dataset versioning and difference. Operations on dataset:
-
-  - **Transformations:** traditional data-frame or SQL operations such as filtering,
-    grouping, joining.
-  - **Enrichments:** mapping, aggregating and generating using customer’s Python
-    code. This is needed to work with ML inference and LLM calls.
-
-- **Chain** is a sequence of operations on datasets. Chain executes operations in lazy
-  mode - only when needed.
-
-DataChain name comes from these major data structures: dataset and chaining.
-
+Selecting files using JSON metadata
+======================================
 
-What’s new in DataChain?
-========================
+A storage consists of images of cats and dogs (`dog.1048.jpg`, `cat.1009.jpg`),
+annotated with ground truth and model inferences in the 'json-pairs' format,
+where each image has a matching JSON file like `cat.1009.json`:
 
-The project combines multiple ideas from different areas in order to simplify AI
-use-cases and at the same time to fit it into traditional data infrastructure.
+.. code:: json
 
-- **Python-Native for AI.** Utilizes Python instead of SQL for data manipulation as the
-  native language for AI. It’s powered by `Pydantic`_ data models.
-- **Separation of CPU-GPU workloads.** Distinguishes CPU-heavy transformations (filter,
-  group_by, join) from GPU heavy enrichments (ML-inference or LLM calls). That’s mostly
-  needed for distributed computations.
-- **Resuming data processing** (in development). Introduces idempotent operations,
-  allowing data processing to resume from the last successful process file/record/batch
-  if it fails due to issues like failed LLM calls, ML inference or file download.
+   {
+       "class": "cat", "id": "1009", "num_annotators": 8,
+       "inference": {"class": "dog", "confidence": 0.68}
+   }
 
-Additional relatively new ideas:
+Example of downloading only high-confidence cat images using JSON metadata:
 
-- **Functional style data processing.** Using a functional/chaining approach to data
-  processing rather than declarative SQL, inspired by R-dplyr and some Python libraries.
-- **Data Versioning.** Treats raw files in cloud storage as the source of truth for data
-  and implements data versioning, extending ideas from DVC (developed by the same team).
 
+.. code:: py
 
-What DataChain is NOT?
-======================
-
-- **Not a database** (Postgres, MySQL). Instead, it uses databases under the hood:
-  `SQLite`_ in open-source and ClickHouse and other data warehouses for the commercial
-  version.
-- **Not a data processing tool / data warehouse** (Spark, Snowflake, Big Query) since
-  it delegates heavy data transformations to underlying data warehouses and focuses on
-  AI specific data enrichments and orchestrating all the pieces together.
-
+   from datachain import Column, DataChain
 
-Quick Start
------------
+   meta = DataChain.from_json("gs://datachain-demo/dogs-and-cats/*json", object_name="meta")
+   images = DataChain.from_storage("gs://datachain-demo/dogs-and-cats/*jpg")
 
-Data curation with a local model
-=================================
+   images_id = images.map(id=lambda file: file.path.split('.')[-2])
+   annotated = images_id.merge(meta, on="id", right_on="meta.id")
 
-We will evaluate chatbot dialogs stored as text files in Google Cloud Storage
-- 50 files total in this example.
-These dialogs involve users chatting with a bot while looking for better wireless plans.
-Our goal is to identify the successful dialogs.
+   likely_cats = annotated.filter((Column("meta.inference.confidence") > 0.93) \
+                          & (Column("meta.inference.class_") == "cat"))
+   likely_cats.export_files("high-confidence-cats/", signal="file")
 
-The data used in the examples is `publicly available`_. The sample code is designed to run on a local machine.
-
-First, we'll show batch inference with a simple sentiment model using the `transformers` library:
+Data curation with a local AI model
+===================================
+Batch inference with a simple sentiment model using the `transformers` library:
 
 .. code:: shell
 
@@ -246,30 +231,30 @@ LLM judging chatbots
 =============================
 
 LLMs can work as efficient universal classifiers. In the example below,
-we employ a free API from Mistral to judge the chatbot performance. Please get a free
+we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
 Mistral API key at https://console.mistral.ai
 
+
 .. code:: shell
 
-    $ pip install mistralai
+    $ pip install mistralai (Requires version >=1.0.0)
     $ export MISTRAL_API_KEY=_your_key_
 
 DataChain can parallelize API calls; the free Mistral tier supports up to 4 requests at the same time.
 
 .. code:: py
 
-    from mistralai.client import MistralClient
-    from mistralai.models.chat_completion import ChatMessage
+    from mistralai import Mistral
     from datachain import File, DataChain, Column
 
    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
 
    def eval_dialogue(file: File) -> bool:
-        client = MistralClient()
-        response = client.chat(
+        client = Mistral()
+        response = client.chat.complete(
            model="open-mixtral-8x22b",
-            messages=[ChatMessage(role="system", content=PROMPT),
-                      ChatMessage(role="user", content=file.read())])
+            messages=[{"role": "system", "content": PROMPT},
+                      {"role": "user", "content": file.read()}])
        result = response.choices[0].message.content
        return result.lower().startswith("success")
 
@@ -309,8 +294,8 @@ Instead of extracting this information from the Mistral response data structure
 
 .. code:: py
 
-    from mistralai.client import MistralClient
-    from mistralai.models.chat_completion import ChatMessage, ChatCompletionResponse
+    from mistralai import Mistral
+    from mistralai.models import ChatCompletionResponse
     from datachain import File, DataChain, Column
 
    PROMPT = "Was this dialog successful? Answer in a single word: Success or Failure."
@@ -319,8 +304,8 @@ Instead of extracting this information from the Mistral response data structure
        client = MistralClient()
        return client.chat(
            model="open-mixtral-8x22b",
-            messages=[ChatMessage(role="system", content=PROMPT),
-                      ChatMessage(role="user", content=file.read())])
+            messages=[{"role": "system", "content": PROMPT},
+                      {"role": "user", "content": file.read()}])
 
    chain = (
        DataChain.from_storage("gs://datachain-demo/chatbot-KiT/", object_name="file")
@@ -438,7 +423,10 @@ Tutorials
 ---------
 
 * `Getting Started`_
-* `Multimodal <examples/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain/blob/main/examples/multimodal/clip_fine_tuning.ipynb>`__)
+* `Multimodal <https://github.com/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/multimodal/clip_fine_tuning.ipynb>`__)
+* `LLM evaluations <https://github.com/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/llm/llm_chatbot_evaluation.ipynb>`__)
+* `Reading JSON metadata <https://github.com/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`_ (try in `Colab <https://colab.research.google.com/github/iterative/datachain-examples/blob/main/formats/json-metadata-tutorial.ipynb>`__)
+
 
 Contributions
 -------------
@@ -9,7 +9,7 @@ datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
 datachain/error.py,sha256=GY9KYTmb7GHXn2gGHV9X-PBhgwLj3i7VpK7tGHtAoGM,1279
 datachain/job.py,sha256=bk25bIqClhgRPzlXAhxpTtDeewibQe5l3S8Cf7db0gM,1229
 datachain/listing.py,sha256=keLkvPfumDA3gijeIiinH5yGWe71qCxgF5HqqP5AeH4,8299
-datachain/node.py,sha256=frxZWoEvqUvk9pyXmVaeiNCs3W-xjC_sENmUD11V06Q,6006
+datachain/node.py,sha256=ihrP5l9HKpXLR0fR1wyb7QIdb7NR26dX6bB09qGX5B4,6005
 datachain/nodes_fetcher.py,sha256=kca19yvu11JxoVY1t4_ydp1FmchiV88GnNicNBQ9NIA,831
 datachain/nodes_thread_pool.py,sha256=ZyzBvUImIPmi4WlKC2SW2msA0UhtembbTdcs2nx29A0,3191
 datachain/progress.py,sha256=7_8FtJs770ITK9sMq-Lt4k4k18QmYl4yIG_kCoWID3o,4559
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=9-7SnMjh5ruH9sdKDo8P5EklX9oC2EHH6bnku6ZqLko,80275
+datachain/catalog/catalog.py,sha256=9fxRJjiM8tK3ZePHFErYqY6LkJFA6bvlp-KHq-_kSYk,80703
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -33,19 +33,19 @@ datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kT
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
 datachain/data_storage/metastore.py,sha256=nxcY6nwyEmQWMAo33sNGO-FgUFQs2amBGGnZz2ftEz0,55362
-datachain/data_storage/schema.py,sha256=Idi-29fckvZozzvkyz3nTR2FOIajPlSuPdIEO7SMvXM,7863
+datachain/data_storage/schema.py,sha256=GwJIHkjhrnBxJAV1WvCMM8jiJN5h79LXDyzMmUDtRw0,8523
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=0r6L_a2hdGRoR_gl06v1qWhEFOS_Q31aldHyk07Yx-M,26857
+datachain/data_storage/sqlite.py,sha256=IKd4epEjVxAoQQHsE7WTY4kgOiFyUiWhvaGm-61rJfg,27218
 datachain/data_storage/warehouse.py,sha256=MXYkUG69UK2wbIFsZFvT7rKzXlnSitDMp3Vzj_IIsnA,33089
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=R8wDUDEa-5hYjI3HW9cqvOYYJpeeah5lbhFIL3gkmcE,4915
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
 datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=e24ecfIcypVkmVBqvr-p06zpwrw7GD20gy1gBJQPT-I,58012
+datachain/lib/dc.py,sha256=Q9HL7Axfo9i5hodlkD2GwklN4i0BVULm9_A11ckuj2A,58352
 datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
-datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
+datachain/lib/meta_formats.py,sha256=Hels85LJmNCz1aYVJvhymNdAt3qdJ2-qoxsIiUezrow,7198
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
@@ -71,17 +71,17 @@ datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,1
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
-datachain/query/schema.py,sha256=O3mTM5DRjvRAJCI7O9mR8wOdFJbgI1jIjvtfl5YvjI4,7755
+datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
 datachain/query/session.py,sha256=qTzkXgwMJdJhal3rVt3hdv3x1EXT1IHuXcwkC-Ex0As,4111
 datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
 datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
-datachain/sql/types.py,sha256=SShudhdIpdfTKDxWDDqOajYRkTCkIgQbilA94g4i-4E,10389
+datachain/sql/types.py,sha256=1MFvECB_5A6QwQKKY3VPhvitgKDlc2aB7iBjY4hv1_s,13034
 datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
 datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
-datachain/sql/default/base.py,sha256=h44005q3qtMc9cjWmRufWwcBr5CfK_dnvG4IrcSQs_8,536
+datachain/sql/default/base.py,sha256=QD-31C6JnyOXzogyDx90sUhm7QvgXIYpeHEASH84igU,628
 datachain/sql/functions/__init__.py,sha256=Ioyy7nSetrTLVnHGcGcmZU99HxUFcx-5PFbrh2dPNH0,396
 datachain/sql/functions/array.py,sha256=EB7nJSncUc1PuxlHyzU2gVhF8DuXaxpGlxb5e8X2KFY,1297
 datachain/sql/functions/conditional.py,sha256=q7YUKfunXeEldXaxgT-p5pUTcOEVU_tcQ2BJlquTRPs,207
@@ -89,13 +89,13 @@ datachain/sql/functions/path.py,sha256=zixpERotTFP6LZ7I4TiGtyRA8kXOoZmH1yzH9oRW0
 datachain/sql/functions/random.py,sha256=vBwEEj98VH4LjWixUCygQ5Bz1mv1nohsCG0-ZTELlVg,271
 datachain/sql/functions/string.py,sha256=hIrF1fTvlPamDtm8UMnWDcnGfbbjCsHxZXS30U2Rzxo,651
 datachain/sql/sqlite/__init__.py,sha256=TAdJX0Bg28XdqPO-QwUVKy8rg78cgMileHvMNot7d04,166
-datachain/sql/sqlite/base.py,sha256=LBYmXqXsVF30fbcnR55evCZHbPDCzMdGk_ogPLps63s,12236
+datachain/sql/sqlite/base.py,sha256=w6HbEkGdmNGDnDY3_75E-wDb6qNskVpq0qbHGADsERk,12327
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.1.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.1.dist-info/METADATA,sha256=qR3OMpGUkx0cKelnl51d9uksn5H-Wn4LvTJbUnTMDuQ,17268
-datachain-0.3.1.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-datachain-0.3.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.1.dist-info/RECORD,,
+datachain-0.3.2.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.2.dist-info/METADATA,sha256=i8evXYMe4FgBqxV7TYdWTRuh7MxRT6jfqmzL-tbk_JQ,16789
+datachain-0.3.2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
+datachain-0.3.2.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.2.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.2.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (72.1.0)
+Generator: setuptools (72.2.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 