datachain 0.3.8__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of datachain might be problematic. See the package registry's advisory page for more details.

@@ -1560,17 +1560,8 @@ class Catalog:
1560
1560
  version = self.get_dataset(dataset_name).get_version(dataset_version)
1561
1561
 
1562
1562
  file_signals_values = {}
1563
- file_schemas = {}
1564
- # TODO: To remove after we properly fix deserialization
1565
- for signal, type_name in version.feature_schema.items():
1566
- from datachain.lib.model_store import ModelStore
1567
1563
 
1568
- type_name_parsed, v = ModelStore.parse_name_version(type_name)
1569
- fr = ModelStore.get(type_name_parsed, v)
1570
- if fr and issubclass(fr, File):
1571
- file_schemas[signal] = type_name
1572
-
1573
- schema = SignalSchema.deserialize(file_schemas)
1564
+ schema = SignalSchema.deserialize(version.feature_schema)
1574
1565
  for file_signals in schema.get_signals(File):
1575
1566
  prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
1576
1567
  file_signals_values[file_signals] = {
@@ -1916,7 +1907,7 @@ class Catalog:
1916
1907
  """
1917
1908
  from datachain.query.dataset import ExecutionResult
1918
1909
 
1919
- feature_file = tempfile.NamedTemporaryFile(
1910
+ feature_file = tempfile.NamedTemporaryFile( # noqa: SIM115
1920
1911
  dir=os.getcwd(), suffix=".py", delete=False
1921
1912
  )
1922
1913
  _, feature_module = os.path.split(feature_file.name)
datachain/lib/arrow.py CHANGED
@@ -131,7 +131,7 @@ def arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
131
131
 
132
132
 
133
133
  def _nrows_file(file: File, nrows: int) -> str:
134
- tf = NamedTemporaryFile(delete=False)
134
+ tf = NamedTemporaryFile(delete=False) # noqa: SIM115
135
135
  with file.open(mode="r") as reader:
136
136
  with open(tf.name, "a") as writer:
137
137
  for row, line in enumerate(reader):
datachain/lib/dc.py CHANGED
@@ -1153,17 +1153,35 @@ class DataChain(DatasetQuery):
1153
1153
  self,
1154
1154
  other: "DataChain",
1155
1155
  on: Optional[Union[str, Sequence[str]]] = None,
1156
+ right_on: Optional[Union[str, Sequence[str]]] = None,
1156
1157
  ) -> "Self":
1157
1158
  """Remove rows that appear in another chain.
1158
1159
 
1159
1160
  Parameters:
1160
1161
  other: chain whose rows will be removed from `self`
1161
- on: columns to consider for determining row equality. If unspecified,
1162
- defaults to all common columns between `self` and `other`.
1162
+ on: columns to consider for determining row equality in `self`.
1163
+ If unspecified, defaults to all common columns
1164
+ between `self` and `other`.
1165
+ right_on: columns to consider for determining row equality in `other`.
1166
+ If unspecified, defaults to the same values as `on`.
1163
1167
  """
1164
1168
  if isinstance(on, str):
1169
+ if not on:
1170
+ raise DataChainParamsError("'on' cannot be an empty string")
1165
1171
  on = [on]
1166
- if on is None:
1172
+ elif isinstance(on, Sequence):
1173
+ if not on or any(not col for col in on):
1174
+ raise DataChainParamsError("'on' cannot contain empty strings")
1175
+
1176
+ if isinstance(right_on, str):
1177
+ if not right_on:
1178
+ raise DataChainParamsError("'right_on' cannot be an empty string")
1179
+ right_on = [right_on]
1180
+ elif isinstance(right_on, Sequence):
1181
+ if not right_on or any(not col for col in right_on):
1182
+ raise DataChainParamsError("'right_on' cannot contain empty strings")
1183
+
1184
+ if on is None and right_on is None:
1167
1185
  other_columns = set(other._effective_signals_schema.db_signals())
1168
1186
  signals = [
1169
1187
  c
@@ -1172,16 +1190,29 @@ class DataChain(DatasetQuery):
1172
1190
  ]
1173
1191
  if not signals:
1174
1192
  raise DataChainParamsError("subtract(): no common columns")
1175
- elif not isinstance(on, Sequence):
1176
- raise TypeError(
1177
- f"'on' must be 'str' or 'Sequence' object but got type '{type(on)}'",
1178
- )
1179
- elif not on:
1193
+ elif on is not None and right_on is None:
1194
+ right_on = on
1195
+ signals = list(self.signals_schema.resolve(*on).db_signals())
1196
+ elif on is None and right_on is not None:
1180
1197
  raise DataChainParamsError(
1181
- "'on' cannot be empty",
1198
+ "'on' must be specified when 'right_on' is provided"
1182
1199
  )
1183
1200
  else:
1184
- signals = self.signals_schema.resolve(*on).db_signals() # type: ignore[assignment]
1201
+ if not isinstance(on, Sequence) or not isinstance(right_on, Sequence):
1202
+ raise TypeError(
1203
+ "'on' and 'right_on' must be 'str' or 'Sequence' object"
1204
+ )
1205
+ if len(on) != len(right_on):
1206
+ raise DataChainParamsError(
1207
+ "'on' and 'right_on' must have the same length"
1208
+ )
1209
+ signals = list(
1210
+ zip(
1211
+ self.signals_schema.resolve(*on).db_signals(),
1212
+ other.signals_schema.resolve(*right_on).db_signals(),
1213
+ ) # type: ignore[arg-type]
1214
+ )
1215
+
1185
1216
  return super()._subtract(other, signals) # type: ignore[arg-type]
1186
1217
 
1187
1218
  @classmethod
@@ -222,7 +222,7 @@ class TarStream(File):
222
222
  self._tar = None
223
223
 
224
224
  def open(self):
225
- self._tar = tarfile.open(fileobj=super().open())
225
+ self._tar = tarfile.open(fileobj=super().open()) # noqa: SIM115
226
226
  return self
227
227
 
228
228
  def getmembers(self) -> list[tarfile.TarInfo]:
@@ -296,15 +296,23 @@ class DatasetDiffOperation(Step):
296
296
 
297
297
  @frozen
298
298
  class Subtract(DatasetDiffOperation):
299
- on: Sequence[str]
299
+ on: Sequence[tuple[str, str]]
300
300
 
301
301
  def query(self, source_query: Select, target_query: Select) -> sa.Selectable:
302
302
  sq = source_query.alias("source_query")
303
303
  tq = target_query.alias("target_query")
304
304
  where_clause = sa.and_(
305
- getattr(sq.c, col_name).is_not_distinct_from(getattr(tq.c, col_name))
306
- for col_name in self.on
307
- ) # type: ignore[arg-type]
305
+ *[
306
+ getattr(
307
+ sq.c, col_name[0] if isinstance(col_name, tuple) else col_name
308
+ ).is_not_distinct_from(
309
+ getattr(
310
+ tq.c, col_name[1] if isinstance(col_name, tuple) else col_name
311
+ )
312
+ )
313
+ for col_name in self.on
314
+ ]
315
+ )
308
316
  return sq.select().except_(sq.select().where(where_clause))
309
317
 
310
318
 
@@ -1571,10 +1579,10 @@ class DatasetQuery:
1571
1579
 
1572
1580
  @detach
1573
1581
  def subtract(self, dq: "DatasetQuery") -> "Self":
1574
- return self._subtract(dq, on=["source", "path"])
1582
+ return self._subtract(dq, on=[("source", "source"), ("path", "path")])
1575
1583
 
1576
1584
  @detach
1577
- def _subtract(self, dq: "DatasetQuery", on: Sequence[str]) -> "Self":
1585
+ def _subtract(self, dq: "DatasetQuery", on: Sequence[tuple[str, str]]) -> "Self":
1578
1586
  query = self.clone()
1579
1587
  query.steps.append(Subtract(dq, self.catalog, on=on))
1580
1588
  return query
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.8
3
+ Version: 0.3.9
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -115,31 +115,30 @@ AI 🔗 DataChain
115
115
 
116
116
  DataChain is a modern Pythonic data-frame library designed for artificial intelligence.
117
117
  It is made to organize your unstructured data into datasets and wrangle it at scale on
118
- your local machine.
118
+ your local machine. Datachain does not abstract or hide the AI models and API calls, but helps to integrate them into the postmodern data stack.
119
119
 
120
120
  Key Features
121
121
  ============
122
122
 
123
123
  📂 **Storage as a Source of Truth.**
124
- - Process unstructured data without redundant copies: S3, GCP, Azure, and local
124
+ - Process unstructured data without redundant copies from S3, GCP, Azure, and local
125
125
  file systems.
126
- - Multimodal data: images, video, text, PDFs, JSONs, CSVs, parquet.
127
- - Join files and metadata together into persistent, versioned, columnar datasets.
126
+ - Multimodal data support: images, video, text, PDFs, JSONs, CSVs, parquet.
127
+ - Unite files and metadata together into persistent, versioned, columnar datasets.
128
128
 
129
129
  🐍 **Python-friendly data pipelines.**
130
130
  - Operate on Python objects and object fields.
131
- - Built-in parallelization and out-of-memory compute without a need in SQL or
132
- Spark jobs.
131
+ - Built-in parallelization and out-of-memory compute without SQL or Spark.
133
132
 
134
133
  🧠 **Data Enrichment and Processing.**
135
- - Generate metadata columns using local AI models and LLM APIs.
136
- - Filter, join, and group by AI metadata. Vector similarity search.
137
- - Pass datasets to Pytorch and Tensorflow, or export back into storage.
134
+ - Generate metadata using local AI models and LLM APIs.
135
+ - Filter, join, and group by metadata. Search by vector embeddings.
136
+ - Pass datasets to Pytorch and Tensorflow, or export them back into storage.
138
137
 
139
138
  🚀 **Efficiency.**
140
139
  - Parallelization, out-of-memory workloads and data caching.
141
140
  - Vectorized operations on Python object fields: sum, count, avg, etc.
142
- - Vector search on embeddings.
141
+ - Optimized vector search.
143
142
 
144
143
 
145
144
  Quick Start
@@ -164,7 +163,7 @@ where each image has a matching JSON file like `cat.1009.json`:
164
163
  "inference": {"class": "dog", "confidence": 0.68}
165
164
  }
166
165
 
167
- Example of downloading only high-confidence cat images using JSON metadata:
166
+ Example of downloading only "high-confidence cat" inferred images using JSON metadata:
168
167
 
169
168
 
170
169
  .. code:: py
@@ -234,7 +233,7 @@ detected are then copied to the local directory.
234
233
  LLM judging chatbots
235
234
  =============================
236
235
 
237
- LLMs can work as efficient universal classifiers. In the example below,
236
+ LLMs can work as universal classifiers. In the example below,
238
237
  we employ a free API from Mistral to judge the `publicly available`_ chatbot dialogs. Please get a free
239
238
  Mistral API key at https://console.mistral.ai
240
239
 
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
18
18
  datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
19
19
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
20
- datachain/catalog/catalog.py,sha256=6S4AnDos4sGYGhy4wNSyV2pKPQNXvo819cd3Dl8Htgg,78271
20
+ datachain/catalog/catalog.py,sha256=kGpp9IEyr1YS7QFWjLYprRT1gp7freyt-WLaLNzqUZg,77859
21
21
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
22
22
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
23
23
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -38,11 +38,11 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
38
38
  datachain/data_storage/sqlite.py,sha256=jLgkvikYkENQUO_ykoNFfsBc2ofZXwFHLMa1nyWP3aw,28316
39
39
  datachain/data_storage/warehouse.py,sha256=cvlfa-nyIxqrrpSRtCdeVjlTwhn7rcIoWjOq91HhItU,33668
40
40
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
- datachain/lib/arrow.py,sha256=W8bIxMIe_b3dqMFYKGWmfbC_7Xe0gV3UiJjQ2i4EYLA,4925
41
+ datachain/lib/arrow.py,sha256=17-jHLdYhsSdO5kfKWpBS5OAWbMjNi5r8ao0zGXUBoA,4941
42
42
  datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
43
43
  datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
44
44
  datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
45
- datachain/lib/dc.py,sha256=wdMzFLglOhwWKHwh4qcLA0ezMrjuRJq2il2WnkHjyag,62490
45
+ datachain/lib/dc.py,sha256=tY_ccOsv9njsXF23cwoZ7tSTCDKCfakyRvsIBLKE0SE,63976
46
46
  datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
47
47
  datachain/lib/hf.py,sha256=mYaHFPS4CW2-stRZHBMWW-NKN4dhrnhjZobBgRocnvo,5317
48
48
  datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
@@ -57,7 +57,7 @@ datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
57
57
  datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
58
58
  datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
59
59
  datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
60
- datachain/lib/webdataset.py,sha256=SsjCKLSKEkHRRfeTHQhjoGqNPqIWw_SCWQcUwgUWWP0,8282
60
+ datachain/lib/webdataset.py,sha256=Q3UlCk66341sq-nvFbBCX4Cv3cYXBK9n12ejG4axPXE,8298
61
61
  datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
62
62
  datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
63
63
  datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
@@ -68,7 +68,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMND
68
68
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
69
69
  datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
70
70
  datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
71
- datachain/query/dataset.py,sha256=G6xA3ItIGUJTXhizdAb6S3L1zFwTf8I0w0jHa1A6F4A,61103
71
+ datachain/query/dataset.py,sha256=mHqSyovJlCQ7pKVMQKKKCiTJs3bP1GDXLKpOSpzVxx8,61378
72
72
  datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
73
73
  datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
74
74
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -95,9 +95,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
95
95
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
96
96
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
97
97
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
98
- datachain-0.3.8.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
99
- datachain-0.3.8.dist-info/METADATA,sha256=ivteXQrJgp8dKgIO2pdwUj6Qdg96rbI3Gq0kx5fyxtk,16903
100
- datachain-0.3.8.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
101
- datachain-0.3.8.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
102
- datachain-0.3.8.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
103
- datachain-0.3.8.dist-info/RECORD,,
98
+ datachain-0.3.9.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
99
+ datachain-0.3.9.dist-info/METADATA,sha256=r5uNlVdal7YrsX7nYE56c_Ak8YZIgXqCiSwNJF5KjlY,17015
100
+ datachain-0.3.9.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
101
+ datachain-0.3.9.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
102
+ datachain-0.3.9.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
103
+ datachain-0.3.9.dist-info/RECORD,,