datachain 0.2.15__py3-none-any.whl → 0.2.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -1217,16 +1217,14 @@ class Catalog:
1217
1217
  def get_temp_table_names(self) -> list[str]:
1218
1218
  return self.warehouse.get_temp_table_names()
1219
1219
 
1220
- def cleanup_temp_tables(self, names: Iterable[str]) -> None:
1220
+ def cleanup_tables(self, names: Iterable[str]) -> None:
1221
1221
  """
1222
- Drop tables created temporarily when processing datasets.
1222
+ Drop tables passed.
1223
1223
 
1224
- This should be implemented even if temporary tables are used to
1225
- ensure that they are cleaned up as soon as they are no longer
1226
- needed. When running the same `DatasetQuery` multiple times we
1227
- may use the same temporary table names.
1224
+ This should be implemented to ensure that the provided tables
1225
+ are cleaned up as soon as they are no longer needed.
1228
1226
  """
1229
- self.warehouse.cleanup_temp_tables(names)
1227
+ self.warehouse.cleanup_tables(names)
1230
1228
  self.id_generator.delete_uris(names)
1231
1229
 
1232
1230
  def create_dataset_from_sources(
datachain/cli.py CHANGED
@@ -910,7 +910,7 @@ def garbage_collect(catalog: "Catalog"):
910
910
  print("Nothing to clean up.")
911
911
  else:
912
912
  print(f"Garbage collecting {len(temp_tables)} tables.")
913
- catalog.cleanup_temp_tables(temp_tables)
913
+ catalog.cleanup_tables(temp_tables)
914
914
 
915
915
 
916
916
  def completion(shell: str) -> str:
@@ -97,7 +97,7 @@ class AbstractMetastore(ABC, Serializable):
97
97
  def close(self) -> None:
98
98
  """Closes any active database or HTTP connections."""
99
99
 
100
- def cleanup_temp_tables(self, temp_table_names: list[str]) -> None:
100
+ def cleanup_tables(self, temp_table_names: list[str]) -> None:
101
101
  """Cleanup temp tables."""
102
102
 
103
103
  def cleanup_for_tests(self) -> None:
@@ -457,7 +457,7 @@ class AbstractDBMetastore(AbstractMetastore):
457
457
  """Closes any active database connections."""
458
458
  self.db.close()
459
459
 
460
- def cleanup_temp_tables(self, temp_table_names: list[str]) -> None:
460
+ def cleanup_tables(self, temp_table_names: list[str]) -> None:
461
461
  """Cleanup temp tables."""
462
462
  self.id_generator.delete_uris(temp_table_names)
463
463
 
@@ -915,14 +915,12 @@ class AbstractWarehouse(ABC, Serializable):
915
915
  if self.is_temp_table_name(t)
916
916
  ]
917
917
 
918
- def cleanup_temp_tables(self, names: Iterable[str]) -> None:
918
+ def cleanup_tables(self, names: Iterable[str]) -> None:
919
919
  """
920
- Drop tables created temporarily when processing datasets.
920
+ Drop tables passed.
921
921
 
922
- This should be implemented even if temporary tables are used to
923
- ensure that they are cleaned up as soon as they are no longer
924
- needed. When running the same `DatasetQuery` multiple times we
925
- may use the same temporary table names.
922
+ This should be implemented to ensure that the provided tables
923
+ are cleaned up as soon as they are no longer needed.
926
924
  """
927
925
  for name in names:
928
926
  self.db.drop_table(Table(name, self.db.metadata), if_exists=True)
datachain/lib/dc.py CHANGED
@@ -193,8 +193,6 @@ class DataChain(DatasetQuery):
193
193
  ```
194
194
  """
195
195
 
196
- max_row_count: Optional[int] = None
197
-
198
196
  DEFAULT_FILE_RECORD: ClassVar[dict] = {
199
197
  "source": "",
200
198
  "name": "",
@@ -1124,7 +1122,7 @@ class DataChain(DatasetQuery):
1124
1122
  def _func_fr() -> Iterator[tuple_type]: # type: ignore[valid-type]
1125
1123
  yield from tuples
1126
1124
 
1127
- chain = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD, session=session)
1125
+ chain = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD, session=session)
1128
1126
  if object_name:
1129
1127
  output = {object_name: DataChain._dict_to_data_model(object_name, output)} # type: ignore[arg-type]
1130
1128
  return chain.gen(_func_fr, output=output)
@@ -1441,13 +1439,14 @@ class DataChain(DatasetQuery):
1441
1439
  )
1442
1440
 
1443
1441
  @classmethod
1444
- def create_empty(
1442
+ def from_records(
1445
1443
  cls,
1446
1444
  to_insert: Optional[Union[dict, list[dict]]],
1447
1445
  session: Optional[Session] = None,
1448
1446
  ) -> "DataChain":
1449
- """Create empty chain. Returns a chain. This method is used for programmatically
1450
- generating a chains in contrast of reading data from storages or other sources.
1447
+ """Create a DataChain from the provided records. This method can be used for
1448
+ programmatically generating a chain in contrast of reading data from storages
1449
+ or other sources.
1451
1450
 
1452
1451
  Parameters:
1453
1452
  to_insert : records (or a single record) to insert. Each record is
@@ -1455,8 +1454,8 @@ class DataChain(DatasetQuery):
1455
1454
 
1456
1455
  Example:
1457
1456
  ```py
1458
- empty = DataChain.create_empty()
1459
- single_record = DataChain.create_empty(DataChain.DEFAULT_FILE_RECORD)
1457
+ empty = DataChain.from_records()
1458
+ single_record = DataChain.from_records(DataChain.DEFAULT_FILE_RECORD)
1460
1459
  ```
1461
1460
  """
1462
1461
  session = Session.get(session)
@@ -1602,18 +1601,7 @@ class DataChain(DatasetQuery):
1602
1601
  @detach
1603
1602
  def limit(self, n: int) -> "Self":
1604
1603
  """Return the first n rows of the chain."""
1605
- n = max(n, 0)
1606
-
1607
- if self.max_row_count is None:
1608
- self.max_row_count = n
1609
- return super().limit(n)
1610
-
1611
- limit = min(n, self.max_row_count)
1612
- if limit == self.max_row_count:
1613
- return self
1614
-
1615
- self.max_row_count = limit
1616
- return super().limit(self.max_row_count)
1604
+ return super().limit(n)
1617
1605
 
1618
1606
  @detach
1619
1607
  def offset(self, offset: int) -> "Self":
@@ -243,8 +243,11 @@ class SignalSchema:
243
243
  curr_type = None
244
244
  i = 0
245
245
  while curr_tree is not None and i < len(path):
246
- if val := curr_tree.get(path[i], None):
246
+ if val := curr_tree.get(path[i]):
247
247
  curr_type, curr_tree = val
248
+ elif i == 0 and len(path) > 1 and (val := curr_tree.get(".".join(path))):
249
+ curr_type, curr_tree = val
250
+ break
248
251
  else:
249
252
  curr_type = None
250
253
  i += 1
@@ -1201,10 +1201,10 @@ class DatasetQuery:
1201
1201
  # implementations, as errors may close or render unusable the existing
1202
1202
  # connections.
1203
1203
  metastore = self.catalog.metastore.clone(use_new_connection=True)
1204
- metastore.cleanup_temp_tables(self.temp_table_names)
1204
+ metastore.cleanup_tables(self.temp_table_names)
1205
1205
  metastore.close()
1206
1206
  warehouse = self.catalog.warehouse.clone(use_new_connection=True)
1207
- warehouse.cleanup_temp_tables(self.temp_table_names)
1207
+ warehouse.cleanup_tables(self.temp_table_names)
1208
1208
  warehouse.close()
1209
1209
  self.temp_table_names = []
1210
1210
 
@@ -1383,6 +1383,9 @@ class DatasetQuery:
1383
1383
  @detach
1384
1384
  def limit(self, n: int) -> "Self":
1385
1385
  query = self.clone(new_table=False)
1386
+ for step in query.steps:
1387
+ if isinstance(step, SQLLimit) and step.n < n:
1388
+ return query
1386
1389
  query.steps.append(SQLLimit(n))
1387
1390
  return query
1388
1391
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.2.15
3
+ Version: 0.2.16
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
2
2
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
3
3
  datachain/asyn.py,sha256=CKCFQJ0CbB3r04S7mUTXxriKzPnOvdUaVPXjM8vCtJw,7644
4
4
  datachain/cache.py,sha256=N6PCEFJlWRpq7f_zeBNoaURFCJFAV7ibsLJqyiMHbBg,4207
5
- datachain/cli.py,sha256=Twb6BXjNxfAAGj42dUOJ7Ah5etkrTDVfMzAmINWUSOI,33104
5
+ datachain/cli.py,sha256=DbmI1sXs7-KCQz6RdLE_JAp3XO3yrTSRJ71LdUzx-XE,33099
6
6
  datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
7
7
  datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
8
8
  datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
18
18
  datachain/utils.py,sha256=kgH5NPj47eC_KrFTd6ZS206lKVhnJVFt5XsqkK6ppTc,12483
19
19
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
20
- datachain/catalog/catalog.py,sha256=ab-PLPa9CMeHCo9asHjkqw4mZ6tHM4x8bsswfMtr65w,80575
20
+ datachain/catalog/catalog.py,sha256=LZo9pIgi_HOUWpxX1c7RMt5OnrlDHXx2YpL5oP8X0kk,80397
21
21
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
22
22
  datachain/catalog/loader.py,sha256=GJ8zhEYkC7TuaPzCsjJQ4LtTdECu-wwYzC12MikPOMQ,7307
23
23
  datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -32,24 +32,24 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
32
32
  datachain/data_storage/db_engine.py,sha256=rgBuqJ-M1j5QyqiUQuJRewctuvRRj8LBDL54-aPEFxE,3287
33
33
  datachain/data_storage/id_generator.py,sha256=VlDALKijggegAnNMJwuMETJgnLoPYxpkrkld5DNTPQw,3839
34
34
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
35
- datachain/data_storage/metastore.py,sha256=wVcT8MiSH_paWEXN6eZ8Z3msrHY6vWtVFTH5kwHteRE,54852
35
+ datachain/data_storage/metastore.py,sha256=ody-hWyrisGuNlzy24bc7QBqPXWIg64NcucIhZYronk,54842
36
36
  datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
37
37
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
38
38
  datachain/data_storage/sqlite.py,sha256=w0d_cZ2u9LpQYFFXll22mnxHaxPOoJdHlsKAZmONQpA,25605
39
- datachain/data_storage/warehouse.py,sha256=WGHWBuBmNmK-qHwhvMfAwtXZ-fQKwk8w1dadN_4dugA,33293
39
+ datachain/data_storage/warehouse.py,sha256=3iD946WXgGxohZ5lagmwydFZr7j7RceZW423QXU_7_U,33120
40
40
  datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
41
41
  datachain/lib/arrow.py,sha256=9C5AVH6tLo9hwzav-1tLLnmWP-3_SReYCOfcOC54pu0,4437
42
42
  datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
43
43
  datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
44
44
  datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
45
- datachain/lib/dc.py,sha256=alJwK7z5JoUmGc1Kj74dGtlH2MJ0jeSyS2dnInemnnA,56386
45
+ datachain/lib/dc.py,sha256=6RtwA7MC3hosxi9RBgpOXjkv46SdN99g9N_u4mCDUUo,56071
46
46
  datachain/lib/file.py,sha256=n9GBmZ1CjzDjHkbUBsUrs8JOJrAoh3MV2Cc8hBkex20,11957
47
47
  datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
48
48
  datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
49
49
  datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
50
50
  datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
51
51
  datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
52
- datachain/lib/signal_schema.py,sha256=lKGlpRRUHOUFLcpk-pLQd9kGAJ8FPy0Q2bk--UlVemU,14559
52
+ datachain/lib/signal_schema.py,sha256=XQTINSN_FJK76Jn8qd03g6J0cum58knP8U7Iuw-zKyU,14704
53
53
  datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
54
54
  datachain/lib/udf.py,sha256=IjuDt2B8E3xEHhcJnaK_ZhmivdrOYPXz5uf7ylpktws,11815
55
55
  datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
@@ -66,7 +66,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffO
66
66
  datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
67
67
  datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
68
68
  datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
69
- datachain/query/dataset.py,sha256=PJFVasYhCU0XvF7OrbxlAHLdm_PnhIQBp3TUDVHNHVY,60054
69
+ datachain/query/dataset.py,sha256=iTz3c5nJ-WmoQ5zcvKGT9ly6xVKJtD_fk76LA7zecWk,60164
70
70
  datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
71
71
  datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
72
72
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -92,9 +92,9 @@ datachain/sql/sqlite/base.py,sha256=Jb1csbIARjEvwbylnvgNA7ChozSyoL3CQzOGBUf8QAw,
92
92
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
93
93
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
94
94
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
95
- datachain-0.2.15.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
- datachain-0.2.15.dist-info/METADATA,sha256=kKdEsDFle6KQ55q9RlWsAd6DUTgAg40A8L5YWE9fbMg,14577
97
- datachain-0.2.15.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
98
- datachain-0.2.15.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
- datachain-0.2.15.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
- datachain-0.2.15.dist-info/RECORD,,
95
+ datachain-0.2.16.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
96
+ datachain-0.2.16.dist-info/METADATA,sha256=1f326fK-ZnS0nPvETuUj9PaI4R5SatpGVDIsQiJ0OvM,14577
97
+ datachain-0.2.16.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
98
+ datachain-0.2.16.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
99
+ datachain-0.2.16.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
100
+ datachain-0.2.16.dist-info/RECORD,,