datachain 0.3.19__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff shows the published contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.

datachain/catalog/catalog.py CHANGED
@@ -79,6 +79,7 @@ if TYPE_CHECKING:
     )
     from datachain.dataset import DatasetVersion
     from datachain.job import Job
+    from datachain.lib.file import File
 
 logger = logging.getLogger("datachain")
 
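Note on the added import: `File` is brought in under the `if TYPE_CHECKING:` guard, so it exists for annotations only and is never imported at runtime, the standard way to avoid a circular import between `datachain.catalog` and `datachain.lib`. A minimal sketch of the pattern (the function body here is illustrative, not the actual catalog code):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by static type checkers; never executed at runtime,
    # so it cannot introduce an import cycle.
    from datachain.lib.file import File


def get_file_from_row(row: dict) -> "File":
    # The quoted "File" annotation is resolved lazily, so the real
    # import can stay inside the function body, as catalog.py does below.
    from datachain.lib.file import File

    return File(**row)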
@@ -978,7 +979,6 @@ class Catalog:
         script_output="",
         create_rows_table=True,
         job_id: Optional[str] = None,
-        is_job_result: bool = False,
     ) -> DatasetRecord:
         """
         Creates dataset version if it doesn't exist.
@@ -1000,7 +1000,6 @@ class Catalog:
             script_output=script_output,
             schema=schema,
             job_id=job_id,
-            is_job_result=is_job_result,
             ignore_if_exists=True,
         )
 
@@ -1210,7 +1209,6 @@ class Catalog:
             size=dataset_version.size,
             preview=dataset_version.preview,
             job_id=dataset_version.job_id,
-            is_job_result=dataset_version.is_job_result,
         )
         # to avoid re-creating rows table, we are just renaming it for a new version
         # of target dataset
@@ -1399,65 +1397,34 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)
 
-    def get_file_signals(
-        self, dataset_name: str, dataset_version: int, row: RowDict
-    ) -> Optional[RowDict]:
+    def get_file_from_row(
+        self, dataset_name: str, dataset_version: int, row: RowDict, signal_name: str
+    ) -> "File":
         """
-        Function that returns file signals from dataset row.
-        Note that signal names are without prefix, so if there was 'laion__file__source'
-        in original row, result will have just 'source'
-        Example output:
-        {
-            "source": "s3://ldb-public",
-            "path": "animals/dogs/dog.jpg",
-            ...
-        }
+        Function that returns specific file signal from dataset row by name.
         """
         from datachain.lib.file import File
         from datachain.lib.signal_schema import DEFAULT_DELIMITER, SignalSchema
 
         version = self.get_dataset(dataset_name).get_version(dataset_version)
-
-        file_signals_values = RowDict()
-
         schema = SignalSchema.deserialize(version.feature_schema)
-        for file_signals in schema.get_signals(File):
-            prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
-            file_signals_values[file_signals] = {
-                c_name.removeprefix(prefix): c_value
-                for c_name, c_value in row.items()
-                if c_name.startswith(prefix)
-                and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
-            }
 
-        if not file_signals_values:
-            return None
-
-        # there can be multiple file signals in a schema, but taking the first
-        # one for now. In future we might add ability to choose from which one
-        # to open object
-        return next(iter(file_signals_values.values()))
-
-    def open_object(
-        self,
-        dataset_name: str,
-        dataset_version: int,
-        row: RowDict,
-        use_cache: bool = True,
-        **config: Any,
-    ):
-        from datachain.lib.file import File
+        if signal_name not in schema.get_signals(File):
+            raise RuntimeError(
+                f"File signal with path {signal_name} not found in ",
+                f"dataset {dataset_name}@v{dataset_version} signals schema",
+            )
 
-        file_signals = self.get_file_signals(dataset_name, dataset_version, row)
-        if not file_signals:
-            raise RuntimeError("Cannot open object without file signals")
+        prefix = signal_name.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
+        file_signals = {
+            c_name.removeprefix(prefix): c_value
+            for c_name, c_value in row.items()
+            if c_name.startswith(prefix)
+            and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
+            and c_name.removeprefix(prefix) in File.model_fields
+        }
 
-        config = config or self.client_config
-        client = self.get_client(file_signals["source"], **config)
-        return client.open_object(
-            File._from_row(file_signals),
-            use_cache=use_cache,
-        )
+        return File(**file_signals)
 
     def ls(
         self,
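Net effect for callers: the old two-step `get_file_signals()` + `open_object()` flow is gone, and `get_file_from_row()` returns a `File` model for an explicitly named signal instead of silently taking the first file signal in the schema. A hedged usage sketch follows; the dataset name, version, row values, and the `laion.file` signal path are illustrative, and it assumes `get_catalog()` from `datachain.catalog` and `File.open()` from `datachain.lib.file`:

from datachain.catalog import get_catalog

catalog = get_catalog()

# Rows store nested signals flattened with DEFAULT_DELIMITER ("__"),
# e.g. a File model under the "laion.file" signal path:
row = {
    "laion__file__source": "s3://ldb-public",
    "laion__file__path": "animals/dogs/dog.jpg",
    "laion__file__size": 12345,
}

# 0.3.19: catalog.open_object(name, version, row) picked a file signal for you.
# 0.4.0: name the signal explicitly and get a File back to open yourself.
file = catalog.get_file_from_row("my-dataset", 1, row, "laion.file")
with file.open() as f:
    data = f.read()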
datachain/data_storage/metastore.py CHANGED
@@ -243,7 +243,6 @@ class AbstractMetastore(ABC, Serializable):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
-        is_job_result: bool = False,
     ) -> DatasetRecord:
         """Creates new dataset version."""
 
@@ -497,7 +496,6 @@ class AbstractDBMetastore(AbstractMetastore):
             Column("query_script", Text, nullable=False, default=""),
             Column("schema", JSON, nullable=True),
             Column("job_id", Text, nullable=True),
-            Column("is_job_result", Boolean, nullable=False, default=False),
             UniqueConstraint("dataset_id", "version"),
         ]
 
@@ -1009,7 +1007,6 @@ class AbstractDBMetastore(AbstractMetastore):
         size: Optional[int] = None,
         preview: Optional[list[dict]] = None,
         job_id: Optional[str] = None,
-        is_job_result: bool = False,
         conn=None,
     ) -> DatasetRecord:
         """Creates new dataset version."""
@@ -1035,7 +1032,6 @@ class AbstractDBMetastore(AbstractMetastore):
             size=size,
             preview=json.dumps(preview or []),
             job_id=job_id or os.getenv("DATACHAIN_JOB_ID"),
-            is_job_result=is_job_result,
         )
         if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
             # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
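The trailing context is the idempotent-insert path this change flows through. For reference, a minimal SQLAlchemy sketch of the `on_conflict_do_nothing` pattern the comment refers to; the table here is a trimmed-down stand-in, not datachain's actual definition:

from sqlalchemy import Column, Integer, MetaData, Table, Text, UniqueConstraint
from sqlalchemy.dialects.sqlite import insert  # the postgresql dialect has the same API

metadata = MetaData()
versions = Table(
    "datasets_versions",  # illustrative name
    metadata,
    Column("id", Integer, primary_key=True),
    Column("dataset_id", Integer, nullable=False),
    Column("version", Integer, nullable=False),
    Column("job_id", Text, nullable=True),
    UniqueConstraint("dataset_id", "version"),
)

# With ignore_if_exists=True, re-inserting an existing (dataset_id, version)
# pair becomes a no-op instead of raising an IntegrityError:
stmt = (
    insert(versions)
    .values(dataset_id=1, version=1, job_id=None)
    .on_conflict_do_nothing()
)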
datachain/dataset.py CHANGED
@@ -179,7 +179,6 @@ class DatasetVersion:
     sources: str = ""
     query_script: str = ""
    job_id: Optional[str] = None
-    is_job_result: bool = False
 
     @classmethod
     def parse(  # noqa: PLR0913
@@ -201,7 +200,6 @@ class DatasetVersion:
         sources: str = "",
         query_script: str = "",
         job_id: Optional[str] = None,
-        is_job_result: bool = False,
     ):
         return cls(
             id,
@@ -221,7 +219,6 @@ class DatasetVersion:
             sources,
             query_script,
             job_id,
-            is_job_result,
         )
 
     def __eq__(self, other):
@@ -327,7 +324,6 @@ class DatasetRecord:
         version_query_script: Optional[str],
         version_schema: str,
         version_job_id: Optional[str] = None,
-        version_is_job_result: bool = False,
     ) -> "DatasetRecord":
         labels_lst: list[str] = json.loads(labels) if labels else []
         schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
@@ -353,7 +349,6 @@ class DatasetRecord:
             version_sources,  # type: ignore[arg-type]
             version_query_script,  # type: ignore[arg-type]
             version_job_id,
-            version_is_job_result,
         )
 
         return cls(
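Because `parse()` forwards its arguments to `cls(...)` positionally, dropping `is_job_result` has to update the field list, the `parse()` signature, and every positional call site in lockstep; a missed caller only fails when executed. A tiny, hypothetical illustration of that failure mode:

from dataclasses import dataclass
from typing import Optional


@dataclass
class Version:
    query_script: str = ""
    job_id: Optional[str] = None
    # is_job_result: bool = False  # trailing field removed, as in this release


# A stale call site still passing the removed trailing flag fails at runtime:
try:
    Version("SELECT 1", None, False)
except TypeError as exc:
    print(exc)  # __init__() takes from 1 to 3 positional arguments but 4 were given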
datachain-0.3.19.dist-info/METADATA → datachain-0.4.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.19
+Version: 0.4.0
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
datachain-0.3.19.dist-info/RECORD → datachain-0.4.0.dist-info/RECORD RENAMED
@@ -5,7 +5,7 @@ datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
 datachain/cli.py,sha256=TQ1OKMulAcsJndKLCyxJpfNqbMWQgOa4Aeihnu36cR8,30095
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
-datachain/dataset.py,sha256=2NCQU9ZSgNGhA01SP5ON18VhMohXif-btOB4Lz-Uvds,14911
+datachain/dataset.py,sha256=HWcFckJpmTU5AGsg8ILW8JInpNQqaWmJoasls18q5kI,14735
 datachain/error.py,sha256=vbIbamnFMIojh1UpmxWoA6Omup7WFAFNJnf8xAkGWwI,1146
 datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
 datachain/listing.py,sha256=TkMmBzCiru26x4RaZiagWJTmTGbiy6yGrAsSJMr8cFE,8213
@@ -18,7 +18,7 @@ datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=poTu_B5va35MTCV60ntsn4jvAFXepqa2peCjYCXWeU0,64982
+datachain/catalog/catalog.py,sha256=FuKuIiCwPgN5Ea25hnFe_ZFZH9YEUZ2ma9k_Lczk-JU,63867
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -33,7 +33,7 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
 datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
 datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=BePe3bVxo-Zuuccok8TLRo4cMHVnAIa8hfZMadbxzqM,52649
+datachain/data_storage/metastore.py,sha256=Ztw86JbN4-1gobZea1oqAAT2kotvi46pxNRjqncZ7B8,52457
 datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
@@ -97,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.19.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.19.dist-info/METADATA,sha256=yMBpXwOmeoWOmpS0m_hp8GFiMs3Zu_ixMzkG6GF_Z2U,17157
-datachain-0.3.19.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-datachain-0.3.19.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.19.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.19.dist-info/RECORD,,
+datachain-0.4.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.4.0.dist-info/METADATA,sha256=UmW4n6_qqsTZe_bXdjwCe6n6zWSVq35Kn_-h_u_b0RA,17156
+datachain-0.4.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+datachain-0.4.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.4.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.4.0.dist-info/RECORD,,
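For readers unfamiliar with the RECORD format: each line is `path,sha256=<digest>,<size>`, where the digest is URL-safe base64 with padding stripped (PEP 376 / the wheel spec), which is why `dataset.py` shows both a new hash and a new size (14911 → 14735 bytes) after the `is_job_result` removal. A small sketch to recompute an entry locally:

import base64
import hashlib
from pathlib import Path


def record_entry(path: str) -> str:
    """Rebuild a wheel RECORD line for a file on disk."""
    data = Path(path).read_bytes()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest())
    return f"{path},sha256={digest.rstrip(b'=').decode()},{len(data)}"


# e.g. record_entry("datachain/dataset.py") should reproduce
# "datachain/dataset.py,sha256=HWcFckJpmTU5...,14735" for the 0.4.0 wheel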