datachain: datachain-0.3.19-py3-none-any.whl → datachain-0.3.20-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

@@ -79,6 +79,7 @@ if TYPE_CHECKING:
79
79
  )
80
80
  from datachain.dataset import DatasetVersion
81
81
  from datachain.job import Job
82
+ from datachain.lib.file import File
82
83
 
83
84
  logger = logging.getLogger("datachain")
84
85
 
@@ -1399,65 +1400,34 @@ class Catalog:
1399
1400
  dataset = self.get_dataset(name)
1400
1401
  return self.update_dataset(dataset, **update_data)
1401
1402
 
1402
- def get_file_signals(
1403
- self, dataset_name: str, dataset_version: int, row: RowDict
1404
- ) -> Optional[RowDict]:
1403
+ def get_file_from_row(
1404
+ self, dataset_name: str, dataset_version: int, row: RowDict, signal_name: str
1405
+ ) -> "File":
1405
1406
  """
1406
- Function that returns file signals from dataset row.
1407
- Note that signal names are without prefix, so if there was 'laion__file__source'
1408
- in original row, result will have just 'source'
1409
- Example output:
1410
- {
1411
- "source": "s3://ldb-public",
1412
- "path": "animals/dogs/dog.jpg",
1413
- ...
1414
- }
1407
+ Function that returns specific file signal from dataset row by name.
1415
1408
  """
1416
1409
  from datachain.lib.file import File
1417
1410
  from datachain.lib.signal_schema import DEFAULT_DELIMITER, SignalSchema
1418
1411
 
1419
1412
  version = self.get_dataset(dataset_name).get_version(dataset_version)
1420
-
1421
- file_signals_values = RowDict()
1422
-
1423
1413
  schema = SignalSchema.deserialize(version.feature_schema)
1424
- for file_signals in schema.get_signals(File):
1425
- prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
1426
- file_signals_values[file_signals] = {
1427
- c_name.removeprefix(prefix): c_value
1428
- for c_name, c_value in row.items()
1429
- if c_name.startswith(prefix)
1430
- and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
1431
- }
1432
1414
 
1433
- if not file_signals_values:
1434
- return None
1435
-
1436
- # there can be multiple file signals in a schema, but taking the first
1437
- # one for now. In future we might add ability to choose from which one
1438
- # to open object
1439
- return next(iter(file_signals_values.values()))
1440
-
1441
- def open_object(
1442
- self,
1443
- dataset_name: str,
1444
- dataset_version: int,
1445
- row: RowDict,
1446
- use_cache: bool = True,
1447
- **config: Any,
1448
- ):
1449
- from datachain.lib.file import File
1415
+ if signal_name not in schema.get_signals(File):
1416
+ raise RuntimeError(
1417
+ f"File signal with path {signal_name} not found in ",
1418
+ f"dataset {dataset_name}@v{dataset_version} signals schema",
1419
+ )
1450
1420
 
1451
- file_signals = self.get_file_signals(dataset_name, dataset_version, row)
1452
- if not file_signals:
1453
- raise RuntimeError("Cannot open object without file signals")
1421
+ prefix = signal_name.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
1422
+ file_signals = {
1423
+ c_name.removeprefix(prefix): c_value
1424
+ for c_name, c_value in row.items()
1425
+ if c_name.startswith(prefix)
1426
+ and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
1427
+ and c_name.removeprefix(prefix) in File.model_fields
1428
+ }
1454
1429
 
1455
- config = config or self.client_config
1456
- client = self.get_client(file_signals["source"], **config)
1457
- return client.open_object(
1458
- File._from_row(file_signals),
1459
- use_cache=use_cache,
1460
- )
1430
+ return File(**file_signals)
1461
1431
 
1462
1432
  def ls(
1463
1433
  self,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.19
3
+ Version: 0.3.20
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -18,7 +18,7 @@ datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
18
18
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
19
19
  datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
20
20
  datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
21
- datachain/catalog/catalog.py,sha256=poTu_B5va35MTCV60ntsn4jvAFXepqa2peCjYCXWeU0,64982
21
+ datachain/catalog/catalog.py,sha256=MC8qxu5r0eWtVSWBxPmnYsc-0sUnkzGUZZxgwFQDhH0,64002
22
22
  datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
23
23
  datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
24
24
  datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
@@ -97,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
97
97
  datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
98
98
  datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
99
99
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
100
- datachain-0.3.19.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
- datachain-0.3.19.dist-info/METADATA,sha256=yMBpXwOmeoWOmpS0m_hp8GFiMs3Zu_ixMzkG6GF_Z2U,17157
102
- datachain-0.3.19.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
103
- datachain-0.3.19.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
- datachain-0.3.19.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
- datachain-0.3.19.dist-info/RECORD,,
100
+ datachain-0.3.20.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
101
+ datachain-0.3.20.dist-info/METADATA,sha256=zFk_QWL3Ag3kxLdQPqYAFEXnTD2WkxrvJmLLGOxXpsE,17157
102
+ datachain-0.3.20.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
103
+ datachain-0.3.20.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
104
+ datachain-0.3.20.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
105
+ datachain-0.3.20.dist-info/RECORD,,