datachain 0.3.19__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/catalog/catalog.py +19 -52
- datachain/data_storage/metastore.py +0 -4
- datachain/dataset.py +0 -5
- {datachain-0.3.19.dist-info → datachain-0.4.0.dist-info}/METADATA +1 -1
- {datachain-0.3.19.dist-info → datachain-0.4.0.dist-info}/RECORD +9 -9
- {datachain-0.3.19.dist-info → datachain-0.4.0.dist-info}/LICENSE +0 -0
- {datachain-0.3.19.dist-info → datachain-0.4.0.dist-info}/WHEEL +0 -0
- {datachain-0.3.19.dist-info → datachain-0.4.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.3.19.dist-info → datachain-0.4.0.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
|
@@ -79,6 +79,7 @@ if TYPE_CHECKING:
|
|
|
79
79
|
)
|
|
80
80
|
from datachain.dataset import DatasetVersion
|
|
81
81
|
from datachain.job import Job
|
|
82
|
+
from datachain.lib.file import File
|
|
82
83
|
|
|
83
84
|
logger = logging.getLogger("datachain")
|
|
84
85
|
|
|
@@ -978,7 +979,6 @@ class Catalog:
|
|
|
978
979
|
script_output="",
|
|
979
980
|
create_rows_table=True,
|
|
980
981
|
job_id: Optional[str] = None,
|
|
981
|
-
is_job_result: bool = False,
|
|
982
982
|
) -> DatasetRecord:
|
|
983
983
|
"""
|
|
984
984
|
Creates dataset version if it doesn't exist.
|
|
@@ -1000,7 +1000,6 @@ class Catalog:
|
|
|
1000
1000
|
script_output=script_output,
|
|
1001
1001
|
schema=schema,
|
|
1002
1002
|
job_id=job_id,
|
|
1003
|
-
is_job_result=is_job_result,
|
|
1004
1003
|
ignore_if_exists=True,
|
|
1005
1004
|
)
|
|
1006
1005
|
|
|
@@ -1210,7 +1209,6 @@ class Catalog:
|
|
|
1210
1209
|
size=dataset_version.size,
|
|
1211
1210
|
preview=dataset_version.preview,
|
|
1212
1211
|
job_id=dataset_version.job_id,
|
|
1213
|
-
is_job_result=dataset_version.is_job_result,
|
|
1214
1212
|
)
|
|
1215
1213
|
# to avoid re-creating rows table, we are just renaming it for a new version
|
|
1216
1214
|
# of target dataset
|
|
@@ -1399,65 +1397,34 @@ class Catalog:
|
|
|
1399
1397
|
dataset = self.get_dataset(name)
|
|
1400
1398
|
return self.update_dataset(dataset, **update_data)
|
|
1401
1399
|
|
|
1402
|
-
def
|
|
1403
|
-
self, dataset_name: str, dataset_version: int, row: RowDict
|
|
1404
|
-
) ->
|
|
1400
|
+
def get_file_from_row(
|
|
1401
|
+
self, dataset_name: str, dataset_version: int, row: RowDict, signal_name: str
|
|
1402
|
+
) -> "File":
|
|
1405
1403
|
"""
|
|
1406
|
-
Function that returns file
|
|
1407
|
-
Note that signal names are without prefix, so if there was 'laion__file__source'
|
|
1408
|
-
in original row, result will have just 'source'
|
|
1409
|
-
Example output:
|
|
1410
|
-
{
|
|
1411
|
-
"source": "s3://ldb-public",
|
|
1412
|
-
"path": "animals/dogs/dog.jpg",
|
|
1413
|
-
...
|
|
1414
|
-
}
|
|
1404
|
+
Function that returns specific file signal from dataset row by name.
|
|
1415
1405
|
"""
|
|
1416
1406
|
from datachain.lib.file import File
|
|
1417
1407
|
from datachain.lib.signal_schema import DEFAULT_DELIMITER, SignalSchema
|
|
1418
1408
|
|
|
1419
1409
|
version = self.get_dataset(dataset_name).get_version(dataset_version)
|
|
1420
|
-
|
|
1421
|
-
file_signals_values = RowDict()
|
|
1422
|
-
|
|
1423
1410
|
schema = SignalSchema.deserialize(version.feature_schema)
|
|
1424
|
-
for file_signals in schema.get_signals(File):
|
|
1425
|
-
prefix = file_signals.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
|
|
1426
|
-
file_signals_values[file_signals] = {
|
|
1427
|
-
c_name.removeprefix(prefix): c_value
|
|
1428
|
-
for c_name, c_value in row.items()
|
|
1429
|
-
if c_name.startswith(prefix)
|
|
1430
|
-
and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
|
|
1431
|
-
}
|
|
1432
1411
|
|
|
1433
|
-
if not
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
# to open object
|
|
1439
|
-
return next(iter(file_signals_values.values()))
|
|
1440
|
-
|
|
1441
|
-
def open_object(
|
|
1442
|
-
self,
|
|
1443
|
-
dataset_name: str,
|
|
1444
|
-
dataset_version: int,
|
|
1445
|
-
row: RowDict,
|
|
1446
|
-
use_cache: bool = True,
|
|
1447
|
-
**config: Any,
|
|
1448
|
-
):
|
|
1449
|
-
from datachain.lib.file import File
|
|
1412
|
+
if signal_name not in schema.get_signals(File):
|
|
1413
|
+
raise RuntimeError(
|
|
1414
|
+
f"File signal with path {signal_name} not found in ",
|
|
1415
|
+
f"dataset {dataset_name}@v{dataset_version} signals schema",
|
|
1416
|
+
)
|
|
1450
1417
|
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1418
|
+
prefix = signal_name.replace(".", DEFAULT_DELIMITER) + DEFAULT_DELIMITER
|
|
1419
|
+
file_signals = {
|
|
1420
|
+
c_name.removeprefix(prefix): c_value
|
|
1421
|
+
for c_name, c_value in row.items()
|
|
1422
|
+
if c_name.startswith(prefix)
|
|
1423
|
+
and DEFAULT_DELIMITER not in c_name.removeprefix(prefix)
|
|
1424
|
+
and c_name.removeprefix(prefix) in File.model_fields
|
|
1425
|
+
}
|
|
1454
1426
|
|
|
1455
|
-
|
|
1456
|
-
client = self.get_client(file_signals["source"], **config)
|
|
1457
|
-
return client.open_object(
|
|
1458
|
-
File._from_row(file_signals),
|
|
1459
|
-
use_cache=use_cache,
|
|
1460
|
-
)
|
|
1427
|
+
return File(**file_signals)
|
|
1461
1428
|
|
|
1462
1429
|
def ls(
|
|
1463
1430
|
self,
|
|
@@ -243,7 +243,6 @@ class AbstractMetastore(ABC, Serializable):
|
|
|
243
243
|
size: Optional[int] = None,
|
|
244
244
|
preview: Optional[list[dict]] = None,
|
|
245
245
|
job_id: Optional[str] = None,
|
|
246
|
-
is_job_result: bool = False,
|
|
247
246
|
) -> DatasetRecord:
|
|
248
247
|
"""Creates new dataset version."""
|
|
249
248
|
|
|
@@ -497,7 +496,6 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
497
496
|
Column("query_script", Text, nullable=False, default=""),
|
|
498
497
|
Column("schema", JSON, nullable=True),
|
|
499
498
|
Column("job_id", Text, nullable=True),
|
|
500
|
-
Column("is_job_result", Boolean, nullable=False, default=False),
|
|
501
499
|
UniqueConstraint("dataset_id", "version"),
|
|
502
500
|
]
|
|
503
501
|
|
|
@@ -1009,7 +1007,6 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1009
1007
|
size: Optional[int] = None,
|
|
1010
1008
|
preview: Optional[list[dict]] = None,
|
|
1011
1009
|
job_id: Optional[str] = None,
|
|
1012
|
-
is_job_result: bool = False,
|
|
1013
1010
|
conn=None,
|
|
1014
1011
|
) -> DatasetRecord:
|
|
1015
1012
|
"""Creates new dataset version."""
|
|
@@ -1035,7 +1032,6 @@ class AbstractDBMetastore(AbstractMetastore):
|
|
|
1035
1032
|
size=size,
|
|
1036
1033
|
preview=json.dumps(preview or []),
|
|
1037
1034
|
job_id=job_id or os.getenv("DATACHAIN_JOB_ID"),
|
|
1038
|
-
is_job_result=is_job_result,
|
|
1039
1035
|
)
|
|
1040
1036
|
if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
|
|
1041
1037
|
# SQLite and PostgreSQL both support 'on_conflict_do_nothing',
|
datachain/dataset.py
CHANGED
|
@@ -179,7 +179,6 @@ class DatasetVersion:
|
|
|
179
179
|
sources: str = ""
|
|
180
180
|
query_script: str = ""
|
|
181
181
|
job_id: Optional[str] = None
|
|
182
|
-
is_job_result: bool = False
|
|
183
182
|
|
|
184
183
|
@classmethod
|
|
185
184
|
def parse( # noqa: PLR0913
|
|
@@ -201,7 +200,6 @@ class DatasetVersion:
|
|
|
201
200
|
sources: str = "",
|
|
202
201
|
query_script: str = "",
|
|
203
202
|
job_id: Optional[str] = None,
|
|
204
|
-
is_job_result: bool = False,
|
|
205
203
|
):
|
|
206
204
|
return cls(
|
|
207
205
|
id,
|
|
@@ -221,7 +219,6 @@ class DatasetVersion:
|
|
|
221
219
|
sources,
|
|
222
220
|
query_script,
|
|
223
221
|
job_id,
|
|
224
|
-
is_job_result,
|
|
225
222
|
)
|
|
226
223
|
|
|
227
224
|
def __eq__(self, other):
|
|
@@ -327,7 +324,6 @@ class DatasetRecord:
|
|
|
327
324
|
version_query_script: Optional[str],
|
|
328
325
|
version_schema: str,
|
|
329
326
|
version_job_id: Optional[str] = None,
|
|
330
|
-
version_is_job_result: bool = False,
|
|
331
327
|
) -> "DatasetRecord":
|
|
332
328
|
labels_lst: list[str] = json.loads(labels) if labels else []
|
|
333
329
|
schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
|
|
@@ -353,7 +349,6 @@ class DatasetRecord:
|
|
|
353
349
|
version_sources, # type: ignore[arg-type]
|
|
354
350
|
version_query_script, # type: ignore[arg-type]
|
|
355
351
|
version_job_id,
|
|
356
|
-
version_is_job_result,
|
|
357
352
|
)
|
|
358
353
|
|
|
359
354
|
return cls(
|
|
@@ -5,7 +5,7 @@ datachain/cache.py,sha256=s0YHN7qurmQv-eC265TjeureK84TebWWAnL07cxchZQ,2997
|
|
|
5
5
|
datachain/cli.py,sha256=TQ1OKMulAcsJndKLCyxJpfNqbMWQgOa4Aeihnu36cR8,30095
|
|
6
6
|
datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
|
|
7
7
|
datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
|
|
8
|
-
datachain/dataset.py,sha256=
|
|
8
|
+
datachain/dataset.py,sha256=HWcFckJpmTU5AGsg8ILW8JInpNQqaWmJoasls18q5kI,14735
|
|
9
9
|
datachain/error.py,sha256=vbIbamnFMIojh1UpmxWoA6Omup7WFAFNJnf8xAkGWwI,1146
|
|
10
10
|
datachain/job.py,sha256=Jt4sNutMHJReaGsj3r3scueN5aESLGfhimAa8pUP7Is,1271
|
|
11
11
|
datachain/listing.py,sha256=TkMmBzCiru26x4RaZiagWJTmTGbiy6yGrAsSJMr8cFE,8213
|
|
@@ -18,7 +18,7 @@ datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
|
|
|
18
18
|
datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
|
|
19
19
|
datachain/utils.py,sha256=KeFSRHsiYthnTu4a6bH-rw04mX1m8krTX0f2NqfQGFI,12114
|
|
20
20
|
datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
|
|
21
|
-
datachain/catalog/catalog.py,sha256=
|
|
21
|
+
datachain/catalog/catalog.py,sha256=FuKuIiCwPgN5Ea25hnFe_ZFZH9YEUZ2ma9k_Lczk-JU,63867
|
|
22
22
|
datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
|
|
23
23
|
datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
|
|
24
24
|
datachain/client/__init__.py,sha256=T4wiYL9KIM0ZZ_UqIyzV8_ufzYlewmizlV4iymHNluE,86
|
|
@@ -33,7 +33,7 @@ datachain/data_storage/__init__.py,sha256=cEOJpyu1JDZtfUupYucCDNFI6e5Wmp_Oyzq6rZ
|
|
|
33
33
|
datachain/data_storage/db_engine.py,sha256=81Ol1of9TTTzD97ORajCnP366Xz2mEJt6C-kTUCaru4,3406
|
|
34
34
|
datachain/data_storage/id_generator.py,sha256=lCEoU0BM37Ai2aRpSbwo5oQT0GqZnSpYwwvizathRMQ,4292
|
|
35
35
|
datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
|
|
36
|
-
datachain/data_storage/metastore.py,sha256=
|
|
36
|
+
datachain/data_storage/metastore.py,sha256=Ztw86JbN4-1gobZea1oqAAT2kotvi46pxNRjqncZ7B8,52457
|
|
37
37
|
datachain/data_storage/schema.py,sha256=AGbjyEir5UmRZXI3m0jChZogUh5wd8csj6-YlUWaAxQ,8383
|
|
38
38
|
datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
|
|
39
39
|
datachain/data_storage/sqlite.py,sha256=EBKJncuzcyQfcKFm2mUjvHjHRTODsteM-k_zndunBrw,28834
|
|
@@ -97,9 +97,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
|
|
|
97
97
|
datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
|
|
98
98
|
datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
|
|
99
99
|
datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
|
|
100
|
-
datachain-0.
|
|
101
|
-
datachain-0.
|
|
102
|
-
datachain-0.
|
|
103
|
-
datachain-0.
|
|
104
|
-
datachain-0.
|
|
105
|
-
datachain-0.
|
|
100
|
+
datachain-0.4.0.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
|
|
101
|
+
datachain-0.4.0.dist-info/METADATA,sha256=UmW4n6_qqsTZe_bXdjwCe6n6zWSVq35Kn_-h_u_b0RA,17156
|
|
102
|
+
datachain-0.4.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
103
|
+
datachain-0.4.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
|
|
104
|
+
datachain-0.4.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
|
|
105
|
+
datachain-0.4.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|