datachain 0.33.1__py3-none-any.whl → 0.34.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic.
- datachain/catalog/catalog.py +22 -58
- datachain/data_storage/metastore.py +22 -1
- datachain/error.py +4 -0
- datachain/job.py +3 -0
- datachain/lib/dc/datachain.py +169 -71
- {datachain-0.33.1.dist-info → datachain-0.34.1.dist-info}/METADATA +1 -1
- {datachain-0.33.1.dist-info → datachain-0.34.1.dist-info}/RECORD +11 -11
- {datachain-0.33.1.dist-info → datachain-0.34.1.dist-info}/WHEEL +0 -0
- {datachain-0.33.1.dist-info → datachain-0.34.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.33.1.dist-info → datachain-0.34.1.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.33.1.dist-info → datachain-0.34.1.dist-info}/top_level.txt +0 -0
datachain/catalog/catalog.py
CHANGED
@@ -144,26 +144,19 @@ def shutdown_process(
     return proc.wait()
 
 
-def
+def _process_stream(stream: "IO[bytes]", callback: Callable[[str], None]) -> None:
     buffer = b""
+    while byt := stream.read(1):  # Read one byte at a time
+        buffer += byt
 
-
-
-        buffer += byt
-
-        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
-            line = buffer.decode("utf-8", errors="replace")
-            callback(line)
-            buffer = b""  # Clear buffer for the next line
-
-    if buffer:  # Handle any remaining data in the buffer
-        line = buffer.decode("utf-8", errors="replace")
+        if byt in (b"\n", b"\r"):  # Check for newline or carriage return
+            line = buffer.decode("utf-8")
             callback(line)
-
-
-
-
-
+            buffer = b""  # Clear buffer for next line
+
+    if buffer:  # Handle any remaining data in the buffer
+        line = buffer.decode("utf-8")
+        callback(line)
 
 
 class DatasetRowsFetcher(NodesThreadPool):
@@ -1767,13 +1760,13 @@ class Catalog:
         recursive=recursive,
     )
 
-    @staticmethod
     def query(
+        self,
         query_script: str,
         env: Optional[Mapping[str, str]] = None,
         python_executable: str = sys.executable,
-
-
+        capture_output: bool = False,
+        output_hook: Callable[[str], None] = noop,
         params: Optional[dict[str, str]] = None,
         job_id: Optional[str] = None,
         interrupt_timeout: Optional[int] = None,
@@ -1788,18 +1781,13 @@ class Catalog:
             },
         )
         popen_kwargs: dict[str, Any] = {}
-
-
-        popen_kwargs = {"stdout": subprocess.PIPE}
-        if stderr_callback is not None:
-            popen_kwargs["stderr"] = subprocess.PIPE
+        if capture_output:
+            popen_kwargs = {"stdout": subprocess.PIPE, "stderr": subprocess.STDOUT}
 
         def raise_termination_signal(sig: int, _: Any) -> NoReturn:
             raise TerminationSignal(sig)
 
-
-        stderr_thread: Optional[Thread] = None
-
+        thread: Optional[Thread] = None
         with subprocess.Popen(cmd, env=env, **popen_kwargs) as proc:  # noqa: S603
             logger.info("Starting process %s", proc.pid)
 
@@ -1813,20 +1801,10 @@ class Catalog:
             orig_sigterm_handler = signal.getsignal(signal.SIGTERM)
             signal.signal(signal.SIGTERM, raise_termination_signal)
             try:
-                if
-
-
-
-                        daemon=True,
-                    )
-                    stdout_thread.start()
-                if stderr_callback is not None:
-                    stderr_thread = Thread(
-                        target=process_output,
-                        args=(proc.stderr, stderr_callback),
-                        daemon=True,
-                    )
-                    stderr_thread.start()
+                if capture_output:
+                    args = (proc.stdout, output_hook)
+                    thread = Thread(target=_process_stream, args=args, daemon=True)
+                    thread.start()
 
                 proc.wait()
             except TerminationSignal as exc:
@@ -1844,22 +1822,8 @@ class Catalog:
             finally:
                 signal.signal(signal.SIGTERM, orig_sigterm_handler)
                 signal.signal(signal.SIGINT, orig_sigint_handler)
-
-
-                if stdout_thread is not None:
-                    stdout_thread.join(timeout=thread_join_timeout_seconds)
-                    if stdout_thread.is_alive():
-                        logger.warning(
-                            "stdout thread is still alive after %s seconds",
-                            thread_join_timeout_seconds,
-                        )
-                if stderr_thread is not None:
-                    stderr_thread.join(timeout=thread_join_timeout_seconds)
-                    if stderr_thread.is_alive():
-                        logger.warning(
-                            "stderr thread is still alive after %s seconds",
-                            thread_join_timeout_seconds,
-                        )
+                if thread:
+                    thread.join()  # wait for the reader thread
 
         logger.info("Process %s exited with return code %s", proc.pid, proc.returncode)
         if proc.returncode in (
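For context on the catalog change above: the old query() spawned separate stdout and stderr reader threads with per-stream callbacks, while the new version merges stderr into stdout and feeds one output_hook from a single daemon thread. The following is a minimal, self-contained sketch of that pattern, assuming a stand-in child script and hook; it is not datachain's public API.

import subprocess
import sys
from threading import Thread
from typing import IO, Callable


def process_stream(stream: IO[bytes], callback: Callable[[str], None]) -> None:
    buffer = b""
    while byt := stream.read(1):  # one byte at a time, so "\r" progress updates flush too
        buffer += byt
        if byt in (b"\n", b"\r"):
            callback(buffer.decode("utf-8"))
            buffer = b""
    if buffer:  # trailing output without a final newline
        callback(buffer.decode("utf-8"))


def hook(line: str) -> None:
    print(line, end="")  # lines arrive with their newline or carriage return attached


cmd = [sys.executable, "-c", "import sys; print('out'); print('err', file=sys.stderr)"]
with subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) as proc:
    reader = Thread(target=process_stream, args=(proc.stdout, hook), daemon=True)
    reader.start()
    proc.wait()
    reader.join()  # the reader exits on EOF once the child has finished

print("exit code:", proc.returncode)

One behavioral difference worth noting: the new code decodes with plain decode("utf-8") where the old code used errors="replace", so undecodable bytes in process output will now raise UnicodeDecodeError.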
datachain/data_storage/metastore.py
CHANGED
@@ -21,6 +21,7 @@ from sqlalchemy import (
     Table,
     Text,
     UniqueConstraint,
+    desc,
     select,
 )
 from sqlalchemy.sql import func as f
@@ -399,6 +400,7 @@ class AbstractMetastore(ABC, Serializable):
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
+        parent_job_id: Optional[str] = None,
     ) -> str:
         """
         Creates a new job.
@@ -443,6 +445,10 @@ class AbstractMetastore(ABC, Serializable):
     def list_checkpoints(self, job_id: str, conn=None) -> Iterator["Checkpoint"]:
         """Returns all checkpoints related to some job"""
 
+    @abstractmethod
+    def get_last_checkpoint(self, job_id: str, conn=None) -> Optional[Checkpoint]:
+        """Get last created checkpoint for some job."""
+
     @abstractmethod
     def get_checkpoint_by_id(self, checkpoint_id: str, conn=None) -> Checkpoint:
         """Gets single checkpoint by id"""
@@ -1548,6 +1554,7 @@ class AbstractDBMetastore(AbstractMetastore):
         Column("error_stack", Text, nullable=False, default=""),
         Column("params", JSON, nullable=False),
         Column("metrics", JSON, nullable=False),
+        Column("parent_job_id", Text, nullable=True),
     ]
 
     @cached_property
@@ -1595,6 +1602,7 @@ class AbstractDBMetastore(AbstractMetastore):
         workers: int = 1,
         python_version: Optional[str] = None,
         params: Optional[dict[str, str]] = None,
+        parent_job_id: Optional[str] = None,
         conn: Optional[Any] = None,
     ) -> str:
         """
@@ -1616,6 +1624,7 @@ class AbstractDBMetastore(AbstractMetastore):
                 error_stack="",
                 params=json.dumps(params or {}),
                 metrics=json.dumps({}),
+                parent_job_id=parent_job_id,
             ),
             conn=conn,
         )
@@ -1770,7 +1779,7 @@ class AbstractDBMetastore(AbstractMetastore):
         )
         return self.get_checkpoint_by_id(checkpoint_id)
 
-    def list_checkpoints(self, job_id: str, conn=None) -> Iterator[
+    def list_checkpoints(self, job_id: str, conn=None) -> Iterator[Checkpoint]:
         """List checkpoints by job id."""
         query = self._checkpoints_query().where(self._checkpoints.c.job_id == job_id)
         rows = list(self.db.execute(query, conn=conn))
@@ -1800,3 +1809,15 @@ class AbstractDBMetastore(AbstractMetastore):
         if not rows:
             return None
         return self.checkpoint_class.parse(*rows[0])
+
+    def get_last_checkpoint(self, job_id: str, conn=None) -> Optional[Checkpoint]:
+        query = (
+            self._checkpoints_query()
+            .where(self._checkpoints.c.job_id == job_id)
+            .order_by(desc(self._checkpoints.c.created_at))
+            .limit(1)
+        )
+        rows = list(self.db.execute(query, conn=conn))
+        if not rows:
+            return None
+        return self.checkpoint_class.parse(*rows[0])
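The new get_last_checkpoint() is a standard latest-row query, which is what the added desc import supports: filter by job_id, order by created_at descending, take one. Below is a runnable sketch of the same pattern against a stand-in table; the schema here is illustrative, not datachain's actual checkpoints table.

import datetime

import sqlalchemy as sa

metadata = sa.MetaData()
checkpoints = sa.Table(
    "checkpoints",
    metadata,
    sa.Column("id", sa.Integer, primary_key=True),
    sa.Column("job_id", sa.Text, nullable=False),
    sa.Column("hash", sa.Text, nullable=False),
    sa.Column("created_at", sa.DateTime, nullable=False),
)

engine = sa.create_engine("sqlite:///:memory:")
metadata.create_all(engine)

with engine.begin() as conn:
    now = datetime.datetime.now()
    conn.execute(
        checkpoints.insert(),
        [
            {"job_id": "job-1", "hash": "aaa", "created_at": now - datetime.timedelta(minutes=5)},
            {"job_id": "job-1", "hash": "bbb", "created_at": now},
        ],
    )
    query = (
        sa.select(checkpoints)
        .where(checkpoints.c.job_id == "job-1")
        .order_by(sa.desc(checkpoints.c.created_at))
        .limit(1)
    )
    row = conn.execute(query).first()
    print(row.hash)  # "bbb" -- the most recent checkpoint wins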
datachain/error.py
CHANGED
datachain/job.py
CHANGED
@@ -22,6 +22,7 @@ class Job:
     python_version: Optional[str] = None
     error_message: str = ""
     error_stack: str = ""
+    parent_job_id: Optional[str] = None
 
     @classmethod
     def parse(
@@ -39,6 +40,7 @@ class Job:
         error_stack: str,
         params: str,
         metrics: str,
+        parent_job_id: Optional[str],
     ) -> "Job":
         return cls(
             str(id),
@@ -54,4 +56,5 @@ class Job:
             python_version,
             error_message,
             error_stack,
+            parent_job_id,
         )
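Job rows come back from the database as positional tuples, which is why parent_job_id is appended both to the jobs table columns and to the tail of parse()'s parameter list: the two orders must stay in sync. A trimmed stand-in showing the pattern (the real Job carries many more fields):

from dataclasses import dataclass
from typing import Optional


@dataclass
class Job:
    id: str
    name: str
    parent_job_id: Optional[str] = None

    @classmethod
    def parse(cls, id: str, name: str, parent_job_id: Optional[str]) -> "Job":
        # positional: column order in the SELECT must match this signature
        return cls(str(id), name, parent_job_id)


row = ("42", "retry-run", "41")
print(Job.parse(*row))  # Job(id='42', name='retry-run', parent_job_id='41')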
datachain/lib/dc/datachain.py
CHANGED
@@ -19,7 +19,6 @@ from typing import (
     cast,
     overload,
 )
-from uuid import uuid4
 
 import sqlalchemy
 import ujson as json
@@ -30,10 +29,15 @@ from tqdm import tqdm
 from datachain import semver
 from datachain.dataset import DatasetRecord
 from datachain.delta import delta_disabled
-from datachain.error import
+from datachain.error import (
+    JobNotFoundError,
+    ProjectCreateNotAllowedError,
+    ProjectNotFoundError,
+)
 from datachain.func import literal
 from datachain.func.base import Function
 from datachain.func.func import Func
+from datachain.job import Job
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.data_model import (
     DataModel,
@@ -50,11 +54,12 @@ from datachain.lib.signal_schema import SignalResolvingError, SignalSchema
 from datachain.lib.udf import Aggregator, BatchMapper, Generator, Mapper, UDFBase
 from datachain.lib.udf_signature import UdfSignature
 from datachain.lib.utils import DataChainColumnError, DataChainParamsError
+from datachain.project import Project
 from datachain.query import Session
 from datachain.query.dataset import DatasetQuery, PartitionByType
 from datachain.query.schema import DEFAULT_DELIMITER, Column
 from datachain.sql.functions import path as pathfunc
-from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
+from datachain.utils import batched_it, env2bool, inside_notebook, row_to_nested_dict
 
 from .database import DEFAULT_DATABASE_BATCH_SIZE
 from .utils import (
@@ -578,6 +583,19 @@ class DataChain:
             query=self._query.save(project=project, feature_schema=schema)
         )
 
+    def _calculate_job_hash(self, job_id: str) -> str:
+        """
+        Calculates hash of the job at the place of this chain's save method.
+        Hash is calculated using previous job checkpoint hash (if exists) and
+        adding hash of this chain to produce new hash.
+        """
+        last_checkpoint = self.session.catalog.metastore.get_last_checkpoint(job_id)
+
+        return hashlib.sha256(
+            (bytes.fromhex(last_checkpoint.hash) if last_checkpoint else b"")
+            + bytes.fromhex(self.hash())
+        ).hexdigest()
+
     def save(  # type: ignore[override]
         self,
         name: str,
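_calculate_job_hash builds a hash chain: each checkpoint hash is sha256(previous checkpoint hash ++ current chain hash), so the Nth checkpoint commits to every save() step before it, and a change to any earlier step changes all later hashes. A standalone illustration, with step_hashes standing in for successive DataChain.hash() values:

import hashlib

step_hashes = [
    hashlib.sha256(b"step-1").hexdigest(),
    hashlib.sha256(b"step-2").hexdigest(),
    hashlib.sha256(b"step-3").hexdigest(),
]

checkpoint_hash = None
for chain_hash in step_hashes:
    prev = bytes.fromhex(checkpoint_hash) if checkpoint_hash else b""
    checkpoint_hash = hashlib.sha256(prev + bytes.fromhex(chain_hash)).hexdigest()
    print(checkpoint_hash)

# Editing step-1 would change the first checkpoint hash and, through the
# chaining, every hash after it -- which is how a re-run detects the first
# step that diverged from its parent job.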
@@ -602,101 +620,171 @@ class DataChain:
             update_version: which part of the dataset version to automatically increase.
                 Available values: `major`, `minor` or `patch`. Default is `patch`.
         """
+
         catalog = self.session.catalog
-        if version is not None:
-            semver.validate(version)
 
-
-
-
-
-
-            raise ValueError(
-                "update_version can have one of the following values: major, minor or"
-                " patch"
-            )
+        result = None  # result chain that will be returned at the end
+
+        # Version validation
+        self._validate_version(version)
+        self._validate_update_version(update_version)
 
         namespace_name, project_name, name = catalog.get_full_dataset_name(
             name,
             namespace_name=self._settings.namespace,
             project_name=self._settings.project,
         )
+        project = self._get_or_create_project(namespace_name, project_name)
+
+        # Checkpoint handling
+        job, _hash, result = self._resolve_checkpoint(name, project, kwargs)
+
+        # Schema preparation
+        schema = self.signals_schema.clone_without_sys_signals().serialize()
 
+        # Handle retry and delta functionality
+        if not result:
+            result = self._handle_delta(name, version, project, schema, kwargs)
+
+        if not result:
+            # calculate chain if we already don't have result from checkpoint or delta
+            result = self._evolve(
+                query=self._query.save(
+                    name=name,
+                    version=version,
+                    project=project,
+                    description=description,
+                    attrs=attrs,
+                    feature_schema=schema,
+                    update_version=update_version,
+                    **kwargs,
+                )
+            )
+
+        if job:
+            catalog.metastore.create_checkpoint(job.id, _hash)  # type: ignore[arg-type]
+
+        return result
+
+    def _validate_version(self, version: Optional[str]) -> None:
+        """Validate dataset version if provided."""
+        if version is not None:
+            semver.validate(version)
+
+    def _validate_update_version(self, update_version: Optional[str]) -> None:
+        """Ensure update_version is one of: major, minor, patch."""
+        allowed = ["major", "minor", "patch"]
+        if update_version not in allowed:
+            raise ValueError(f"update_version must be one of {allowed}")
+
+    def _get_or_create_project(self, namespace: str, project_name: str) -> Project:
+        """Get project or raise if creation not allowed."""
         try:
-
+            return self.session.catalog.metastore.get_project(
                 project_name,
-
+                namespace,
                 create=is_studio(),
             )
         except ProjectNotFoundError as e:
-            # not being able to create it as creation is not allowed
             raise ProjectCreateNotAllowedError("Creating project is not allowed") from e
 
-
+    def _resolve_checkpoint(
+        self,
+        name: str,
+        project: Project,
+        kwargs: dict,
+    ) -> tuple[Optional[Job], Optional[str], Optional["DataChain"]]:
+        """Check if checkpoint exists and return cached dataset if possible."""
+        from .datasets import read_dataset
 
-
-        if self.delta and name:
-            from datachain.delta import delta_retry_update
+        metastore = self.session.catalog.metastore
 
-
-
+        job_id = os.getenv("DATACHAIN_JOB_ID")
+        checkpoints_reset = env2bool("DATACHAIN_CHECKPOINTS_RESET", undefined=True)
 
-
-
-
-
-
-
-
-
-
+        if not job_id:
+            return None, None, None
+
+        job = metastore.get_job(job_id)
+        if not job:
+            raise JobNotFoundError(f"Job with id {job_id} not found")
+
+        _hash = self._calculate_job_hash(job.id)
+
+        if (
+            job.parent_job_id
+            and not checkpoints_reset
+            and metastore.find_checkpoint(job.parent_job_id, _hash)
+        ):
+            # checkpoint found → reuse dataset
+            chain = read_dataset(
+                name, namespace=project.namespace.name, project=project.name, **kwargs
             )
+            return job, _hash, chain
 
-
-        return self._evolve(
-            query=result_ds._query.save(
-                name=name,
-                version=version,
-                project=project,
-                feature_schema=schema,
-                dependencies=dependencies,
-                **kwargs,
-            )
-        )
+        return job, _hash, None
 
-
-
-
-
-
-
+    def _handle_delta(
+        self,
+        name: str,
+        version: Optional[str],
+        project: Project,
+        schema: dict,
+        kwargs: dict,
+    ) -> Optional["DataChain"]:
+        """Try to save as a delta dataset.
+        Returns:
+            A DataChain if delta logic could handle it, otherwise None to fall back
+            to the regular save path (e.g., on first dataset creation).
+        """
+        from datachain.delta import delta_retry_update
 
-
-            name, namespace=namespace_name, project=project_name, **kwargs
-        )
+        from .datasets import read_dataset
 
-
-
-
-
-
-
-
-
-
-
-
+        if not self.delta or not name:
+            return None
+
+        assert self._delta_on is not None, "Delta chain must have delta_on defined"
+
+        result_ds, dependencies, has_changes = delta_retry_update(
+            self,
+            project.namespace.name,
+            project.name,
+            name,
+            on=self._delta_on,
+            right_on=self._delta_result_on,
+            compare=self._delta_compare,
+            delta_retry=self._delta_retry,
         )
 
-
-
-
-
-
-
+        # Case 1: delta produced a new dataset
+        if result_ds:
+            return self._evolve(
+                query=result_ds._query.save(
+                    name=name,
+                    version=version,
+                    project=project,
+                    feature_schema=schema,
+                    dependencies=dependencies,
+                    **kwargs,
+                )
             )
 
-
+        # Case 2: no changes → reuse last version
+        if not has_changes:
+            # sources have not been changed so new version of resulting dataset
+            # would be the same as previous one. To avoid duplicating exact
+            # datasets, we won't create new version of it and we will return
+            # current latest version instead.
+            return read_dataset(
+                name,
+                namespace=project.namespace.name,
+                project=project.name,
+                **kwargs,
+            )
+
+        # Case 3: first creation of dataset
+        return None
 
     def apply(self, func, *args, **kwargs):
         """Apply any function to the chain.
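Taken together, _resolve_checkpoint only reuses a saved dataset when the current job has a parent_job_id, DATACHAIN_CHECKPOINTS_RESET is not set, and the parent job recorded a checkpoint with the same cumulative hash. A minimal sketch of that decision, with an in-memory set standing in for the metastore lookup:

from typing import Optional

# (job_id, cumulative_hash) pairs recorded by an earlier run
parent_checkpoints = {("job-1", "abc123")}


def resolve(parent_job_id: Optional[str], step_hash: str, reset: bool) -> str:
    if parent_job_id and not reset and (parent_job_id, step_hash) in parent_checkpoints:
        return "reuse saved dataset"
    return "recompute step"


print(resolve("job-1", "abc123", reset=False))  # reuse saved dataset
print(resolve("job-1", "abc123", reset=True))   # recompute step (checkpoints reset)
print(resolve(None, "abc123", reset=False))     # recompute step (fresh job, no parent)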
@@ -1875,12 +1963,15 @@
         self,
         flatten: bool = False,
         include_hidden: bool = True,
+        as_object: bool = False,
     ) -> "pd.DataFrame":
         """Return a pandas DataFrame from the chain.
 
         Parameters:
             flatten: Whether to use a multiindex or flatten column names.
             include_hidden: Whether to include hidden columns.
+            as_object: Whether to emit a dataframe backed by Python objects
+                rather than pandas-inferred dtypes.
 
         Returns:
             pd.DataFrame: A pandas DataFrame representation of the chain.
@@ -1896,6 +1987,9 @@
         columns = pd.MultiIndex.from_tuples(map(tuple, headers))
 
         results = self.results(include_hidden=include_hidden)
+        if as_object:
+            df = pd.DataFrame(results, columns=columns, dtype=object)
+            return df.where(pd.notna(df), None)
         return pd.DataFrame.from_records(results, columns=columns)
 
     def show(
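The dtype=object branch exists because pandas otherwise coerces mixed or nullable columns during type inference: an int column containing None becomes float64 with NaN, which is misleading when printing rows. A small demonstration with toy data:

import pandas as pd

rows = [(1, "a"), (None, "b")]

coerced = pd.DataFrame.from_records(rows, columns=["num", "txt"])
print(coerced.dtypes["num"])  # float64 -- the 1 silently became 1.0

as_object = pd.DataFrame(rows, columns=["num", "txt"], dtype=object)
as_object = as_object.where(pd.notna(as_object), None)
print(as_object.dtypes["num"])  # object -- 1 stays an int, None stays None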
@@ -1918,7 +2012,11 @@
         import pandas as pd
 
         dc = self.limit(limit) if limit > 0 else self  # type: ignore[misc]
-        df = dc.to_pandas(
+        df = dc.to_pandas(
+            flatten,
+            include_hidden=include_hidden,
+            as_object=True,
+        )
 
         if df.empty:
             print("Empty result")
{datachain-0.33.1.dist-info → datachain-0.34.1.dist-info}/RECORD
CHANGED
@@ -6,9 +6,9 @@ datachain/checkpoint.py,sha256=Ar6SnnDMN3fr5ZZm3Xpdbj2f9buhqeApad-B1Lyrr4Y,1152
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
 datachain/dataset.py,sha256=eX7xGa3EUpAccBZWpkgDmYV6_FjGuhjkMLFHpjl6lVI,25256
 datachain/delta.py,sha256=X5Lw6GQ8MAYNl2YIExNvl0tPIkylQEWwnCw0We7NtHM,10693
-datachain/error.py,sha256=
+datachain/error.py,sha256=P_5KXlfVIsW4E42JJCoFhGsgvY8la-6jXBEWbHbgqKo,1846
 datachain/hash_utils.py,sha256=tgyXlz1m0gsS3UkIxdb0fxtNfVsbO2-YrELtyGV5XYE,4515
-datachain/job.py,sha256=
+datachain/job.py,sha256=WDkZrr4Je50nngRDaRapNpGpx_50L6wYWmAqcMT_yCw,1367
 datachain/listing.py,sha256=aqayl5St3D9PwdwM6nR1STkpLSw-S3U8pudO9PWi3N8,7241
 datachain/namespace.py,sha256=sgIF90KEaC_VlMFivDIJiFz8RUsTftMxW4kOUTyxo3A,2356
 datachain/node.py,sha256=KWDT0ClYXB7FYI-QOvzAa-UDkLJErUI2eWm5FBteYuU,5577
@@ -23,7 +23,7 @@ datachain/studio.py,sha256=IS8o4BZnhUo73Bd8m4CJxFc5utdmh2miIs25WswkFBA,15283
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=yW-Df5R6npqcqlNZMlBRBwyhUFmXpl9sQipPmy9HfQU,15797
 datachain/catalog/__init__.py,sha256=9NBaywvAOaXdkyqiHjbBEiXs7JImR1OJsY9r8D5Q16g,403
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=a1AN6eDHWWzII1wi46T_1JvTsW1AeMudwR_6sVQ4f7I,67588
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
 datachain/catalog/loader.py,sha256=53VnuSRkt_CO9RdlHWkzQsPF55qMxcXvEm3ecsZREw8,6150
 datachain/cli/__init__.py,sha256=so3WxEQF03KdGvjav15Sw7a6-lriiE24uDSGbBDBp8o,8298
@@ -52,7 +52,7 @@ datachain/client/s3.py,sha256=6DNVGLg-woPS1DVlYVX2rIlunNblsuxyOnI1rSzhW3k,7515
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
 datachain/data_storage/job.py,sha256=NGFhXg0C0zRFTaF6ccjXZJT4xI4_gUr1WcxTLK6WYDE,448
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=9Wd0MfdVrdpgvFXOddUvyz61MnoRDipv0-A38aRsqzw,61021
 datachain/data_storage/schema.py,sha256=DmxxXjNIsXib9gj5jcrb1CVjGzHf7HZLOehs1RmuiMA,9891
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=Z6KlFk7hWoXBbjzxfk2NuIBecqP86AJzp5iEE2W4yw0,30603
@@ -107,7 +107,7 @@ datachain/lib/convert/values_to_tuples.py,sha256=j5yZMrVUH6W7b-7yUvdCTGI7JCUAYUO
 datachain/lib/dc/__init__.py,sha256=UrUzmDH6YyVl8fxM5iXTSFtl5DZTUzEYm1MaazK4vdQ,900
 datachain/lib/dc/csv.py,sha256=wUsDPpLD4lts92yn0gejZHqTv8qQBbv8JYRwiIepj0o,4471
 datachain/lib/dc/database.py,sha256=sTpos1rE4BS5BTzzixykhWIO2JxVYKH1GTRncdpu4dU,14716
-datachain/lib/dc/datachain.py,sha256=
+datachain/lib/dc/datachain.py,sha256=Xh7Hwpvow_3QHPhsPSpP99HDKlwcJOpZEZJUNa_Ex9c,104396
 datachain/lib/dc/datasets.py,sha256=pVRcrVEPVPHMf8sLqqhjXbilB3QuUqKE-byvZ-XlJNE,15347
 datachain/lib/dc/hf.py,sha256=B7pubDQTDmth9uILXyhpQNtOAT3UOLjR-peU__tpypk,2884
 datachain/lib/dc/json.py,sha256=-vJ-pUpp2JxK4_vOfznE09FIoEOrvCwoIZSLxM6pjmY,2742
@@ -164,9 +164,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.34.1.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.34.1.dist-info/METADATA,sha256=x6vwqoDfsyj5T08GdAT7Qs13lv9uIonatPaxr_nPQ5Y,13655
+datachain-0.34.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+datachain-0.34.1.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.34.1.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.34.1.dist-info/RECORD,,