datachain 0.7.11__py3-none-any.whl → 0.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/catalog/catalog.py +56 -45
- datachain/cli.py +25 -3
- datachain/client/gcs.py +9 -0
- datachain/data_storage/sqlite.py +20 -6
- datachain/data_storage/warehouse.py +0 -1
- datachain/lib/arrow.py +82 -58
- datachain/lib/dc.py +167 -166
- datachain/lib/diff.py +197 -0
- datachain/lib/file.py +3 -1
- datachain/lib/listing.py +44 -0
- datachain/lib/meta_formats.py +38 -42
- datachain/lib/udf.py +0 -1
- datachain/query/batch.py +32 -6
- datachain/query/dataset.py +18 -17
- datachain/query/dispatch.py +125 -125
- datachain/query/session.py +8 -5
- datachain/query/udf.py +20 -0
- datachain/query/utils.py +42 -0
- datachain/remote/studio.py +53 -1
- datachain/studio.py +47 -2
- datachain/utils.py +1 -1
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/METADATA +4 -3
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/RECORD +27 -24
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/LICENSE +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/WHEEL +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/entry_points.txt +0 -0
- {datachain-0.7.11.dist-info → datachain-0.8.1.dist-info}/top_level.txt +0 -0
datachain/lib/dc.py
CHANGED
@@ -11,7 +11,6 @@ from typing import (
     BinaryIO,
     Callable,
     ClassVar,
-    Literal,
     Optional,
     TypeVar,
     Union,
@@ -24,8 +23,6 @@ from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
 
-from datachain.client import Client
-from datachain.client.local import FileClient
 from datachain.dataset import DatasetRecord
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -33,15 +30,11 @@ from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import ArrowRow, File, get_file_type
+from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import (
-    list_bucket,
-    ls,
-    parse_listing_uri,
-)
+from datachain.lib.listing import get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta, read_schema
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -403,53 +396,12 @@ class DataChain:
         self.signals_schema |= signals_schema
         return self
 
-    @classmethod
-    def parse_uri(
-        cls, uri: str, session: Session, update: bool = False
-    ) -> tuple[str, str, str, bool]:
-        """Returns correct listing dataset name that must be used for saving listing
-        operation. It takes into account existing listings and reusability of those.
-        It also returns boolean saying if returned dataset name is reused / already
-        exists or not, and it returns correct listing path that should be used to find
-        rows based on uri.
-        """
-        catalog = session.catalog
-        cache = catalog.cache
-        client_config = catalog.client_config
-
-        client = Client.get_client(uri, cache, **client_config)
-        ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
-        listing = None
-
-        listings = [
-            ls
-            for ls in catalog.listings()
-            if not ls.is_expired and ls.contains(ds_name)
-        ]
-
-        if listings:
-            if update:
-                # choosing the smallest possible one to minimize update time
-                listing = sorted(listings, key=lambda ls: len(ls.name))[0]
-            else:
-                # no need to update, choosing the most recent one
-                listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
-
-        if isinstance(client, FileClient) and listing and listing.name != ds_name:
-            # For local file system we need to fix listing path / prefix
-            # if we are reusing existing listing
-            list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
-
-        ds_name = listing.name if listing else ds_name
-
-        return ds_name, list_uri, list_path, bool(listing)
-
     @classmethod
     def from_storage(
         cls,
         uri,
         *,
-        type: Literal["binary", "text", "image"] = "binary",
+        type: FileType = "binary",
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
         in_memory: bool = False,
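`parse_uri` leaves `DataChain`: its listing-reuse logic now lives behind `get_listing` in `datachain/lib/listing.py` (the new call site is in the next hunk), and `type` is narrowed to the `FileType` alias introduced in `datachain/lib/file.py`. A minimal sketch of the updated entry point, with a placeholder bucket URI:

```py
from datachain.lib.dc import DataChain

# "binary" stays the default; "text" and "image" are the other FileType values
chain = DataChain.from_storage("gs://mybucket/docs/", type="text")
```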
@@ -482,7 +434,7 @@ class DataChain:
         cache = session.catalog.cache
         client_config = session.catalog.client_config
 
-        list_ds_name, list_uri, list_path, list_ds_exists = cls.parse_uri(
+        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
             uri, session, update=update
         )
 
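Judging from this call site, `get_listing` keeps the removed method's contract: it returns the listing dataset name, the listing URI, the path used to locate rows, and whether an existing listing dataset is being reused. A sketch of the call, with `uri` and `session` assumed to be in scope:

```py
from datachain.lib.listing import get_listing

# signature inferred from the hunk above; update=True would force a re-listing
list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
    uri, session, update=False
)
```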
@@ -548,14 +500,13 @@ class DataChain:
     def from_json(
         cls,
         path,
-        type: Literal["binary", "text", "image"] = "text",
+        type: FileType = "text",
         spec: Optional[DataType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
         model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "json",
+        format: Optional[str] = "json",
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -564,12 +515,12 @@ class DataChain:
         Parameters:
             path : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
+            type : read file as "binary", "text", or "image" data. Default is "text".
             spec : optional Data Model
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-            print_schema : print auto-generated schema
+            format: "json", "jsonl"
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays
 
@@ -594,80 +545,21 @@ class DataChain:
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name = meta_type
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                meta_type=meta_type,
-                spec=spec,
-                model_name=model_name,
-                print_schema=print_schema,
-                jmespath=jmespath,
-                nrows=nrows,
-            )
-        }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-    @classmethod
-    def from_jsonl(
-        cls,
-        path,
-        type: Literal["binary", "text", "image"] = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "jsonl",
-        nrows=None,
-        **kwargs,
-    ) -> "DataChain":
-        """Get data from JSON lines. It returns the chain itself.
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            print_schema : print auto-generated schema
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSONl schema from data, limit parsing to 1 row
-            ```py
-            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = path
-
-        def jmespath_to_name(s: str):
-            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-            return s[:name_end]
-
-        if (not object_name) and jmespath:
-            object_name = jmespath_to_name(jmespath)
-        if not object_name:
-            object_name = meta_type
+            object_name = format
         chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
+                format=format,
                 spec=spec,
                 model_name=model_name,
-                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
         }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]
 
     def explode(
         self,
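The net effect of the three hunks above: `from_jsonl` is folded into `from_json`, `meta_type` is renamed to `format`, `print_schema` is dropped, and prefetch is disabled whenever `nrows` caps parsing. A sketch adapted from the removed `from_jsonl` docstring example (placeholder URI):

```py
from datachain.lib.dc import DataChain

# infer the JSON-lines schema from data, limit parsing to 1 row
chain = DataChain.from_json("gs://myjsonl", format="jsonl", nrows=1)
```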
@@ -793,47 +685,6 @@ class DataChain:
             **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )
 
-    def print_json_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-
-        Example:
-            print JSON schema and save to column "meta_from":
-            ```py
-            uri = "gs://datachain-demo/coco2017/annotations_captions/"
-            chain = DataChain.from_storage(uri)
-            chain = chain.print_json_schema()
-            chain.save()
-            ```
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="json", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
-    def print_jsonl_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="jsonl", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
@@ -1624,6 +1475,155 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
+    def compare(
+        self,
+        other: "DataChain",
+        on: Union[str, Sequence[str]],
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Comparing two chains by identifying rows that are added, deleted, modified
+        or same. Result is the new chain that has additional column with possible
+        values: `A`, `D`, `M`, `U` representing added, deleted, modified and same
+        rows respectively. Note that if only one "status" is asked, by setting proper
+        flags, this additional column is not created as it would have only one value
+        for all rows. Beside additional diff column, new chain has schema of the chain
+        on which method was called.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the columns for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare. Otherwise,
+                `right_compare` parameter has to specify the columns for the other
+                chain. This value is used to see if row is modified or same. If
+                not set, all columns will be used for comparison
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Name of the new column that is created in resulting chain
+                representing diff status.
+
+        Example:
+            ```py
+            diff = persons.diff(
+                new_persons,
+                on=["id"],
+                right_on=["other_id"],
+                compare=["name"],
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        from datachain.lib.diff import compare as chain_compare
+
+        return chain_compare(
+            self,
+            other,
+            on,
+            right_on=right_on,
+            compare=compare,
+            right_compare=right_compare,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        modified: bool = True,
+        deleted: bool = False,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar to `.compare()`, which is more generic method to calculate difference
+        between two chains. Unlike `.compare()`, this method works only on those chains
+        that have `File` object, or it's derivatives, in it. File `source` and `path`
+        are used for matching, and file `version` and `etag` for comparing, while in
+        `.compare()` user needs to provide arbitrary columns for matching and comparing.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match. Otherwise,
+                `right_on` parameter has to specify the file signal for the other chain.
+                This value is used to find corresponding row in other dataset. If not
+                found there, row is considered as added (or removed if vice versa), and
+                if found then row can be either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in resulting chain.
+            deleted (bool): Whether to return deleted rows in resulting chain.
+            modified (bool): Whether to return modified rows in resulting chain.
+            same (bool): Whether to return unchanged rows in resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                resulting chain representing diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,
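A usage sketch complementing the docstring examples: call `.compare()` directly and filter on the status column. The status letters follow the implementation in `datachain/lib/diff.py` below ("A" added, "D" deleted, "M" modified, "S" same); the chain names are hypothetical, and the top-level `C` column helper is assumed:

```py
from datachain import C

delta = persons.compare(new_persons, on="id", status_col="diff")
added_rows = delta.filter(C("diff") == "A")
```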
@@ -1896,7 +1896,10 @@ class DataChain:
 
         if source:
             output = {"source": ArrowRow} | output  # type: ignore[assignment,operator]
-        return self.gen(  # type: ignore[arg-type]
+
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return self.settings(**settings).gen(  # type: ignore[arg-type]
             ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
         )
 
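The same prefetch guard as in `from_json`: prefetching whole files is wasted work when `nrows` stops parsing early. Written out explicitly, the guard amounts to this sketch:

```py
# what the hunk does internally when a row limit is requested
chain = chain.settings(prefetch=0)
```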
@@ -1978,8 +1981,6 @@ class DataChain:
             else:
                 msg = f"error parsing csv - incompatible output type {type(output)}"
                 raise DatasetPrepareError(chain.name, msg)
-        elif nrows:
-            nrows += 1
 
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
datachain/lib/diff.py
ADDED
@@ -0,0 +1,197 @@
+import random
+import string
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union
+
+import sqlalchemy as sa
+
+from datachain.lib.signal_schema import SignalSchema
+from datachain.query.schema import Column
+from datachain.sql.types import String
+
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+
+C = Column
+
+
+def compare(  # noqa: PLR0912, PLR0915, C901
+    left: "DataChain",
+    right: "DataChain",
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    added: bool = True,
+    deleted: bool = True,
+    modified: bool = True,
+    same: bool = True,
+    status_col: Optional[str] = None,
+) -> "DataChain":
+    """Comparing two chains by identifying rows that are added, deleted, modified
+    or same"""
+    dialect = left._query.dialect
+
+    rname = "right_"
+
+    def _rprefix(c: str, rc: str) -> str:
+        """Returns prefix of right of two companion left - right columns
+        from merge. If companion columns have the same name then prefix will
+        be present in right column name, otherwise it won't.
+        """
+        return rname if c == rc else ""
+
+    def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
+        return [obj] if isinstance(obj, str) else list(obj)
+
+    if on is None:
+        raise ValueError("'on' must be specified")
+
+    on = _to_list(on)
+    if right_on:
+        right_on = _to_list(right_on)
+        if len(on) != len(right_on):
+            raise ValueError("'on' and 'right_on' must be have the same length")
+
+    if compare:
+        compare = _to_list(compare)
+
+    if right_compare:
+        if not compare:
+            raise ValueError("'compare' must be defined if 'right_compare' is defined")
+
+        right_compare = _to_list(right_compare)
+        if len(compare) != len(right_compare):
+            raise ValueError(
+                "'compare' and 'right_compare' must be have the same length"
+            )
+
+    if not any([added, deleted, modified, same]):
+        raise ValueError(
+            "At least one of added, deleted, modified, same flags must be set"
+        )
+
+    # we still need status column for internal implementation even if not
+    # needed in output
+    need_status_col = bool(status_col)
+    status_col = status_col or "diff_" + "".join(
+        random.choice(string.ascii_letters)  # noqa: S311
+        for _ in range(10)
+    )
+
+    # calculate on and compare column names
+    right_on = right_on or on
+    cols = left.signals_schema.clone_without_sys_signals().db_signals()
+    right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+
+    on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*right_on).db_signals()  # type: ignore[assignment]
+    if compare:
+        right_compare = right_compare or compare
+        compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
+        right_compare = right.signals_schema.resolve(*right_compare).db_signals()  # type: ignore[assignment]
+    elif not compare and len(cols) != len(right_cols):
+        # here we will mark all rows that are not added or deleted as modified since
+        # there was no explicit list of compare columns provided (meaning we need
+        # to check all columns to determine if row is modified or same), but
+        # the number of columns on left and right is not the same (one of the chains
+        # have additional column)
+        compare = None
+        right_compare = None
+    else:
+        compare = [c for c in cols if c in right_cols]  # type: ignore[misc, assignment]
+        right_compare = compare
+
+    diff_cond = []
+
+    if added:
+        added_cond = sa.and_(
+            *[
+                C(c) == None  # noqa: E711
+                for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
+            ]
+        )
+        diff_cond.append((added_cond, "A"))
+    if modified and compare:
+        modified_cond = sa.or_(
+            *[
+                C(c) != C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((modified_cond, "M"))
+    if same and compare:
+        same_cond = sa.and_(
+            *[
+                C(c) == C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((same_cond, "S"))
+
+    diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
+    diff.type = String()
+
+    left_right_merge = left.merge(
+        right, on=on, right_on=right_on, inner=False, rname=rname
+    )
+    left_right_merge_select = left_right_merge._query.select(
+        *(
+            [C(c) for c in left_right_merge.signals_schema.db_signals("sys")]
+            + [C(c) for c in on]
+            + [C(c) for c in cols if c not in on]
+            + [diff]
+        )
+    )
+
+    diff_col = sa.literal("D").label(status_col)
+    diff_col.type = String()
+
+    right_left_merge = right.merge(
+        left, on=right_on, right_on=on, inner=False, rname=rname
+    ).filter(
+        sa.and_(
+            *[C(f"{_rprefix(c, rc)}{c}") == None for c, rc in zip(on, right_on)]  # noqa: E711
+        )
+    )
+
+    def _default_val(chain: "DataChain", col: str):
+        col_type = chain._query.column_types[col]  # type: ignore[index]
+        val = sa.literal(col_type.default_value(dialect)).label(col)
+        val.type = col_type()
+        return val
+
+    right_left_merge_select = right_left_merge._query.select(
+        *(
+            [C(c) for c in right_left_merge.signals_schema.db_signals("sys")]
+            + [
+                C(c) if c == rc else _default_val(left, c)
+                for c, rc in zip(on, right_on)
+            ]
+            + [
+                C(c) if c in right_cols else _default_val(left, c)  # type: ignore[arg-type]
+                for c in cols
+                if c not in on
+            ]
+            + [diff_col]
+        )
+    )
+
+    if not deleted:
+        res = left_right_merge_select
+    elif deleted and not any([added, modified, same]):
+        res = right_left_merge_select
+    else:
+        res = left_right_merge_select.union(right_left_merge_select)
+
+    res = res.filter(C(status_col) != None)  # noqa: E711
+
+    schema = left.signals_schema
+    if need_status_col:
+        res = res.select()
+        schema = SignalSchema({status_col: str}) | schema
+    else:
+        res = res.select_except(C(status_col))
+
+    return left._evolve(query=res, signal_schema=schema)
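At its core, `compare` is a pair of outer merges plus one SQL `CASE` expression that assigns the status letter. A self-contained sketch of that pattern in plain SQLAlchemy, with hypothetical two-column tables, to make the `A`/`M`/`S` logic concrete (deleted rows, `D`, come from the mirrored right-to-left merge that gets unioned in):

```py
import sqlalchemy as sa

left = sa.table("left_chain", sa.column("id"), sa.column("name"))
right = sa.table("right_chain", sa.column("id"), sa.column("name"))

# left outer join: rows with no right-side match get NULLs on the right
joined = sa.outerjoin(left, right, left.c.id == right.c.id)
status = sa.case(
    (right.c.id == None, "A"),  # noqa: E711 - no match -> added
    (left.c.name != right.c.name, "M"),  # matched, values differ -> modified
    (left.c.name == right.c.name, "S"),  # matched, values equal -> same
).label("diff")
query = sa.select(left.c.id, left.c.name, status).select_from(joined)
print(query)  # renders the CASE WHEN ... THEN 'A' ... expression
```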
datachain/lib/file.py
CHANGED
@@ -39,6 +39,8 @@ logger = logging.getLogger("datachain")
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 
+FileType = Literal["binary", "text", "image"]
+
 
 class VFileError(DataChainError):
     def __init__(self, file: "File", message: str, vtype: str = ""):
@@ -470,7 +472,7 @@ class ArrowRow(DataModel):
         return record_batch.to_pylist()[0]
 
 
-def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
+def get_file_type(type_: FileType = "binary") -> type[File]:
     file: type[File] = File
     if type_ == "text":
        file = TextFile
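With the alias exported from `datachain/lib/file.py`, `get_file_type` maps each `FileType` literal to a concrete `File` subclass. A usage sketch (only the `"text"` branch is visible in this hunk; the default returns plain `File`):

```py
from datachain.lib.file import get_file_type

file_cls = get_file_type("text")  # TextFile, per the branch above
```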