datachain 0.7.11__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

datachain/lib/dc.py CHANGED
@@ -11,7 +11,6 @@ from typing import (
     BinaryIO,
     Callable,
     ClassVar,
-    Literal,
     Optional,
     TypeVar,
     Union,
@@ -24,8 +23,6 @@ from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
 
-from datachain.client import Client
-from datachain.client.local import FileClient
 from datachain.dataset import DatasetRecord
 from datachain.func.base import Function
 from datachain.func.func import Func
@@ -33,15 +30,11 @@ from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
 from datachain.lib.data_model import DataModel, DataType, DataValue, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
-from datachain.lib.file import ArrowRow, File, get_file_type
+from datachain.lib.file import ArrowRow, File, FileType, get_file_type
 from datachain.lib.file import ExportPlacement as FileExportPlacement
-from datachain.lib.listing import (
-    list_bucket,
-    ls,
-    parse_listing_uri,
-)
+from datachain.lib.listing import get_listing, list_bucket, ls
 from datachain.lib.listing_info import ListingInfo
-from datachain.lib.meta_formats import read_meta, read_schema
+from datachain.lib.meta_formats import read_meta
 from datachain.lib.model_store import ModelStore
 from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
@@ -403,53 +396,12 @@ class DataChain:
         self.signals_schema |= signals_schema
         return self
 
-    @classmethod
-    def parse_uri(
-        cls, uri: str, session: Session, update: bool = False
-    ) -> tuple[str, str, str, bool]:
-        """Returns correct listing dataset name that must be used for saving listing
-        operation. It takes into account existing listings and reusability of those.
-        It also returns boolean saying if returned dataset name is reused / already
-        exists or not, and it returns correct listing path that should be used to find
-        rows based on uri.
-        """
-        catalog = session.catalog
-        cache = catalog.cache
-        client_config = catalog.client_config
-
-        client = Client.get_client(uri, cache, **client_config)
-        ds_name, list_uri, list_path = parse_listing_uri(uri, cache, client_config)
-        listing = None
-
-        listings = [
-            ls
-            for ls in catalog.listings()
-            if not ls.is_expired and ls.contains(ds_name)
-        ]
-
-        if listings:
-            if update:
-                # choosing the smallest possible one to minimize update time
-                listing = sorted(listings, key=lambda ls: len(ls.name))[0]
-            else:
-                # no need to update, choosing the most recent one
-                listing = sorted(listings, key=lambda ls: ls.created_at)[-1]
-
-        if isinstance(client, FileClient) and listing and listing.name != ds_name:
-            # For local file system we need to fix listing path / prefix
-            # if we are reusing existing listing
-            list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
-
-        ds_name = listing.name if listing else ds_name
-
-        return ds_name, list_uri, list_path, bool(listing)
-
     @classmethod
     def from_storage(
         cls,
         uri,
         *,
-        type: Literal["binary", "text", "image"] = "binary",
+        type: FileType = "binary",
         session: Optional[Session] = None,
         settings: Optional[dict] = None,
         in_memory: bool = False,
@@ -482,7 +434,7 @@ class DataChain:
         cache = session.catalog.cache
         client_config = session.catalog.client_config
 
-        list_ds_name, list_uri, list_path, list_ds_exists = cls.parse_uri(
+        list_ds_name, list_uri, list_path, list_ds_exists = get_listing(
             uri, session, update=update
         )
 
@@ -548,14 +500,13 @@ class DataChain:
     def from_json(
         cls,
         path,
-        type: Literal["binary", "text", "image"] = "text",
+        type: FileType = "text",
         spec: Optional[DataType] = None,
         schema_from: Optional[str] = "auto",
         jmespath: Optional[str] = None,
         object_name: Optional[str] = "",
        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "json",
+        format: Optional[str] = "json",
         nrows=None,
         **kwargs,
     ) -> "DataChain":
@@ -564,12 +515,12 @@ class DataChain:
         Parameters:
             path : storage URI with directory. URI must start with storage prefix such
                 as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
+            type : read file as "binary", "text", or "image" data. Default is "text".
             spec : optional Data Model
             schema_from : path to sample to infer spec (if schema not provided)
             object_name : generated object column name
             model_name : optional generated model name
-            print_schema : print auto-generated schema
+            format : "json" or "jsonl"
             jmespath : optional JMESPATH expression to reduce JSON
             nrows : optional row limit for jsonl and JSON arrays
 
@@ -594,80 +545,21 @@ class DataChain:
         if (not object_name) and jmespath:
             object_name = jmespath_to_name(jmespath)
         if not object_name:
-            object_name = meta_type
-        chain = DataChain.from_storage(uri=path, type=type, **kwargs)
-        signal_dict = {
-            object_name: read_meta(
-                schema_from=schema_from,
-                meta_type=meta_type,
-                spec=spec,
-                model_name=model_name,
-                print_schema=print_schema,
-                jmespath=jmespath,
-                nrows=nrows,
-            )
-        }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
-
-    @classmethod
-    def from_jsonl(
-        cls,
-        path,
-        type: Literal["binary", "text", "image"] = "text",
-        spec: Optional[DataType] = None,
-        schema_from: Optional[str] = "auto",
-        jmespath: Optional[str] = None,
-        object_name: Optional[str] = "",
-        model_name: Optional[str] = None,
-        print_schema: Optional[bool] = False,
-        meta_type: Optional[str] = "jsonl",
-        nrows=None,
-        **kwargs,
-    ) -> "DataChain":
-        """Get data from JSON lines. It returns the chain itself.
-
-        Parameters:
-            path : storage URI with directory. URI must start with storage prefix such
-                as `s3://`, `gs://`, `az://` or "file:///"
-            type : read file as "binary", "text", or "image" data. Default is "binary".
-            spec : optional Data Model
-            schema_from : path to sample to infer spec (if schema not provided)
-            object_name : generated object column name
-            model_name : optional generated model name
-            print_schema : print auto-generated schema
-            jmespath : optional JMESPATH expression to reduce JSON
-            nrows : optional row limit for jsonl and JSON arrays
-
-        Example:
-            infer JSONl schema from data, limit parsing to 1 row
-            ```py
-            chain = DataChain.from_jsonl("gs://myjsonl", nrows=1)
-            ```
-        """
-        if schema_from == "auto":
-            schema_from = path
-
-        def jmespath_to_name(s: str):
-            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
-            return s[:name_end]
-
-        if (not object_name) and jmespath:
-            object_name = jmespath_to_name(jmespath)
-        if not object_name:
-            object_name = meta_type
+            object_name = format
         chain = DataChain.from_storage(uri=path, type=type, **kwargs)
         signal_dict = {
             object_name: read_meta(
                 schema_from=schema_from,
-                meta_type=meta_type,
+                format=format,
                 spec=spec,
                 model_name=model_name,
-                print_schema=print_schema,
                 jmespath=jmespath,
                 nrows=nrows,
             )
         }
-        return chain.gen(**signal_dict)  # type: ignore[misc, arg-type]
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return chain.settings(**settings).gen(**signal_dict)  # type: ignore[misc, arg-type]
 
     def explode(
         self,
@@ -793,47 +685,6 @@ class DataChain:
             **{object_name: catalog.listings()},  # type: ignore[arg-type]
         )
 
-    def print_json_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-
-        Example:
-            print JSON schema and save to column "meta_from":
-            ```py
-            uri = "gs://datachain-demo/coco2017/annotations_captions/"
-            chain = DataChain.from_storage(uri)
-            chain = chain.print_json_schema()
-            chain.save()
-            ```
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="json", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
-    def print_jsonl_schema(  # type: ignore[override]
-        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
-    ) -> "Self":
-        """Print JSON data model and save it. It returns the chain itself.
-
-        Parameters:
-            jmespath : JMESPATH expression to reduce JSON
-            model_name : generated model name
-        """
-        return self.map(
-            meta_schema=lambda file: read_schema(
-                file, data_type="jsonl", expr=jmespath, model_name=model_name
-            ),
-            output=str,
-        )
-
     def save(  # type: ignore[override]
         self, name: Optional[str] = None, version: Optional[int] = None, **kwargs
     ) -> "Self":
@@ -1624,6 +1475,155 @@ class DataChain:
         )
         return self._evolve(query=self._query.subtract(other._query, signals))  # type: ignore[arg-type]
 
+    def compare(
+        self,
+        other: "DataChain",
+        on: Union[str, Sequence[str]],
+        right_on: Optional[Union[str, Sequence[str]]] = None,
+        compare: Optional[Union[str, Sequence[str]]] = None,
+        right_compare: Optional[Union[str, Sequence[str]]] = None,
+        added: bool = True,
+        deleted: bool = True,
+        modified: bool = True,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Compare two chains by identifying rows that are added, deleted,
+        modified or same. The result is a new chain with an additional column
+        whose possible values are `A`, `D`, `M`, `S`, representing added,
+        deleted, modified and same rows respectively. Note that if only one
+        "status" is requested, by setting the proper flags, this additional
+        column is not created, as it would have the same value for all rows.
+        Besides the additional diff column, the new chain has the schema of the
+        chain on which the method was called.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: Column or list of columns to match on. If both chains have the
+                same columns then this column is enough for the match. Otherwise,
+                the `right_on` parameter has to specify the columns for the other
+                chain. This value is used to find the corresponding row in the
+                other dataset. If not found there, the row is considered added
+                (or deleted, vice versa); if found, the row is either modified
+                or same.
+            right_on: Optional column or list of columns
+                for the `other` to match.
+            compare: Column or list of columns to compare on. If both chains have
+                the same columns then this column is enough for the compare.
+                Otherwise, the `right_compare` parameter has to specify the
+                columns for the other chain. This value is used to decide if a
+                row is modified or same. If not set, all columns are used for
+                comparison.
+            right_compare: Optional column or list of columns
+                for the `other` to compare to.
+            added (bool): Whether to return added rows in the resulting chain.
+            deleted (bool): Whether to return deleted rows in the resulting chain.
+            modified (bool): Whether to return modified rows in the resulting chain.
+            same (bool): Whether to return unchanged rows in the resulting chain.
+            status_col (str): Name of the new column that is created in the
+                resulting chain representing the diff status.
+
+        Example:
+            ```py
+            diff = persons.compare(
+                new_persons,
+                on=["id"],
+                right_on=["other_id"],
+                compare=["name"],
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        from datachain.lib.diff import compare as chain_compare
+
+        return chain_compare(
+            self,
+            other,
+            on,
+            right_on=right_on,
+            compare=compare,
+            right_compare=right_compare,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
+    def diff(
+        self,
+        other: "DataChain",
+        on: str = "file",
+        right_on: Optional[str] = None,
+        added: bool = True,
+        modified: bool = True,
+        deleted: bool = False,
+        same: bool = False,
+        status_col: Optional[str] = None,
+    ) -> "DataChain":
+        """Similar to `.compare()`, which is a more generic method for
+        calculating the difference between two chains. Unlike `.compare()`, this
+        method works only on chains that contain a `File` object, or one of its
+        derivatives. The file `source` and `path` are used for matching, and the
+        file `version` and `etag` for comparing, while in `.compare()` the user
+        needs to provide arbitrary columns for matching and comparing.
+
+        Parameters:
+            other: Chain to calculate diff from.
+            on: File signal to match on. If both chains have the
+                same file signal then this column is enough for the match.
+                Otherwise, the `right_on` parameter has to specify the file
+                signal for the other chain. This value is used to find the
+                corresponding row in the other dataset. If not found there, the
+                row is considered added (or deleted, vice versa); if found, the
+                row is either modified or same.
+            right_on: Optional file signal for the `other` to match.
+            added (bool): Whether to return added rows in the resulting chain.
+            deleted (bool): Whether to return deleted rows in the resulting chain.
+            modified (bool): Whether to return modified rows in the resulting chain.
+            same (bool): Whether to return unchanged rows in the resulting chain.
+            status_col (str): Optional name of the new column that is created in
+                the resulting chain representing the diff status.
+
+        Example:
+            ```py
+            diff = images.diff(
+                new_images,
+                on="file",
+                right_on="other_file",
+                added=True,
+                deleted=True,
+                modified=True,
+                same=True,
+                status_col="diff"
+            )
+            ```
+        """
+        on_file_signals = ["source", "path"]
+        compare_file_signals = ["version", "etag"]
+
+        def get_file_signals(file: str, signals):
+            return [f"{file}.{c}" for c in signals]
+
+        right_on = right_on or on
+
+        on_cols = get_file_signals(on, on_file_signals)
+        right_on_cols = get_file_signals(right_on, on_file_signals)
+        compare_cols = get_file_signals(on, compare_file_signals)
+        right_compare_cols = get_file_signals(right_on, compare_file_signals)
+
+        return self.compare(
+            other,
+            on_cols,
+            right_on=right_on_cols,
+            compare=compare_cols,
+            right_compare=right_compare_cols,
+            added=added,
+            deleted=deleted,
+            modified=modified,
+            same=same,
+            status_col=status_col,
+        )
+
     @classmethod
     def from_values(
         cls,
@@ -1896,7 +1896,10 @@ class DataChain:
 
         if source:
             output = {"source": ArrowRow} | output  # type: ignore[assignment,operator]
-        return self.gen(
+
+        # disable prefetch if nrows is set
+        settings = {"prefetch": 0} if nrows else {}
+        return self.settings(**settings).gen(  # type: ignore[arg-type]
             ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
         )
 
@@ -1978,8 +1981,6 @@ class DataChain:
             else:
                 msg = f"error parsing csv - incompatible output type {type(output)}"
                 raise DatasetPrepareError(chain.name, msg)
-        elif nrows:
-            nrows += 1
 
         parse_options = ParseOptions(delimiter=delimiter)
         read_options = ReadOptions(column_names=column_names)
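
The dc.py changes above fold `from_jsonl()` into `from_json()` behind the new `format` parameter (replacing `meta_type` and dropping `print_schema`), and replace the removed `DataChain.parse_uri()` classmethod with `get_listing()` from `datachain.lib.listing`. A minimal migration sketch, assuming a hypothetical bucket URI and an existing `session` object:

```py
from datachain import DataChain
from datachain.lib.listing import get_listing

# 0.7.x: DataChain.from_jsonl("gs://mybucket/meta/", nrows=1)
chain = DataChain.from_json("gs://mybucket/meta/", format="jsonl", nrows=1)

# 0.7.x: ds_name, uri, path, exists = DataChain.parse_uri(uri, session)
ds_name, list_uri, list_path, exists = get_listing("gs://mybucket/meta/", session)
```
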
datachain/lib/diff.py ADDED
@@ -0,0 +1,197 @@
+import random
+import string
+from collections.abc import Sequence
+from typing import TYPE_CHECKING, Optional, Union
+
+import sqlalchemy as sa
+
+from datachain.lib.signal_schema import SignalSchema
+from datachain.query.schema import Column
+from datachain.sql.types import String
+
+if TYPE_CHECKING:
+    from datachain.lib.dc import DataChain
+
+
+C = Column
+
+
+def compare(  # noqa: PLR0912, PLR0915, C901
+    left: "DataChain",
+    right: "DataChain",
+    on: Union[str, Sequence[str]],
+    right_on: Optional[Union[str, Sequence[str]]] = None,
+    compare: Optional[Union[str, Sequence[str]]] = None,
+    right_compare: Optional[Union[str, Sequence[str]]] = None,
+    added: bool = True,
+    deleted: bool = True,
+    modified: bool = True,
+    same: bool = True,
+    status_col: Optional[str] = None,
+) -> "DataChain":
+    """Compare two chains by identifying rows that are added, deleted, modified
+    or same"""
+    dialect = left._query.dialect
+
+    rname = "right_"
+
+    def _rprefix(c: str, rc: str) -> str:
+        """Returns the prefix of the right column of two companion left/right
+        columns from a merge. If the companion columns have the same name, the
+        prefix will be present in the right column name; otherwise it won't.
+        """
+        return rname if c == rc else ""
+
+    def _to_list(obj: Union[str, Sequence[str]]) -> list[str]:
+        return [obj] if isinstance(obj, str) else list(obj)
+
+    if on is None:
+        raise ValueError("'on' must be specified")
+
+    on = _to_list(on)
+    if right_on:
+        right_on = _to_list(right_on)
+        if len(on) != len(right_on):
+            raise ValueError("'on' and 'right_on' must have the same length")
+
+    if compare:
+        compare = _to_list(compare)
+
+    if right_compare:
+        if not compare:
+            raise ValueError("'compare' must be defined if 'right_compare' is defined")
+
+        right_compare = _to_list(right_compare)
+        if len(compare) != len(right_compare):
+            raise ValueError(
+                "'compare' and 'right_compare' must have the same length"
+            )
+
+    if not any([added, deleted, modified, same]):
+        raise ValueError(
+            "At least one of added, deleted, modified, same flags must be set"
+        )
+
+    # we still need status column for internal implementation even if not
+    # needed in output
+    need_status_col = bool(status_col)
+    status_col = status_col or "diff_" + "".join(
+        random.choice(string.ascii_letters)  # noqa: S311
+        for _ in range(10)
+    )
+
+    # calculate on and compare column names
+    right_on = right_on or on
+    cols = left.signals_schema.clone_without_sys_signals().db_signals()
+    right_cols = right.signals_schema.clone_without_sys_signals().db_signals()
+
+    on = left.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
+    right_on = right.signals_schema.resolve(*right_on).db_signals()  # type: ignore[assignment]
+    if compare:
+        right_compare = right_compare or compare
+        compare = left.signals_schema.resolve(*compare).db_signals()  # type: ignore[assignment]
+        right_compare = right.signals_schema.resolve(*right_compare).db_signals()  # type: ignore[assignment]
+    elif not compare and len(cols) != len(right_cols):
+        # here we will mark all rows that are not added or deleted as modified since
+        # there was no explicit list of compare columns provided (meaning we need
+        # to check all columns to determine if row is modified or same), but
+        # the number of columns on left and right is not the same (one of the chains
+        # has additional columns)
+        compare = None
+        right_compare = None
+    else:
+        compare = [c for c in cols if c in right_cols]  # type: ignore[misc, assignment]
+        right_compare = compare
+
+    diff_cond = []
+
+    if added:
+        added_cond = sa.and_(
+            *[
+                C(c) == None  # noqa: E711
+                for c in [f"{_rprefix(c, rc)}{rc}" for c, rc in zip(on, right_on)]
+            ]
+        )
+        diff_cond.append((added_cond, "A"))
+    if modified and compare:
+        modified_cond = sa.or_(
+            *[
+                C(c) != C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((modified_cond, "M"))
+    if same and compare:
+        same_cond = sa.and_(
+            *[
+                C(c) == C(f"{_rprefix(c, rc)}{rc}")
+                for c, rc in zip(compare, right_compare)  # type: ignore[arg-type]
+            ]
+        )
+        diff_cond.append((same_cond, "S"))
+
+    diff = sa.case(*diff_cond, else_=None if compare else "M").label(status_col)
+    diff.type = String()
+
+    left_right_merge = left.merge(
+        right, on=on, right_on=right_on, inner=False, rname=rname
+    )
+    left_right_merge_select = left_right_merge._query.select(
+        *(
+            [C(c) for c in left_right_merge.signals_schema.db_signals("sys")]
+            + [C(c) for c in on]
+            + [C(c) for c in cols if c not in on]
+            + [diff]
+        )
+    )
+
+    diff_col = sa.literal("D").label(status_col)
+    diff_col.type = String()
+
+    right_left_merge = right.merge(
+        left, on=right_on, right_on=on, inner=False, rname=rname
+    ).filter(
+        sa.and_(
+            *[C(f"{_rprefix(c, rc)}{c}") == None for c, rc in zip(on, right_on)]  # noqa: E711
+        )
+    )
+
+    def _default_val(chain: "DataChain", col: str):
+        col_type = chain._query.column_types[col]  # type: ignore[index]
+        val = sa.literal(col_type.default_value(dialect)).label(col)
+        val.type = col_type()
+        return val
+
+    right_left_merge_select = right_left_merge._query.select(
+        *(
+            [C(c) for c in right_left_merge.signals_schema.db_signals("sys")]
+            + [
+                C(c) if c == rc else _default_val(left, c)
+                for c, rc in zip(on, right_on)
+            ]
+            + [
+                C(c) if c in right_cols else _default_val(left, c)  # type: ignore[arg-type]
+                for c in cols
+                if c not in on
+            ]
+            + [diff_col]
+        )
+    )
+
+    if not deleted:
+        res = left_right_merge_select
+    elif deleted and not any([added, modified, same]):
+        res = right_left_merge_select
+    else:
+        res = left_right_merge_select.union(right_left_merge_select)
+
+    res = res.filter(C(status_col) != None)  # noqa: E711
+
+    schema = left.signals_schema
+    if need_status_col:
+        res = res.select()
+        schema = SignalSchema({status_col: str}) | schema
+    else:
+        res = res.select_except(C(status_col))
+
+    return left._evolve(query=res, signal_schema=schema)
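
The new module builds the diff as a SQL `CASE` over two outer merges: left rows with no match on the right become `A`, right-only rows become `D`, and matched rows are compared column by column to get `M` or `S`. A small behavioral sketch, assuming made-up in-memory chains (the data is not from the source):

```py
from datachain import DataChain

old = DataChain.from_values(id=[1, 2, 3], name=["a", "b", "c"])
new = DataChain.from_values(id=[1, 2, 4], name=["a", "B", "d"])

# Comparing `new` against `old`, expected statuses: id=1 -> "S" (same),
# id=2 -> "M" (name changed), id=4 -> "A" (added), id=3 -> "D" (deleted).
res = new.compare(
    old,
    on="id",
    compare="name",
    deleted=True,
    same=True,
    status_col="diff",
)
```

Note that the implementation labels unchanged rows `S`, and falls back to marking every matched row `M` when no compare columns can be established (schemas differ and `compare` is not given).
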
datachain/lib/file.py CHANGED
@@ -39,6 +39,8 @@ logger = logging.getLogger("datachain")
 # how to create file path when exporting
 ExportPlacement = Literal["filename", "etag", "fullpath", "checksum"]
 
+FileType = Literal["binary", "text", "image"]
+
 
 class VFileError(DataChainError):
     def __init__(self, file: "File", message: str, vtype: str = ""):
@@ -470,7 +472,7 @@ class ArrowRow(DataModel):
         return record_batch.to_pylist()[0]
 
 
-def get_file_type(type_: Literal["binary", "text", "image"] = "binary") -> type[File]:
+def get_file_type(type_: FileType = "binary") -> type[File]:
     file: type[File] = File
     if type_ == "text":
         file = TextFile
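
The new `FileType` alias replaces the `Literal["binary", "text", "image"]` union previously spelled out at each call site in dc.py and file.py. A short sketch of reusing it from caller code; the `load` helper is hypothetical, not part of the library:

```py
from datachain import DataChain
from datachain.lib.file import FileType, get_file_type


def load(uri: str, file_type: FileType = "binary") -> DataChain:
    # get_file_type maps the literal to the matching File subclass
    # (e.g. TextFile for "text"), as shown in the hunk above
    print(get_file_type(file_type))
    return DataChain.from_storage(uri, type=file_type)
```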