datachain 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (49)
  1. datachain/__init__.py +3 -4
  2. datachain/cache.py +10 -4
  3. datachain/catalog/catalog.py +35 -15
  4. datachain/cli.py +37 -32
  5. datachain/data_storage/metastore.py +24 -0
  6. datachain/data_storage/warehouse.py +3 -1
  7. datachain/job.py +56 -0
  8. datachain/lib/arrow.py +19 -7
  9. datachain/lib/clip.py +89 -66
  10. datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
  11. datachain/lib/convert/sql_to_python.py +23 -0
  12. datachain/lib/convert/values_to_tuples.py +51 -33
  13. datachain/lib/data_model.py +6 -27
  14. datachain/lib/dataset_info.py +70 -0
  15. datachain/lib/dc.py +646 -152
  16. datachain/lib/file.py +117 -15
  17. datachain/lib/image.py +1 -1
  18. datachain/lib/meta_formats.py +14 -2
  19. datachain/lib/model_store.py +3 -2
  20. datachain/lib/pytorch.py +10 -7
  21. datachain/lib/signal_schema.py +39 -14
  22. datachain/lib/text.py +2 -1
  23. datachain/lib/udf.py +56 -5
  24. datachain/lib/udf_signature.py +1 -1
  25. datachain/lib/webdataset.py +4 -3
  26. datachain/node.py +11 -8
  27. datachain/query/dataset.py +66 -147
  28. datachain/query/dispatch.py +15 -13
  29. datachain/query/schema.py +2 -0
  30. datachain/query/session.py +4 -4
  31. datachain/sql/functions/array.py +12 -0
  32. datachain/sql/functions/string.py +8 -0
  33. datachain/torch/__init__.py +1 -1
  34. datachain/utils.py +45 -0
  35. datachain-0.2.12.dist-info/METADATA +412 -0
  36. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/RECORD +40 -45
  37. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
  38. datachain/lib/feature_registry.py +0 -77
  39. datachain/lib/gpt4_vision.py +0 -97
  40. datachain/lib/hf_image_to_text.py +0 -97
  41. datachain/lib/hf_pipeline.py +0 -90
  42. datachain/lib/image_transform.py +0 -103
  43. datachain/lib/iptc_exif_xmp.py +0 -76
  44. datachain/lib/unstructured.py +0 -41
  45. datachain/text/__init__.py +0 -3
  46. datachain-0.2.10.dist-info/METADATA +0 -430
  47. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
  48. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
  49. {datachain-0.2.10.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
datachain/lib/clip.py CHANGED
@@ -31,7 +31,7 @@ def _get_encoder(model: Any, type: Literal["image", "text"]) -> Callable:
     )


-def similarity_scores(
+def clip_similarity_scores(
     images: Union[None, "Image.Image", list["Image.Image"]],
     text: Union[None, str, list[str]],
     model: Any,
@@ -43,71 +43,91 @@ def similarity_scores(
     """
     Calculate CLIP similarity scores between one or more images and/or text.

-    Args:
-        images: Images to use as inputs.
-        text: Text to use as inputs.
-        model: Model from clip or open_clip packages.
-        preprocess: Image preprocessor to apply.
-        tokenizer: Text tokenizer.
-        prob: Compute softmax probabilities.
-        image_to_text: Whether to compute for image-to-text or text-to-image. Ignored if
-            only one of images or text provided.
-
-
-    Examples
-    --------
-
-    using https://github.com/openai/CLIP
-    >>> import clip
-    >>> model, preprocess = clip.load("ViT-B/32")
-    >>> similarity_scores(img, "cat", model, preprocess, clip.tokenize)
-    [[21.813]]
-
-    using https://github.com/mlfoundations/open_clip
-    >>> import open_clip
-    >>> model, _, preprocess = open_clip.create_model_and_transforms(
-    ...     "ViT-B-32", pretrained="laion2b_s34b_b79k"
-    ... )
-    >>> tokenizer = open_clip.get_tokenizer("ViT-B-32")
-    >>> similarity_scores(img, "cat", model, preprocess, tokenizer)
-    [[21.813]]
-
-    using https://huggingface.co/docs/transformers/en/model_doc/clip
-    >>> from transformers import CLIPProcessor, CLIPModel
-    >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-    >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
-    >>> scores = similarity_scores(
-    ...     img, "cat", model, processor.image_processor, processor.tokenizer
-    ... )
-    [[21.813]]
-
-    image -> list of text
-    >>> similarity_scores(img, ["cat", "dog"], model, preprocess, tokenizer)
-    [[21.813, 35.313]]
-
-    list of images -> text
-    >>> similarity_scores([img1, img2], "cat", model, preprocess, tokenizer)
-    [[21.813], [83.123]]
-
-    list of images -> list of text
-    >>> similarity_scores([img1, img2], ["cat", "dog"], model, preprocess, tokenizer)
-    [[21.813, 35.313], [83.123, 34.843]]
-
-    list of images -> list of images
-    >>> similarity_scores([img1, img2], None, model, preprocess, tokenizer)
-    [[94.189, 37.092]]
-
-    list of text -> list of text
-    >>> similarity_scores(None, ["cat", "dog"], model, preprocess, tokenizer)
-    [[67.334, 23.588]]
-
-    text -> list of images
-    >>> similarity_scores([img1, img2], "cat", ..., image_to_text=False)
-    [[19.708, 19.842]]
-
-    show scores as softmax probabilities
-    >>> similarity_scores(img, ["cat", "dog"], ..., prob=True)
-    [[0.423, 0.577]]
+    Parameters:
+        images : Images to use as inputs.
+        text : Text to use as inputs.
+        model : Model from clip or open_clip packages.
+        preprocess : Image preprocessor to apply.
+        tokenizer : Text tokenizer.
+        prob : Compute softmax probabilities.
+        image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
+            if only one of images or text provided.
+
+
+    Example:
+        Using https://github.com/openai/CLIP
+        ```py
+        >>> import clip
+        >>> model, preprocess = clip.load("ViT-B/32")
+        >>> similarity_scores(img, "cat", model, preprocess, clip.tokenize)
+        [[21.813]]
+        ```
+
+        Using https://github.com/mlfoundations/open_clip
+        ```py
+        >>> import open_clip
+        >>> model, _, preprocess = open_clip.create_model_and_transforms(
+        ...     "ViT-B-32", pretrained="laion2b_s34b_b79k"
+        ... )
+        >>> tokenizer = open_clip.get_tokenizer("ViT-B-32")
+        >>> similarity_scores(img, "cat", model, preprocess, tokenizer)
+        [[21.813]]
+        ```
+
+        Using https://huggingface.co/docs/transformers/en/model_doc/clip
+        ```py
+        >>> from transformers import CLIPProcessor, CLIPModel
+        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        >>> scores = similarity_scores(
+        ...     img, "cat", model, processor.image_processor, processor.tokenizer
+        ... )
+        [[21.813]]
+        ```
+
+        Image -> list of text
+        ```py
+        >>> similarity_scores(img, ["cat", "dog"], model, preprocess, tokenizer)
+        [[21.813, 35.313]]
+        ```
+
+        List of images -> text
+        ```py
+        >>> similarity_scores([img1, img2], "cat", model, preprocess, tokenizer)
+        [[21.813], [83.123]]
+        ```
+
+        List of images -> list of text
+        ```py
+        >>> similarity_scores(
+        ...     [img1, img2], ["cat", "dog"], model, preprocess, tokenizer
+        ... )
+        [[21.813, 35.313], [83.123, 34.843]]
+        ```
+
+        List of images -> list of images
+        ```py
+        >>> similarity_scores([img1, img2], None, model, preprocess, tokenizer)
+        [[94.189, 37.092]]
+        ```
+
+        List of text -> list of text
+        ```py
+        >>> similarity_scores(None, ["cat", "dog"], model, preprocess, tokenizer)
+        [[67.334, 23.588]]
+        ```
+
+        Text -> list of images
+        ```py
+        >>> similarity_scores([img1, img2], "cat", ..., image_to_text=False)
+        [[19.708, 19.842]]
+        ```
+
+        Show scores as softmax probabilities
+        ```py
+        >>> similarity_scores(img, ["cat", "dog"], ..., prob=True)
+        [[0.423, 0.577]]
+        ```
     """

     with torch.no_grad():
@@ -144,3 +164,6 @@ def similarity_scores(
         scores = logits

     return scores.tolist()
+
+
+similarity_scores = clip_similarity_scores
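The rename keeps a module-level alias, so existing imports of `similarity_scores` continue to work. A minimal sketch of both call paths, assuming `open_clip` and Pillow are installed and `cat.jpg` is a placeholder image path:

```py
# Minimal sketch of the rename: new name plus backward-compatible alias.
import open_clip
from PIL import Image

from datachain.lib.clip import clip_similarity_scores, similarity_scores

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")
img = Image.open("cat.jpg")  # placeholder path

# New canonical name.
scores = clip_similarity_scores(img, ["cat", "dog"], model, preprocess, tokenizer)

# Old name still resolves to the same function via the module-level alias.
assert similarity_scores is clip_similarity_scores
```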
datachain/lib/convert/{type_converter.py → python_to_sql.py} RENAMED
@@ -19,7 +19,7 @@ from datachain.sql.types import (
     String,
 )

-TYPE_TO_DATACHAIN = {
+PYTHON_TO_SQL = {
     int: Int64,
     str: String,
     Literal: String,
@@ -34,14 +34,14 @@ TYPE_TO_DATACHAIN = {
 }


-def convert_to_db_type(typ):  # noqa: PLR0911
+def python_to_sql(typ):  # noqa: PLR0911
     if inspect.isclass(typ):
         if issubclass(typ, SQLType):
             return typ
         if issubclass(typ, Enum):
             return str

-    res = TYPE_TO_DATACHAIN.get(typ)
+    res = PYTHON_TO_SQL.get(typ)
     if res:
         return res

@@ -59,19 +59,19 @@ def convert_to_db_type(typ):  # noqa: PLR0911
         if ModelStore.is_pydantic(args0):
             return Array(JSON())

-        next_type = convert_to_db_type(args0)
+        next_type = python_to_sql(args0)
         return Array(next_type)

     if orig is Annotated:
         # Ignoring annotations
-        return convert_to_db_type(args[0])
+        return python_to_sql(args[0])

     if inspect.isclass(orig) and issubclass(dict, orig):
         return JSON

     if orig == Union:
         if len(args) == 2 and (type(None) in args):
-            return convert_to_db_type(args[0])
+            return python_to_sql(args[0])

         if _is_json_inside_union(orig, args):
             return JSON
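The rename from `TYPE_TO_DATACHAIN`/`convert_to_db_type` to `PYTHON_TO_SQL`/`python_to_sql` is behavior-preserving. A hedged sketch of the branches visible above, assuming `Int64` and `String` are importable from `datachain.sql.types` as this file's import block suggests:

```py
from typing import Optional

from datachain.lib.convert.python_to_sql import python_to_sql
from datachain.sql.types import Int64, String

assert python_to_sql(int) is Int64             # direct lookup in PYTHON_TO_SQL
assert python_to_sql(str) is String
assert python_to_sql(Optional[str]) is String  # Union[X, None] unwraps to X
```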
datachain/lib/convert/sql_to_python.py ADDED
@@ -0,0 +1,23 @@
+from datetime import datetime
+from typing import Any
+
+from sqlalchemy import ARRAY, JSON, Boolean, DateTime, Float, Integer, String
+
+from datachain.data_storage.sqlite import Column
+
+SQL_TO_PYTHON = {
+    String: str,
+    Integer: int,
+    Float: float,
+    Boolean: bool,
+    DateTime: datetime,
+    ARRAY: list,
+    JSON: dict,
+}
+
+
+def sql_to_python(args_map: dict[str, Column]) -> dict[str, Any]:
+    return {
+        k: SQL_TO_PYTHON.get(type(v.type), str)  # type: ignore[union-attr]
+        for k, v in args_map.items()
+    }
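`sql_to_python` is the reverse lookup: it maps each named column's SQL type to a Python type, falling back to `str` for anything unrecognized. A hedged sketch using plain SQLAlchemy columns (real callers pass `datachain.data_storage.sqlite.Column` objects; the column names here are hypothetical):

```py
from sqlalchemy import Column, Float, Integer, String

from datachain.lib.convert.sql_to_python import sql_to_python

cols = {
    "name": Column("name", String),    # column.type becomes a String() instance
    "size": Column("size", Integer),
    "score": Column("score", Float),
}
# type(v.type) is looked up in SQL_TO_PYTHON; unknown types fall back to str.
assert sql_to_python(cols) == {"name": str, "size": int, "score": float}
```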
datachain/lib/convert/values_to_tuples.py CHANGED
@@ -9,41 +9,16 @@ class ValuesToTupleError(DataChainParamsError):
     def __init__(self, ds_name, msg):
         if ds_name:
             ds_name = f"' {ds_name}'"
-        super().__init__(f"Cannot convert features for dataset{ds_name}: {msg}")
+        super().__init__(f"Cannot convert signals for dataset{ds_name}: {msg}")


-def values_to_tuples(
+def values_to_tuples(  # noqa: C901, PLR0912
     ds_name: str = "",
     output: Union[None, DataType, Sequence[str], dict[str, DataType]] = None,
     **fr_map,
 ) -> tuple[Any, Any, Any]:
-    types_map = {}
-    length = -1
-    for k, v in fr_map.items():
-        if not isinstance(v, Sequence) or isinstance(v, str):
-            raise ValuesToTupleError(ds_name, f"features '{k}' is not a sequence")
-        len_ = len(v)
-
-        if len_ == 0:
-            raise ValuesToTupleError(ds_name, f"feature '{k}' is empty list")
-
-        if length < 0:
-            length = len_
-        elif length != len_:
-            raise ValuesToTupleError(
-                ds_name,
-                f"feature '{k}' should have length {length} while {len_} is given",
-            )
-        typ = type(v[0])
-        if not is_chain_type(typ):
-            raise ValuesToTupleError(
-                ds_name,
-                f"feature '{k}' has unsupported type '{typ.__name__}'."
-                f" Please use Feature types: {DataTypeNames}",
-            )
-        types_map[k] = typ
     if output:
-        if not isinstance(output, Sequence) and not isinstance(output, str):
+        if not isinstance(output, (Sequence, str, dict)):
             if len(fr_map) != 1:
                 raise ValuesToTupleError(
                     ds_name,
@@ -58,21 +33,64 @@
         key: str = next(iter(fr_map.keys()))
         output = {key: output}  # type: ignore[dict-item]

+        if not isinstance(output, dict):
+            raise ValuesToTupleError(
+                ds_name,
+                "output type must be dict[str, DataType] while "
+                f"'{type(output).__name__}' is given",
+            )
+
         if len(output) != len(fr_map):
             raise ValuesToTupleError(
                 ds_name,
                 f"number of outputs '{len(output)}' should match"
-                f" number of features '{len(fr_map)}'",
+                f" number of signals '{len(fr_map)}'",
             )
-    if isinstance(output, dict):
+
+    types_map = {}
+    length = -1
+    for k, v in fr_map.items():
+        if not isinstance(v, Sequence) or isinstance(v, str):
+            raise ValuesToTupleError(ds_name, f"signals '{k}' is not a sequence")
+        len_ = len(v)
+
+        if output:
+            if k not in output:  # type: ignore[operator]
+                raise ValuesToTupleError(
+                    ds_name,
+                    f"signal '{k}' is not present in the output",
+                )
+        else:
+            if len_ == 0:
+                raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
+
+            typ = type(v[0])
+            if not is_chain_type(typ):
+                raise ValuesToTupleError(
+                    ds_name,
+                    f"signal '{k}' has unsupported type '{typ.__name__}'."
+                    f" Please use DataModel types: {DataTypeNames}",
+                )
+            types_map[k] = typ
+
+        if length < 0:
+            length = len_
+        elif length != len_:
             raise ValuesToTupleError(
                 ds_name,
-                "output type must be dict[str, FeatureType] while "
-                f"'{type(output).__name__}' is given",
+                f"signal '{k}' should have length {length} while {len_} is given",
             )
-    else:
+
+    if not output:
         output = types_map  # type: ignore[assignment]

+    if not output:
+        raise ValuesToTupleError(
+            ds_name,
+            "output type must be dict[str, DataType] while empty is given"
+            " and no signals are provided",
+        )
+
     output_types: list[type] = list(output.values())  # type: ignore[union-attr,call-arg,arg-type]
     if len(output) > 1:  # type: ignore[arg-type]
         tuple_type = tuple(output_types)
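The refactor reorders validation: `output` is reconciled with the keyword signals first, and the per-signal checks (sequence, non-empty, consistent length, supported type) now run in a single pass. An illustrative sketch of the rules enforced above (dataset and signal names are made up):

```py
from datachain.lib.convert.values_to_tuples import (
    ValuesToTupleError,
    values_to_tuples,
)

# Equal-length sequences of supported types pass.
values_to_tuples("my-ds", num=[1, 2, 3], label=["a", "b", "c"])

# Mismatched lengths raise ValuesToTupleError:
# "signal 'label' should have length 3 while 2 is given".
try:
    values_to_tuples("my-ds", num=[1, 2, 3], label=["a", "b"])
except ValuesToTupleError as exc:
    print(exc)

# With an explicit output mapping, every signal must appear in it.
try:
    values_to_tuples(
        "my-ds",
        output={"num": int, "other": str},
        num=[1, 2],
        extra=[True, False],
    )
except ValuesToTupleError as exc:
    print(exc)  # signal 'extra' is not present in the output
```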
datachain/lib/data_model.py CHANGED
@@ -1,14 +1,11 @@
 from collections.abc import Sequence
 from datetime import datetime
-from typing import TYPE_CHECKING, ClassVar, Union, get_args, get_origin
+from typing import ClassVar, Union, get_args, get_origin

 from pydantic import BaseModel

 from datachain.lib.model_store import ModelStore

-if TYPE_CHECKING:
-    from datachain.catalog import Catalog
-
 StandardType = Union[
     type[int],
     type[str],
@@ -24,18 +21,14 @@ DataTypeNames = "BaseModel, int, str, float, bool, list, dict, bytes, datetime"


 class DataModel(BaseModel):
-    _version: ClassVar[int] = 1
+    """Pydantic model wrapper that registers model with `DataChain`."""

-    def get_value(self):
-        """Getting value from data. It's used in conjunction with method that operate
-        with raw data such as to_pytorch(). In contrast to method that operated with
-        data structures such as pydantic"""
-        return
+    _version: ClassVar[int] = 1

     @classmethod
     def __pydantic_init_subclass__(cls):
         """It automatically registers every declared DataModel child class."""
-        ModelStore.add(cls)
+        ModelStore.register(cls)

     @staticmethod
     def register(models: Union[DataType, Sequence[DataType]]):
@@ -44,25 +37,11 @@ class DataModel(BaseModel):
         if not isinstance(models, Sequence):
             models = [models]
         for val in models:
-            ModelStore.add(val)
-
-
-class FileBasic(DataModel):
-    def _set_stream(self, catalog: "Catalog", caching_enabled: bool = False) -> None:
-        pass
-
-    def open(self):
-        raise NotImplementedError
-
-    def read(self):
-        with self.open() as stream:
-            return stream.read()
-
-    def get_value(self):
-        return self.read()
+            ModelStore.register(val)


 def is_chain_type(t: type) -> bool:
+    """Return true if type is supported by `DataChain`."""
     if ModelStore.is_pydantic(t):
         return True
     if any(t is ft or t is get_args(ft)[0] for ft in get_args(StandardType)):
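With `FileBasic` and `get_value` removed, `DataModel` is now a thin Pydantic wrapper whose main job is registration: subclasses register themselves via `__pydantic_init_subclass__`, and plain Pydantic models can be registered explicitly. A small sketch (class names are illustrative):

```py
from pydantic import BaseModel

from datachain.lib.data_model import DataModel

class Pose(DataModel):
    # Auto-registered with ModelStore via __pydantic_init_subclass__.
    x: float
    y: float

class Meta(BaseModel):
    # Plain Pydantic models are registered explicitly.
    author: str

DataModel.register(Meta)
```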
datachain/lib/dataset_info.py ADDED
@@ -0,0 +1,70 @@
+import json
+from datetime import datetime
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+from pydantic import Field, field_validator
+
+from datachain.dataset import DatasetRecord, DatasetStatus, DatasetVersion
+from datachain.job import Job
+from datachain.lib.data_model import DataModel
+from datachain.utils import TIME_ZERO
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+
+class DatasetInfo(DataModel):
+    name: str
+    version: int = Field(default=1)
+    status: int = Field(default=DatasetStatus.CREATED)
+    created_at: datetime = Field(default=TIME_ZERO)
+    finished_at: Optional[datetime] = Field(default=None)
+    num_objects: Optional[int] = Field(default=None)
+    size: Optional[int] = Field(default=None)
+    params: dict[str, str] = Field(default=dict)
+    metrics: dict[str, Any] = Field(default=dict)
+
+    @staticmethod
+    def _validate_dict(
+        v: Optional[Union[str, dict]],
+    ) -> dict:
+        if v is None or v == "":
+            return {}
+        if isinstance(v, str):
+            try:
+                return json.loads(v)
+            except Exception as e:  # noqa: BLE001
+                raise ValueError(
+                    f"Unable to convert string '{v}' to dict for Dataset feature: {e}"
+                ) from None
+        return v
+
+    # Workaround for empty JSONs converted to empty strings in some DBs.
+    @field_validator("params", mode="before")
+    @classmethod
+    def validate_location(cls, v):
+        return cls._validate_dict(v)
+
+    @field_validator("metrics", mode="before")
+    @classmethod
+    def validate_metrics(cls, v):
+        return cls._validate_dict(v)
+
+    @classmethod
+    def from_models(
+        cls,
+        dataset: DatasetRecord,
+        version: DatasetVersion,
+        job: Optional[Job],
+    ) -> "Self":
+        return cls(
+            name=dataset.name,
+            version=version.version,
+            status=version.status,
+            created_at=version.created_at,
+            finished_at=version.finished_at,
+            num_objects=version.num_objects,
+            size=version.size,
+            params=job.params if job else {},
+            metrics=job.metrics if job else {},
+        )