datachain 0.2.11__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of datachain might be problematic.

Files changed (46)
  1. datachain/__init__.py +3 -4
  2. datachain/cache.py +10 -4
  3. datachain/catalog/catalog.py +35 -15
  4. datachain/cli.py +37 -32
  5. datachain/data_storage/metastore.py +24 -0
  6. datachain/data_storage/warehouse.py +3 -1
  7. datachain/job.py +56 -0
  8. datachain/lib/arrow.py +19 -7
  9. datachain/lib/clip.py +89 -66
  10. datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
  11. datachain/lib/convert/sql_to_python.py +23 -0
  12. datachain/lib/convert/values_to_tuples.py +51 -33
  13. datachain/lib/data_model.py +6 -27
  14. datachain/lib/dataset_info.py +70 -0
  15. datachain/lib/dc.py +618 -156
  16. datachain/lib/file.py +117 -15
  17. datachain/lib/image.py +1 -1
  18. datachain/lib/meta_formats.py +14 -2
  19. datachain/lib/model_store.py +3 -2
  20. datachain/lib/pytorch.py +10 -7
  21. datachain/lib/signal_schema.py +19 -11
  22. datachain/lib/text.py +2 -1
  23. datachain/lib/udf.py +56 -5
  24. datachain/lib/udf_signature.py +1 -1
  25. datachain/node.py +11 -8
  26. datachain/query/dataset.py +52 -26
  27. datachain/query/schema.py +2 -0
  28. datachain/query/session.py +4 -4
  29. datachain/sql/functions/array.py +12 -0
  30. datachain/sql/functions/string.py +8 -0
  31. datachain/torch/__init__.py +1 -1
  32. datachain/utils.py +6 -0
  33. datachain-0.2.12.dist-info/METADATA +412 -0
  34. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/RECORD +38 -42
  35. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
  36. datachain/lib/gpt4_vision.py +0 -97
  37. datachain/lib/hf_image_to_text.py +0 -97
  38. datachain/lib/hf_pipeline.py +0 -90
  39. datachain/lib/image_transform.py +0 -103
  40. datachain/lib/iptc_exif_xmp.py +0 -76
  41. datachain/lib/unstructured.py +0 -41
  42. datachain/text/__init__.py +0 -3
  43. datachain-0.2.11.dist-info/METADATA +0 -431
  44. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
  45. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
  46. {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
--- datachain/lib/hf_image_to_text.py
+++ /dev/null
@@ -1,97 +0,0 @@
-import numpy as np
-import torch
-from PIL import Image, ImageOps, UnidentifiedImageError
-from transformers import (
-    AutoProcessor,
-    Blip2ForConditionalGeneration,
-    Blip2Processor,
-    LlavaForConditionalGeneration,
-)
-
-from datachain.query import Object, udf
-from datachain.sql.types import String
-
-DEFAULT_FIT_BOX = (500, 500)
-
-
-def encode_image(raw):
-    try:
-        img = Image.open(raw)
-    except UnidentifiedImageError:
-        return None
-    img.load()
-    img = img.convert("RGB")
-    return ImageOps.fit(img, DEFAULT_FIT_BOX)
-
-
-def infer_dtype(device):
-    if device == "cpu":
-        return torch.float32
-    return torch.float16
-
-
-@udf(
-    params=(Object(encode_image),),  # Columns consumed by the UDF.
-    output={
-        "description": String,
-        "error": String,
-    },  # Signals being returned by the UDF.
-    batch=64,
-    method="describe",
-)
-class BLIP2describe:
-    def __init__(self, device="cpu", model="Salesforce/blip2-opt-2.7b", max_tokens=300):
-        self.torch_dtype = infer_dtype(device)
-        self.processor = Blip2Processor.from_pretrained(model)
-        self.model = Blip2ForConditionalGeneration.from_pretrained(
-            model, torch_dtype=self.torch_dtype
-        )
-        self.device = device
-        self.model.to(device)
-        self.max_tokens = max_tokens
-
-    def describe(self, imgs):
-        images = np.squeeze(np.asarray(imgs))
-        inputs = self.processor(images=images, return_tensors="pt").to(
-            self.device, self.torch_dtype
-        )
-
-        generated_ids = self.model.generate(**inputs, max_new_tokens=self.max_tokens)
-        generated_text = self.processor.batch_decode(
-            generated_ids, skip_special_tokens=True
-        )
-        return [(desc.strip(), "") for desc in generated_text]
-
-
-@udf(
-    params=(Object(encode_image),),  # Columns consumed by the UDF.
-    output={
-        "description": String,
-        "error": String,
-    },  # Signals being returned by the UDF.
-    batch=16,
-    method="describe",
-)
-class LLaVAdescribe:
-    def __init__(self, device="cpu", model="llava-hf/llava-1.5-7b-hf", max_tokens=300):
-        self.device = device
-        self.torch_dtype = infer_dtype(device)
-        self.processor = AutoProcessor.from_pretrained(model)
-        self.model = LlavaForConditionalGeneration.from_pretrained(
-            model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True
-        )
-        self.model.to(device)
-        self.max_tokens = max_tokens
-        self.prompt = "USER: <image>\nDescribe this picture\nASSISTANT:"
-
-    def describe(self, imgs):
-        images = np.squeeze(np.asarray(imgs))
-        inputs = self.processor(
-            text=[self.prompt] * len(imgs), images=images, return_tensors="pt"
-        ).to(self.device, self.torch_dtype)
-
-        generated_ids = self.model.generate(**inputs, max_new_tokens=self.max_tokens)
-        generated_text = self.processor.batch_decode(
-            generated_ids, skip_special_tokens=True
-        )
-        return [(desc.split("ASSISTANT:")[-1].strip(), "") for desc in generated_text]
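
The removed helpers above wrapped standard transformers generation loops in UDF classes. A minimal standalone sketch of the BLIP-2 path, using the same default model and token budget as the deleted code (the input file example.jpg is a hypothetical placeholder):

    import torch
    from PIL import Image
    from transformers import Blip2ForConditionalGeneration, Blip2Processor

    device = "cpu"  # the deleted defaults: CPU and float32
    model_name = "Salesforce/blip2-opt-2.7b"
    processor = Blip2Processor.from_pretrained(model_name)
    model = Blip2ForConditionalGeneration.from_pretrained(
        model_name, torch_dtype=torch.float32
    ).to(device)

    image = Image.open("example.jpg").convert("RGB")  # hypothetical input file
    inputs = processor(images=image, return_tensors="pt").to(device, torch.float32)
    generated_ids = model.generate(**inputs, max_new_tokens=300)
    print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())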
--- datachain/lib/hf_pipeline.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import json
-
-from PIL import (
-    Image,
-    UnidentifiedImageError,
-)
-from transformers import pipeline
-
-from datachain.query import Object, udf
-from datachain.sql.types import JSON, String
-
-
-def read_image(raw):
-    try:
-        img = Image.open(raw)
-    except UnidentifiedImageError:
-        return None
-    img.load()
-    return img.convert("RGB")
-
-
-def read_object(raw):
-    return raw.read()
-
-
-def read_text(raw):
-    return read_object(raw).decode("utf-8")
-
-
-@udf(
-    params=(Object(read_image),),  # Columns consumed by the UDF.
-    output={
-        "model_output": JSON,
-        "error": String,
-    },  # Signals being returned by the UDF.
-    method="image_processor",
-)
-class ImageHelper:
-    def __init__(self, model, device, **kwargs):
-        self.helper = pipeline(model=model, device=device)
-        self.kwargs = kwargs
-
-    def image_processor(self, imgs):
-        result = self.helper(
-            imgs,
-            **self.kwargs,
-        )
-        return (json.dumps(result), "")
-
-
-@udf(
-    params=(Object(read_text),),  # Columns consumed by the UDF.
-    output={
-        "model_output": JSON,
-        "error": String,
-    },  # Signals being returned by the UDF.
-    method="text_processor",
-)
-class TextHelper:
-    def __init__(self, model, device, **kwargs):
-        self.helper = pipeline(model=model, device=device)
-        self.kwargs = kwargs
-
-    def text_processor(self, text):
-        result = self.helper(
-            text,
-            **self.kwargs,
-        )
-        return (json.dumps(result), "")
-
-
-@udf(
-    params=(Object(read_object),),  # Columns consumed by the UDF.
-    output={
-        "model_output": JSON,
-        "error": String,
-    },  # Signals being returned by the UDF.
-    method="raw_processor",
-)
-class RawHelper:
-    def __init__(self, model, device, **kwargs):
-        self.helper = pipeline(model=model, device=device)
-        self.kwargs = kwargs
-
-    def raw_processor(self, obj):
-        result = self.helper(
-            obj,
-            **self.kwargs,
-        )
-        return (json.dumps(result), "")
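
The three removed helpers above were thin wrappers around transformers.pipeline that serialized the model output to JSON. A minimal sketch of the same call outside a UDF; the task and model name are illustrative assumptions, not taken from the deleted code:

    import json

    from PIL import Image
    from transformers import pipeline

    # Task and model are illustrative; device=-1 selects the CPU.
    helper = pipeline("image-classification", model="google/vit-base-patch16-224", device=-1)
    result = helper(Image.open("example.jpg").convert("RGB"))  # hypothetical input file
    print(json.dumps(result))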
--- datachain/lib/image_transform.py
+++ /dev/null
@@ -1,103 +0,0 @@
-import os
-
-import fsspec
-from PIL import Image
-
-from datachain.catalog import get_catalog
-from datachain.query import DatasetRow, Object, udf
-
-
-def load_image(raw):
-    img = Image.open(raw)
-    img.load()
-    return img
-
-
-@udf(
-    output=DatasetRow.schema,
-    params=(Object(load_image), *tuple(DatasetRow.schema.keys())),
-)
-class ImageTransform:
-    def __init__(
-        self,
-        *,
-        image_filter,
-        bucket_name,
-        prefix,
-        output_folder,
-        file_prefix="",
-        vtype="",
-    ):
-        # Once we fix the UDF decorator situation, it would make more sense to put this
-        # into a child class and make apply_filter an abstractmethod.
-        self.image_filter = image_filter
-        self.folder_name = output_folder
-        self.file_prefix = file_prefix
-        self.prefix = prefix
-        self.vtype = vtype
-
-        catalog = get_catalog()
-        self.client, _ = catalog.parse_url(os.path.join(self.prefix, bucket_name))
-
-    def apply_filter(self, image):
-        return image.filter(self.image_filter)
-
-    def save(self, image, source, name, format):
-        # Make name for new image
-        new_name = f"{self.file_prefix}{name}"
-
-        # Do writeback
-        blob_name = os.path.join(self.folder_name, new_name)
-        urlpath = os.path.join(source, blob_name)
-        cloud_file = fsspec.open(urlpath=urlpath, mode="wb")
-        with cloud_file as fp:
-            image.save(fp, format=format)
-
-        # Get the blob info
-        info_ = self.client.fs.info(urlpath)
-        info = self.client.convert_info(info_, self.folder_name)
-        info.name = new_name
-        return info
-
-    def __call__(
-        self,
-        image,
-        *args,
-    ):
-        # Build a dict from row contents
-        record = dict(zip(DatasetRow.schema.keys(), args))
-        record["is_latest"] = record["is_latest"] > 0  # needs to be a bool
-
-        # yield same row back
-        yield DatasetRow.create(**record)
-
-        # Don't apply the filter twice
-        if record["parent"] == self.folder_name:
-            return
-
-        # Apply the filter
-        image_b = self.apply_filter(image)
-
-        # Save the image and get the cloud object info
-        entry = self.save(
-            image_b, record["source"], name=record["name"], format=image.format
-        )
-
-        # Build the new row
-        yield DatasetRow.create(
-            name=entry.name,
-            source=record["source"],
-            parent=self.folder_name,
-            size=entry.size,
-            location=record["name"]
-            if not record["parent"]
-            else f"{record['parent']}/{record['name']}",
-            vtype=self.vtype,
-            dir_type=record["dir_type"],
-            owner_name=entry.owner_name,
-            owner_id=entry.owner_id,
-            is_latest=record["is_latest"],
-            last_modified=entry.last_modified,
-            version=entry.version,
-            etag=entry.etag,
-        )
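
At its core, the removed transform above filtered an image with PIL and wrote the result back through fsspec. A minimal sketch of that filter-and-writeback step; the BLUR filter and the local destination URL are illustrative assumptions:

    import fsspec
    from PIL import Image, ImageFilter

    img = Image.open("example.jpg")  # hypothetical input file
    filtered = img.filter(ImageFilter.BLUR)  # any PIL filter works here

    # fsspec resolves the URL scheme, so the same call works for cloud storage.
    with fsspec.open("file:///tmp/blur-example.jpg", mode="wb") as fp:
        filtered.save(fp, format=img.format or "JPEG")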
--- datachain/lib/iptc_exif_xmp.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import json
-
-from PIL import (
-    ExifTags,
-    Image,
-    IptcImagePlugin,
-    TiffImagePlugin,
-    UnidentifiedImageError,
-)
-
-from datachain.query import Object, udf
-from datachain.sql.types import JSON, String
-
-
-def encode_image(raw):
-    try:
-        img = Image.open(raw)
-    except UnidentifiedImageError:
-        return None
-    return img
-
-
-@udf(
-    params=(Object(encode_image),),  # Columns consumed by the UDF.
-    output={
-        "xmp": JSON,
-        "exif": JSON,
-        "iptc": JSON,
-        "error": String,
-    },  # Signals being returned by the UDF.
-    method="image_description",
-)
-class GetMetadata:
-    def cast(self, v):  # to JSON serializable types
-        if isinstance(v, TiffImagePlugin.IFDRational):
-            return float(v)
-        if isinstance(v, tuple):
-            return tuple(self.cast(t) for t in v)
-        if isinstance(v, bytes):
-            return v.decode(encoding="utf-8", errors="ignore")
-        if isinstance(v, dict):
-            for kk, vv in v.items():
-                v[kk] = self.cast(vv)
-            return v
-        if isinstance(v, list):
-            return [self.cast(kk) for kk in v]
-        return v
-
-    def image_description(self, img):
-        (xmp, exif, iptc) = ({}, {}, {})
-        if img is None:
-            error = "Image format not understood"
-            return ({}, {}, {}, error)
-        error = ""
-        xmp = img.getxmp()
-        img_exif = img.getexif()
-        img_iptc = IptcImagePlugin.getiptcinfo(img)
-
-        if img_iptc:
-            for k, v in img_iptc.items():
-                iptc[str(k)] = self.cast(v)
-
-        if img_exif:
-            for k, v in img_exif.items():
-                v = self.cast(v)
-                if k in ExifTags.TAGS:
-                    exif[ExifTags.TAGS[k]] = v
-                if k in ExifTags.GPSTAGS:
-                    exif[ExifTags.GPSTAGS[k]] = v
-
-        return (
-            json.dumps(xmp),
-            json.dumps(exif),
-            json.dumps(iptc),
-            error,
-        )
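
The removed UDF above is plain Pillow metadata extraction plus JSON coercion. A minimal sketch of the same extraction on a single file; the input path is hypothetical, and str() stands in for the fuller cast logic above:

    import json

    from PIL import ExifTags, Image, IptcImagePlugin

    img = Image.open("example.jpg")  # hypothetical input file
    exif = {
        ExifTags.TAGS[tag]: str(value)
        for tag, value in img.getexif().items()
        if tag in ExifTags.TAGS
    }
    iptc = IptcImagePlugin.getiptcinfo(img) or {}
    print(json.dumps({"exif": exif, "iptc": {str(k): str(v) for k, v in iptc.items()}}))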
--- datachain/lib/unstructured.py
+++ /dev/null
@@ -1,41 +0,0 @@
-import shutil
-import tempfile
-
-from unstructured.partition.auto import partition
-from unstructured.staging.base import convert_to_dataframe
-
-from datachain.lib.udf import Mapper
-from datachain.query import Stream
-from datachain.sql.types import JSON, String
-
-
-class PartitionObject(Mapper):
-    def __init__(self):
-        super().__init__(
-            [
-                Stream(),
-            ],
-            {
-                "elements": JSON,
-                "title": String,
-                "text": String,
-                "error": String,
-            },
-        )
-
-    def encode_object(self, raw):
-        fname = str(raw).replace(">", "").replace("<", "")
-        output = tempfile.TemporaryFile()
-        shutil.copyfileobj(raw, output)
-        elements = partition(file=output, metadata_filename=fname)
-        output.close()
-        return elements
-
-    def __call__(self, stream):
-        with stream:
-            elements = self.encode_object(stream)
-
-        title = str(elements[0])
-        text = "\n\n".join([str(el) for el in elements])
-        df = convert_to_dataframe(elements)
-        return (df.to_json(), title, text, "")
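
The removed Mapper above boiled down to unstructured's partition step plus dataframe staging. A minimal sketch against a local file, assuming the same unstructured APIs the deleted module imported (the input path is hypothetical):

    from unstructured.partition.auto import partition
    from unstructured.staging.base import convert_to_dataframe

    elements = partition(filename="example.pdf")  # hypothetical input file
    title = str(elements[0])
    text = "\n\n".join(str(el) for el in elements)
    df = convert_to_dataframe(elements)
    print(title, len(text), df.shape)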
--- datachain/text/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from datachain.lib.text import convert_text
-
-__all__ = ["convert_text"]