datachain 0.2.11__py3-none-any.whl → 0.2.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of datachain might be problematic. Click here for more details.
- datachain/__init__.py +3 -4
- datachain/cache.py +10 -4
- datachain/catalog/catalog.py +35 -15
- datachain/cli.py +37 -32
- datachain/data_storage/metastore.py +24 -0
- datachain/data_storage/warehouse.py +3 -1
- datachain/job.py +56 -0
- datachain/lib/arrow.py +19 -7
- datachain/lib/clip.py +89 -66
- datachain/lib/convert/{type_converter.py → python_to_sql.py} +6 -6
- datachain/lib/convert/sql_to_python.py +23 -0
- datachain/lib/convert/values_to_tuples.py +51 -33
- datachain/lib/data_model.py +6 -27
- datachain/lib/dataset_info.py +70 -0
- datachain/lib/dc.py +618 -156
- datachain/lib/file.py +117 -15
- datachain/lib/image.py +1 -1
- datachain/lib/meta_formats.py +14 -2
- datachain/lib/model_store.py +3 -2
- datachain/lib/pytorch.py +10 -7
- datachain/lib/signal_schema.py +19 -11
- datachain/lib/text.py +2 -1
- datachain/lib/udf.py +56 -5
- datachain/lib/udf_signature.py +1 -1
- datachain/node.py +11 -8
- datachain/query/dataset.py +52 -26
- datachain/query/schema.py +2 -0
- datachain/query/session.py +4 -4
- datachain/sql/functions/array.py +12 -0
- datachain/sql/functions/string.py +8 -0
- datachain/torch/__init__.py +1 -1
- datachain/utils.py +6 -0
- datachain-0.2.12.dist-info/METADATA +412 -0
- {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/RECORD +38 -42
- {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/WHEEL +1 -1
- datachain/lib/gpt4_vision.py +0 -97
- datachain/lib/hf_image_to_text.py +0 -97
- datachain/lib/hf_pipeline.py +0 -90
- datachain/lib/image_transform.py +0 -103
- datachain/lib/iptc_exif_xmp.py +0 -76
- datachain/lib/unstructured.py +0 -41
- datachain/text/__init__.py +0 -3
- datachain-0.2.11.dist-info/METADATA +0 -431
- {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/LICENSE +0 -0
- {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/entry_points.txt +0 -0
- {datachain-0.2.11.dist-info → datachain-0.2.12.dist-info}/top_level.txt +0 -0
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
import torch
|
|
3
|
-
from PIL import Image, ImageOps, UnidentifiedImageError
|
|
4
|
-
from transformers import (
|
|
5
|
-
AutoProcessor,
|
|
6
|
-
Blip2ForConditionalGeneration,
|
|
7
|
-
Blip2Processor,
|
|
8
|
-
LlavaForConditionalGeneration,
|
|
9
|
-
)
|
|
10
|
-
|
|
11
|
-
from datachain.query import Object, udf
|
|
12
|
-
from datachain.sql.types import String
|
|
13
|
-
|
|
14
|
-
DEFAULT_FIT_BOX = (500, 500)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def encode_image(raw):
|
|
18
|
-
try:
|
|
19
|
-
img = Image.open(raw)
|
|
20
|
-
except UnidentifiedImageError:
|
|
21
|
-
return None
|
|
22
|
-
img.load()
|
|
23
|
-
img = img.convert("RGB")
|
|
24
|
-
return ImageOps.fit(img, DEFAULT_FIT_BOX)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
def infer_dtype(device):
|
|
28
|
-
if device == "cpu":
|
|
29
|
-
return torch.float32
|
|
30
|
-
return torch.float16
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
@udf(
|
|
34
|
-
params=(Object(encode_image),), # Columns consumed by the UDF.
|
|
35
|
-
output={
|
|
36
|
-
"description": String,
|
|
37
|
-
"error": String,
|
|
38
|
-
}, # Signals being returned by the UDF.
|
|
39
|
-
batch=64,
|
|
40
|
-
method="describe",
|
|
41
|
-
)
|
|
42
|
-
class BLIP2describe:
|
|
43
|
-
def __init__(self, device="cpu", model="Salesforce/blip2-opt-2.7b", max_tokens=300):
|
|
44
|
-
self.torch_dtype = infer_dtype(device)
|
|
45
|
-
self.processor = Blip2Processor.from_pretrained(model)
|
|
46
|
-
self.model = Blip2ForConditionalGeneration.from_pretrained(
|
|
47
|
-
model, torch_dtype=self.torch_dtype
|
|
48
|
-
)
|
|
49
|
-
self.device = device
|
|
50
|
-
self.model.to(device)
|
|
51
|
-
self.max_tokens = max_tokens
|
|
52
|
-
|
|
53
|
-
def describe(self, imgs):
|
|
54
|
-
images = np.squeeze(np.asarray(imgs))
|
|
55
|
-
inputs = self.processor(images=images, return_tensors="pt").to(
|
|
56
|
-
self.device, self.torch_dtype
|
|
57
|
-
)
|
|
58
|
-
|
|
59
|
-
generated_ids = self.model.generate(**inputs, max_new_tokens=self.max_tokens)
|
|
60
|
-
generated_text = self.processor.batch_decode(
|
|
61
|
-
generated_ids, skip_special_tokens=True
|
|
62
|
-
)
|
|
63
|
-
return [(desc.strip(), "") for desc in generated_text]
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
@udf(
|
|
67
|
-
params=(Object(encode_image),), # Columns consumed by the UDF.
|
|
68
|
-
output={
|
|
69
|
-
"description": String,
|
|
70
|
-
"error": String,
|
|
71
|
-
}, # Signals being returned by the UDF.
|
|
72
|
-
batch=16,
|
|
73
|
-
method="describe",
|
|
74
|
-
)
|
|
75
|
-
class LLaVAdescribe:
|
|
76
|
-
def __init__(self, device="cpu", model="llava-hf/llava-1.5-7b-hf", max_tokens=300):
|
|
77
|
-
self.device = device
|
|
78
|
-
self.torch_dtype = infer_dtype(device)
|
|
79
|
-
self.processor = AutoProcessor.from_pretrained(model)
|
|
80
|
-
self.model = LlavaForConditionalGeneration.from_pretrained(
|
|
81
|
-
model, torch_dtype=self.torch_dtype, low_cpu_mem_usage=True
|
|
82
|
-
)
|
|
83
|
-
self.model.to(device)
|
|
84
|
-
self.max_tokens = max_tokens
|
|
85
|
-
self.prompt = "USER: <image>\nDescribe this picture\nASSISTANT:"
|
|
86
|
-
|
|
87
|
-
def describe(self, imgs):
|
|
88
|
-
images = np.squeeze(np.asarray(imgs))
|
|
89
|
-
inputs = self.processor(
|
|
90
|
-
text=[self.prompt] * len(imgs), images=images, return_tensors="pt"
|
|
91
|
-
).to(self.device, self.torch_dtype)
|
|
92
|
-
|
|
93
|
-
generated_ids = self.model.generate(**inputs, max_new_tokens=self.max_tokens)
|
|
94
|
-
generated_text = self.processor.batch_decode(
|
|
95
|
-
generated_ids, skip_special_tokens=True
|
|
96
|
-
)
|
|
97
|
-
return [(desc.split("ASSISTANT:")[-1].strip(), "") for desc in generated_text]
|
datachain/lib/hf_pipeline.py
DELETED
|
@@ -1,90 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
|
|
3
|
-
from PIL import (
|
|
4
|
-
Image,
|
|
5
|
-
UnidentifiedImageError,
|
|
6
|
-
)
|
|
7
|
-
from transformers import pipeline
|
|
8
|
-
|
|
9
|
-
from datachain.query import Object, udf
|
|
10
|
-
from datachain.sql.types import JSON, String
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def read_image(raw):
|
|
14
|
-
try:
|
|
15
|
-
img = Image.open(raw)
|
|
16
|
-
except UnidentifiedImageError:
|
|
17
|
-
return None
|
|
18
|
-
img.load()
|
|
19
|
-
return img.convert("RGB")
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
def read_object(raw):
|
|
23
|
-
return raw.read()
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
def read_text(raw):
|
|
27
|
-
return read_object(raw).decode("utf-8")
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
@udf(
|
|
31
|
-
params=(Object(read_image),), # Columns consumed by the UDF.
|
|
32
|
-
output={
|
|
33
|
-
"model_output": JSON,
|
|
34
|
-
"error": String,
|
|
35
|
-
}, # Signals being returned by the UDF.
|
|
36
|
-
method="image_processor",
|
|
37
|
-
)
|
|
38
|
-
class ImageHelper:
|
|
39
|
-
def __init__(self, model, device, **kwargs):
|
|
40
|
-
self.helper = pipeline(model=model, device=device)
|
|
41
|
-
self.kwargs = kwargs
|
|
42
|
-
|
|
43
|
-
def image_processor(self, imgs):
|
|
44
|
-
result = self.helper(
|
|
45
|
-
imgs,
|
|
46
|
-
**self.kwargs,
|
|
47
|
-
)
|
|
48
|
-
return (json.dumps(result), "")
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
@udf(
|
|
52
|
-
params=(Object(read_text),), # Columns consumed by the UDF.
|
|
53
|
-
output={
|
|
54
|
-
"model_output": JSON,
|
|
55
|
-
"error": String,
|
|
56
|
-
}, # Signals being returned by the UDF.
|
|
57
|
-
method="text_processor",
|
|
58
|
-
)
|
|
59
|
-
class TextHelper:
|
|
60
|
-
def __init__(self, model, device, **kwargs):
|
|
61
|
-
self.helper = pipeline(model=model, device=device)
|
|
62
|
-
self.kwargs = kwargs
|
|
63
|
-
|
|
64
|
-
def text_processor(self, text):
|
|
65
|
-
result = self.helper(
|
|
66
|
-
text,
|
|
67
|
-
**self.kwargs,
|
|
68
|
-
)
|
|
69
|
-
return (json.dumps(result), "")
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
@udf(
|
|
73
|
-
params=(Object(read_object),), # Columns consumed by the UDF.
|
|
74
|
-
output={
|
|
75
|
-
"model_output": JSON,
|
|
76
|
-
"error": String,
|
|
77
|
-
}, # Signals being returned by the UDF.
|
|
78
|
-
method="raw_processor",
|
|
79
|
-
)
|
|
80
|
-
class RawHelper:
|
|
81
|
-
def __init__(self, model, device, **kwargs):
|
|
82
|
-
self.helper = pipeline(model=model, device=device)
|
|
83
|
-
self.kwargs = kwargs
|
|
84
|
-
|
|
85
|
-
def raw_processor(self, obj):
|
|
86
|
-
result = self.helper(
|
|
87
|
-
obj,
|
|
88
|
-
**self.kwargs,
|
|
89
|
-
)
|
|
90
|
-
return (json.dumps(result), "")
|
datachain/lib/image_transform.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
import fsspec
|
|
4
|
-
from PIL import Image
|
|
5
|
-
|
|
6
|
-
from datachain.catalog import get_catalog
|
|
7
|
-
from datachain.query import DatasetRow, Object, udf
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def load_image(raw):
|
|
11
|
-
img = Image.open(raw)
|
|
12
|
-
img.load()
|
|
13
|
-
return img
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
@udf(
|
|
17
|
-
output=DatasetRow.schema,
|
|
18
|
-
params=(Object(load_image), *tuple(DatasetRow.schema.keys())),
|
|
19
|
-
)
|
|
20
|
-
class ImageTransform:
|
|
21
|
-
def __init__(
|
|
22
|
-
self,
|
|
23
|
-
*,
|
|
24
|
-
image_filter,
|
|
25
|
-
bucket_name,
|
|
26
|
-
prefix,
|
|
27
|
-
output_folder,
|
|
28
|
-
file_prefix="",
|
|
29
|
-
vtype="",
|
|
30
|
-
):
|
|
31
|
-
# Once we fix the UDF decorator situation, it would make more sense to put this
|
|
32
|
-
# into a child class and make apply_filter an abstractmethod.
|
|
33
|
-
self.image_filter = image_filter
|
|
34
|
-
self.folder_name = output_folder
|
|
35
|
-
self.file_prefix = file_prefix
|
|
36
|
-
self.prefix = prefix
|
|
37
|
-
self.vtype = vtype
|
|
38
|
-
|
|
39
|
-
catalog = get_catalog()
|
|
40
|
-
self.client, _ = catalog.parse_url(os.path.join(self.prefix, bucket_name))
|
|
41
|
-
|
|
42
|
-
def apply_filter(self, image):
|
|
43
|
-
return image.filter(self.image_filter)
|
|
44
|
-
|
|
45
|
-
def save(self, image, source, name, format):
|
|
46
|
-
# Make name for new image
|
|
47
|
-
new_name = f"{self.file_prefix}{name}"
|
|
48
|
-
|
|
49
|
-
# Do writeback
|
|
50
|
-
blob_name = os.path.join(self.folder_name, new_name)
|
|
51
|
-
urlpath = os.path.join(source, blob_name)
|
|
52
|
-
cloud_file = fsspec.open(urlpath=urlpath, mode="wb")
|
|
53
|
-
with cloud_file as fp:
|
|
54
|
-
image.save(fp, format=format)
|
|
55
|
-
|
|
56
|
-
# Get the blob info
|
|
57
|
-
info_ = self.client.fs.info(urlpath)
|
|
58
|
-
info = self.client.convert_info(info_, self.folder_name)
|
|
59
|
-
info.name = new_name
|
|
60
|
-
return info
|
|
61
|
-
|
|
62
|
-
def __call__(
|
|
63
|
-
self,
|
|
64
|
-
image,
|
|
65
|
-
*args,
|
|
66
|
-
):
|
|
67
|
-
# Build a dict from row contents
|
|
68
|
-
record = dict(zip(DatasetRow.schema.keys(), args))
|
|
69
|
-
record["is_latest"] = record["is_latest"] > 0 # needs to be a bool
|
|
70
|
-
|
|
71
|
-
# yield same row back
|
|
72
|
-
yield DatasetRow.create(**record)
|
|
73
|
-
|
|
74
|
-
# Don't apply the filter twice
|
|
75
|
-
if record["parent"] == self.folder_name:
|
|
76
|
-
return
|
|
77
|
-
|
|
78
|
-
# Apply the filter
|
|
79
|
-
image_b = self.apply_filter(image)
|
|
80
|
-
|
|
81
|
-
# Save the image and get the cloud object info
|
|
82
|
-
entry = self.save(
|
|
83
|
-
image_b, record["source"], name=record["name"], format=image.format
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
# Build the new row
|
|
87
|
-
yield DatasetRow.create(
|
|
88
|
-
name=entry.name,
|
|
89
|
-
source=record["source"],
|
|
90
|
-
parent=self.folder_name,
|
|
91
|
-
size=entry.size,
|
|
92
|
-
location=record["name"]
|
|
93
|
-
if not record["parent"]
|
|
94
|
-
else f"{record['parent']}/{record['name']}",
|
|
95
|
-
vtype=self.vtype,
|
|
96
|
-
dir_type=record["dir_type"],
|
|
97
|
-
owner_name=entry.owner_name,
|
|
98
|
-
owner_id=entry.owner_id,
|
|
99
|
-
is_latest=record["is_latest"],
|
|
100
|
-
last_modified=entry.last_modified,
|
|
101
|
-
version=entry.version,
|
|
102
|
-
etag=entry.etag,
|
|
103
|
-
)
|
datachain/lib/iptc_exif_xmp.py
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
|
|
3
|
-
from PIL import (
|
|
4
|
-
ExifTags,
|
|
5
|
-
Image,
|
|
6
|
-
IptcImagePlugin,
|
|
7
|
-
TiffImagePlugin,
|
|
8
|
-
UnidentifiedImageError,
|
|
9
|
-
)
|
|
10
|
-
|
|
11
|
-
from datachain.query import Object, udf
|
|
12
|
-
from datachain.sql.types import JSON, String
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
def encode_image(raw):
|
|
16
|
-
try:
|
|
17
|
-
img = Image.open(raw)
|
|
18
|
-
except UnidentifiedImageError:
|
|
19
|
-
return None
|
|
20
|
-
return img
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
@udf(
|
|
24
|
-
params=(Object(encode_image),), # Columns consumed by the UDF.
|
|
25
|
-
output={
|
|
26
|
-
"xmp": JSON,
|
|
27
|
-
"exif": JSON,
|
|
28
|
-
"iptc": JSON,
|
|
29
|
-
"error": String,
|
|
30
|
-
}, # Signals being returned by the UDF.
|
|
31
|
-
method="image_description",
|
|
32
|
-
)
|
|
33
|
-
class GetMetadata:
|
|
34
|
-
def cast(self, v): # to JSON serializable types
|
|
35
|
-
if isinstance(v, TiffImagePlugin.IFDRational):
|
|
36
|
-
return float(v)
|
|
37
|
-
if isinstance(v, tuple):
|
|
38
|
-
return tuple(self.cast(t) for t in v)
|
|
39
|
-
if isinstance(v, bytes):
|
|
40
|
-
return v.decode(encoding="utf-8", errors="ignore")
|
|
41
|
-
if isinstance(v, dict):
|
|
42
|
-
for kk, vv in v.items():
|
|
43
|
-
v[kk] = self.cast(vv)
|
|
44
|
-
return v
|
|
45
|
-
if isinstance(v, list):
|
|
46
|
-
return [self.cast(kk) for kk in v]
|
|
47
|
-
return v
|
|
48
|
-
|
|
49
|
-
def image_description(self, img):
|
|
50
|
-
(xmp, exif, iptc) = ({}, {}, {})
|
|
51
|
-
if img is None:
|
|
52
|
-
error = "Image format not understood"
|
|
53
|
-
return ({}, {}, {}, error)
|
|
54
|
-
error = ""
|
|
55
|
-
xmp = img.getxmp()
|
|
56
|
-
img_exif = img.getexif()
|
|
57
|
-
img_iptc = IptcImagePlugin.getiptcinfo(img)
|
|
58
|
-
|
|
59
|
-
if img_iptc:
|
|
60
|
-
for k, v in img_iptc.items():
|
|
61
|
-
iptc[str(k)] = self.cast(v)
|
|
62
|
-
|
|
63
|
-
if img_exif:
|
|
64
|
-
for k, v in img_exif.items():
|
|
65
|
-
v = self.cast(v)
|
|
66
|
-
if k in ExifTags.TAGS:
|
|
67
|
-
exif[ExifTags.TAGS[k]] = v
|
|
68
|
-
if k in ExifTags.GPSTAGS:
|
|
69
|
-
exif[ExifTags.GPSTAGS[k]] = v
|
|
70
|
-
|
|
71
|
-
return (
|
|
72
|
-
json.dumps(xmp),
|
|
73
|
-
json.dumps(exif),
|
|
74
|
-
json.dumps(iptc),
|
|
75
|
-
error,
|
|
76
|
-
)
|
datachain/lib/unstructured.py
DELETED
|
@@ -1,41 +0,0 @@
|
|
|
1
|
-
import shutil
|
|
2
|
-
import tempfile
|
|
3
|
-
|
|
4
|
-
from unstructured.partition.auto import partition
|
|
5
|
-
from unstructured.staging.base import convert_to_dataframe
|
|
6
|
-
|
|
7
|
-
from datachain.lib.udf import Mapper
|
|
8
|
-
from datachain.query import Stream
|
|
9
|
-
from datachain.sql.types import JSON, String
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class PartitionObject(Mapper):
|
|
13
|
-
def __init__(self):
|
|
14
|
-
super().__init__(
|
|
15
|
-
[
|
|
16
|
-
Stream(),
|
|
17
|
-
],
|
|
18
|
-
{
|
|
19
|
-
"elements": JSON,
|
|
20
|
-
"title": String,
|
|
21
|
-
"text": String,
|
|
22
|
-
"error": String,
|
|
23
|
-
},
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
def encode_object(self, raw):
|
|
27
|
-
fname = str(raw).replace(">", "").replace("<", "")
|
|
28
|
-
output = tempfile.TemporaryFile()
|
|
29
|
-
shutil.copyfileobj(raw, output)
|
|
30
|
-
elements = partition(file=output, metadata_filename=fname)
|
|
31
|
-
output.close()
|
|
32
|
-
return elements
|
|
33
|
-
|
|
34
|
-
def __call__(self, stream):
|
|
35
|
-
with stream:
|
|
36
|
-
elements = self.encode_object(stream)
|
|
37
|
-
|
|
38
|
-
title = str(elements[0])
|
|
39
|
-
text = "\n\n".join([str(el) for el in elements])
|
|
40
|
-
df = convert_to_dataframe(elements)
|
|
41
|
-
return (df.to_json(), title, text, "")
|
datachain/text/__init__.py
DELETED