clarifai-9.0.0-py3-none-any.whl → clarifai-9.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clarifai/data_upload/datasets/__init__.py +0 -0
- clarifai/data_upload/datasets/base.py +67 -0
- clarifai/data_upload/datasets/features.py +45 -0
- clarifai/data_upload/datasets/image.py +236 -0
- clarifai/data_upload/datasets/text.py +62 -0
- clarifai/data_upload/datasets/zoo/__init__.py +0 -0
- clarifai/data_upload/datasets/zoo/coco_captions.py +99 -0
- clarifai/data_upload/datasets/zoo/coco_detection.py +129 -0
- clarifai/data_upload/datasets/zoo/coco_segmentation.py +158 -0
- clarifai/data_upload/examples.py +19 -0
- clarifai/data_upload/upload.py +269 -168
- clarifai/listing/installed_module_versions.py +3 -14
- clarifai/listing/lister.py +40 -0
- clarifai/listing/module_versions.py +42 -0
- clarifai/listing/modules.py +36 -0
- clarifai/modules/style.css +7 -4
- {clarifai-9.0.0.dist-info → clarifai-9.3.1.dist-info}/METADATA +3 -3
- {clarifai-9.0.0.dist-info → clarifai-9.3.1.dist-info}/RECORD +37 -13
- clarifai_utils/data_upload/datasets/__init__.py +0 -0
- clarifai_utils/data_upload/datasets/base.py +67 -0
- clarifai_utils/data_upload/datasets/features.py +45 -0
- clarifai_utils/data_upload/datasets/image.py +236 -0
- clarifai_utils/data_upload/datasets/text.py +62 -0
- clarifai_utils/data_upload/datasets/zoo/__init__.py +0 -0
- clarifai_utils/data_upload/datasets/zoo/coco_captions.py +99 -0
- clarifai_utils/data_upload/datasets/zoo/coco_detection.py +129 -0
- clarifai_utils/data_upload/datasets/zoo/coco_segmentation.py +158 -0
- clarifai_utils/data_upload/examples.py +19 -0
- clarifai_utils/data_upload/upload.py +269 -168
- clarifai_utils/listing/installed_module_versions.py +3 -14
- clarifai_utils/listing/lister.py +40 -0
- clarifai_utils/listing/module_versions.py +42 -0
- clarifai_utils/listing/modules.py +36 -0
- clarifai_utils/modules/style.css +7 -4
- {clarifai-9.0.0.dist-info → clarifai-9.3.1.dist-info}/LICENSE +0 -0
- {clarifai-9.0.0.dist-info → clarifai-9.3.1.dist-info}/WHEEL +0 -0
- {clarifai-9.0.0.dist-info → clarifai-9.3.1.dist-info}/top_level.txt +0 -0
clarifai/data_upload/datasets/zoo/coco_segmentation.py
ADDED
@@ -0,0 +1,158 @@
+#! COCO 2017 Image Segmentation dataset
+
+import gc
+import os
+import zipfile
+from functools import reduce
+from glob import glob
+
+import cv2
+import numpy as np
+import requests
+from pycocotools import mask as maskUtils
+from pycocotools.coco import COCO
+from tqdm import tqdm
+
+from ..features import VisualSegmentationFeatures
+
+
+class COCOSegmentationDataset:
+  """COCO 2017 Image Segmentation Dataset."""
+
+  def __init__(self, split: str = "train"):
+    """
+    Initialize coco dataset.
+    Args:
+      split: "train" or "val"
+    Sets `self.filenames` (the coco zip files to download) and
+    `self.data_dir` (the local coco dataset directory).
+    """
+    self.filenames = {
+        "train": "train2017.zip",
+        "val": "val2017.zip",
+        "annotations": "annotations_trainval2017.zip"
+    }
+    self.split = split
+    self.url = "http://images.cocodataset.org/zips/"  # coco base image-zip url
+    self.data_dir = os.path.join(os.curdir, ".data")  # data storage dir
+    self.extracted_coco_dirs = {"train": None, "val": None, "annotations": None}
+
+  def coco_download(self, save_dir):
+    """Download coco dataset."""
+    if not os.path.exists(save_dir):
+      os.mkdir(save_dir)
+
+    # check if train, val and annotation dirs exist
+    # so that the coco2017 data isn't re-downloaded
+    for key, filename in self.filenames.items():
+      if glob(f"{save_dir}/{key}*"):
+        print("dataset already downloaded and extracted")
+        continue
+
+      print("-" * 80)
+      print(f"Downloading {filename}")
+      print("-" * 80)
+
+      if "annotations" in filename:
+        self.url = "http://images.cocodataset.org/annotations/"
+
+      response = requests.get(self.url + filename, stream=True)
+      response.raise_for_status()
+      with open(os.path.join(save_dir, filename), "wb") as _file:
+        for chunk in tqdm(response.iter_content(chunk_size=5124000)):
+          if chunk:
+            _file.write(chunk)
+      print("Coco data download complete...")
+
+      # extract files
+      zf = zipfile.ZipFile(os.path.join(save_dir, filename))
+      print(f" Extracting {filename} file")
+      zf.extractall(path=save_dir)
+      # Delete coco zip
+      print(f" Deleting {filename}")
+      os.remove(path=os.path.join(save_dir, filename))
+
+  def dataloader(self):
+    """
+    Transform coco data into clarifai proto compatible format for upload.
+    Returns:
+      VisualSegmentationFeatures type generator.
+    """
+    if isinstance(self.filenames, dict) and len(self.filenames) == 3:
+      self.coco_download(self.data_dir)
+      self.extracted_coco_dirs["train"] = [os.path.join(self.data_dir, i) \
+          for i in os.listdir(self.data_dir) if "train" in i][0]
+      self.extracted_coco_dirs["val"] = [os.path.join(self.data_dir, i) \
+          for i in os.listdir(self.data_dir) if "val" in i][0]
+
+      self.extracted_coco_dirs["annotations"] = [os.path.join(self.data_dir, i) \
+          for i in os.listdir(self.data_dir) if "annotations" in i][0]
+    else:
+      raise Exception(f"`filenames` must hold the train, val and annotations \
+          coco zip file names. Found {len(self.filenames)} items instead.")
+
+    annot_file = glob(self.extracted_coco_dirs["annotations"] + "/" + \
+        f"instances_{self.split}*")[0]
+    coco = COCO(annot_file)
+    categories = coco.loadCats(coco.getCatIds())
+    cat_id_map = {category["id"]: category["name"] for category in categories}
+    cat_img_ids = {}
+    for cat_id in list(cat_id_map.keys()):
+      cat_img_ids[cat_id] = coco.getImgIds(catIds=[cat_id])
+
+    img_ids = []
+    for i in list(cat_img_ids.values()):
+      img_ids.extend(i)
+
+    # get annotations for each image id
+    for _id in img_ids:
+      annots = []  # polygons
+      class_names = []
+      labels = [i for i in list(filter(lambda x: _id in cat_img_ids[x], cat_img_ids))]
+      image_path = glob(self.extracted_coco_dirs[self.split] + "/" + \
+          f"{str(_id).zfill(12)}*")[0]
+
+      image_height, image_width = cv2.imread(image_path).shape[:2]
+      for cat_id in labels:
+        annot_ids = coco.getAnnIds(imgIds=_id, catIds=[cat_id])
+        if len(annot_ids) > 0:
+          img_annotations = coco.loadAnns(annot_ids)
+          for ann in img_annotations:
+            class_names.append(cat_id_map[cat_id])
+            # get polygons
+            if type(ann['segmentation']) == list:
+              for seg in ann['segmentation']:
+                poly = np.array(seg).reshape((int(len(seg) / 2), 2))
+                poly[:, 0], poly[:, 1] = poly[:, 0] / image_width, poly[:, 1] / image_height
+                annots.append(poly.tolist())  # [[x=col, y=row],...]
+            else:  # seg: {"counts":[...]}
+              if type(ann['segmentation']['counts']) == list:
+                rle = maskUtils.frPyObjects([ann['segmentation']], image_height, image_width)
+              else:
+                rle = ann['segmentation']
+              mask = maskUtils.decode(rle)  # binary mask
+              # convert mask to polygons and add to annots
+              contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
+              polygons = []
+              for cont in contours:
+                if cont.size >= 6:
+                  polygons.append(cont.astype(float).flatten().tolist())
+              # store polygons in (x,y) pairs
+              polygons_flattened = reduce(lambda x, y: x + y, polygons)
+              del polygons
+              del contours
+              del mask
+              gc.collect()
+
+              polygons = np.array(polygons_flattened).reshape((int(len(polygons_flattened) / 2),
+                                                               2))
+              polygons[:, 0] = polygons[:, 0] / image_width
+              polygons[:, 1] = polygons[:, 1] / image_height
+
+              annots.append(polygons.tolist())  # [[x=col, y=row],...,[x=col, y=row]]
+        else:  # if no annotations for given image_id-cat_id pair
+          continue
+      assert len(class_names) == len(annots), f"Num classes must match num annotations \
+          for a single image. Found {len(class_names)} classes and {len(annots)} polygons."
+
+      yield VisualSegmentationFeatures(image_path, class_names, annots, _id)
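The new zoo modules are plain generator classes, so the file above can be exercised on its own. A minimal sketch, assuming clarifai 9.3.1 is installed so the relative `..features` import resolves via the package path (note the first run downloads and extracts the COCO 2017 zips into ./.data):

# Minimal usage sketch for the zoo module above.
from clarifai.data_upload.datasets.zoo.coco_segmentation import COCOSegmentationDataset

dataset = COCOSegmentationDataset(split="val")  # "train" or "val"
for i, feature in enumerate(dataset.dataloader()):
  # each item is a VisualSegmentationFeatures built positionally as
  # (image_path, class_names, polygons, image_id), per the yield above
  print(feature)
  if i == 2:  # peek at a few samples only
    break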
clarifai/data_upload/examples.py
ADDED
@@ -0,0 +1,19 @@
+#! Execute dataset upload using the `from_module` upload feature
+
+from clarifai.data_upload.upload import UploadConfig
+
+text_upload_obj = UploadConfig(
+    user_id="",
+    app_id="",
+    pat="",
+    dataset_id="",
+    task="visual_clf",
+    from_module="./examples/image_classification/cifar10",
+    split="train",
+    portal="clarifai"  # clarifai(prod), dev or staging
+)
+## change the task and from_module arguments in UploadConfig() to upload
+## the example food-101 dataset
+
+if __name__ == "__main__":
+  text_upload_obj.upload_to_clarifai()
clarifai/data_upload/upload.py
CHANGED
@@ -1,201 +1,302 @@
 #! Clarifai data upload
 
+import importlib
+import inspect
+import os
+import sys
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from multiprocessing import cpu_count
+from typing import Iterator, Optional, Tuple, Union
 
-from
-from clarifai_grpc.grpc.api import resources_pb2, service_pb2
+from clarifai_grpc.grpc.api import resources_pb2, service_pb2, service_pb2_grpc
 from clarifai_grpc.grpc.api.status import status_code_pb2
-from datasets import (ImageClassificationDataset, TextClassificationDataset,
-                      VisualDetectionDataset, VisualSegmentationDataset)
-from omegaconf import OmegaConf
 from tqdm import tqdm
 
 from clarifai.client import create_stub
+from clarifai.data_upload.datasets.base import Chunker
+from clarifai.data_upload.datasets.image import (VisualClassificationDataset,
+                                                 VisualDetectionDataset, VisualSegmentationDataset)
+from clarifai.data_upload.datasets.text import TextClassificationDataset
 
 
-def
+def load_dataset(module_dir: Union[str, os.PathLike], split: str) -> Iterator:
   """
-
+  Validate and import dataset module data generator.
   Args:
-
-
-
+    `module_dir`: relative path to the module directory
+      The directory must contain a `dataset.py` script and the data itself.
+    `split`: "train" or "val"/"test" dataset split
+  Module Directory Structure:
+  ---------------------------
+      <folder_name>/
+      ├──__init__.py
+      ├──<Your local dir dataset>/
+      └──dataset.py
+  dataset.py must implement a class named following the convention,
+  <dataset_name>Dataset and this class must have a dataloader()
+  generator method.
   """
-
-
-
-
-
-
-
-
-
-
-
-  if
-
-      print(f"Post inputs failed, status:\n{response.inputs[0].status.details}\n")
-    except:
-      print(f"Post inputs failed, status:\n{response.status.details}\n")
-      retry_upload.append(inp_proto)
+  sys.path.append(str(module_dir))
+
+  if not os.path.exists(os.path.join(module_dir, "__init__.py")):
+    with open(os.path.join(module_dir, "__init__.py"), "w"):
+      pass
+
+  import dataset  # dataset module
+
+  # get main module class
+  main_module_cls = None
+  for name, obj in dataset.__dict__.items():
+    if inspect.isclass(obj) and "Dataset" in name:
+      main_module_cls = obj
     else:
-
+      continue
 
-  return
+  return main_module_cls(split).dataloader()
 
 
-def
+def load_zoo_dataset(name: str, split: str) -> Iterator:
   """
-
+  Get dataset generator object from dataset zoo.
+  Args:
+    `name`: dataset module name in datasets/zoo/.
+    `split`: "train" or "val"/"test" dataset split
+  Returns:
+    Data generator object
   """
-
-
-
-
-
-
-
-  for annot_proto in inputs:
-    response = STUB.PostAnnotations(
-        service_pb2.PostAnnotationsRequest(user_app_id=USER_APP_ID, annotations=[annot_proto]),)
-
-    if response.status.code != status_code_pb2.SUCCESS:
-      try:
-        print(f"Post annotations failed, status:\n{response.annotations[0].status.details}\n")
-      except:
-        print(f"Post annotations failed, status:\n{response.status.details}\n")
-        retry_upload.append(annot_proto)
+  zoo_dataset = importlib.import_module(f"datasets.zoo.{name}")
+  # get main module class
+  main_module_cls = None
+  for name, obj in zoo_dataset.__dict__.items():
+    if inspect.isclass(obj) and "Dataset" in name:
+      main_module_cls = obj
     else:
-
+      continue
+
+  return main_module_cls(split).dataloader()
+
+
+class UploadConfig:
+
+  def __init__(
+      self,
+      user_id: str,
+      app_id: str,
+      pat: str,
+      dataset_id: str,
+      task: str,
+      from_module: Optional[Union[str, os.PathLike]] = None,
+      from_zoo: Optional[str] = None,  # load dataset from zoo
+      split: str = "train",  # train or test/val
+      chunk_size: int = 16,
+      portal: str = "clarifai"):
+    """
+    Initialize upload configs.
+    Args:
+      `user_id`: Clarifai user id.
+      `app_id`: Clarifai app id.
+      `pat`: Clarifai PAT (Personal Access Token).
+      `dataset_id`: Clarifai dataset id (where data is to be uploaded).
+      `task`: either of `visual_clf`, `visual_detection`, `visual_segmentation` or `text_clf`.
+      `from_module`: Path to dataset module directory.
+        Should be left as None if `from_zoo` is to be used.
+      `from_zoo`: Name of dataset to upload from the zoo.
+        The name must match the dataset module name excluding the file extension.
+        Should be left as None if `from_module` is to be used.
+      `split`: Dataset split to upload. Either of train or test/val.
+      `chunk_size`: size of chunks for parallel data upload.
+    """
+    self.USER_ID = user_id
+    self.APP_ID = app_id
+    self.PAT = pat
+    self.dataset_id = dataset_id
+    self.task = task
+    self.module_dir = from_module
+    self.zoo_dataset = from_zoo
+    self.split = split
+    self.chunk_size = chunk_size
+    self.num_workers: int = cpu_count()
+    self.__base: str = ""
+    if portal == "dev":
+      self.__base = "https://api-dev.clarifai.com"
+    elif portal == "staging":
+      self.__base = "https://api-staging.clarifai.com"
+    else:  # prod
+      self.__base = "https://api.clarifai.com"
+
+    # Set auth vars as env variables
+    os.environ["CLARIFAI_USER_ID"] = self.USER_ID
+    os.environ["CLARIFAI_APP_ID"] = self.APP_ID
+    os.environ["CLARIFAI_API_BASE"] = self.__base
+    os.environ["CLARIFAI_PAT"] = self.PAT
+
+    self.STUB: service_pb2_grpc.V2Stub = create_stub()
+    self.metadata: Tuple = (('authorization', 'Key ' + self.PAT),)
+    self.user_app_id = resources_pb2.UserAppIDSet(user_id=self.USER_ID, app_id=self.APP_ID)
+
+  def _upload_inputs(self, inputs):
+    """
+    Upload inputs to clarifai platform dataset.
+    Args:
+      inputs: input protos
+    """
+    upload_count = 0
+    retry_upload = []  # those that fail to upload are stored for retries
+
+    for inp_proto in inputs:
+      response = self.STUB.PostInputs(
+          service_pb2.PostInputsRequest(user_app_id=self.user_app_id, inputs=[inp_proto]),)
+
+      MESSAGE_DUPLICATE_ID = "Input has a duplicate ID."
+      if response.status.code != status_code_pb2.SUCCESS:
+        try:
+          if response.inputs[0].status.details != MESSAGE_DUPLICATE_ID:
+            retry_upload.append(inp_proto)
+          print(f"Post inputs failed, status: {response.inputs[0].status.details}\n")
+          continue
+        except:
+          print(f"Post inputs failed, status: {response.status.details}\n")
+      else:
+        upload_count += 1
+
+    return retry_upload
+
+  def upload_annotations(self, inputs):
+    """
+    Upload image annotations to clarifai detection dataset.
+    """
+    upload_count = 0
+    retry_upload = []  # those that fail to upload are stored for retries
+
+    for annot_proto in inputs:
+      response = self.STUB.PostAnnotations(
+          service_pb2.PostAnnotationsRequest(
+              user_app_id=self.user_app_id, annotations=[annot_proto]),)
+
+      if response.status.code != status_code_pb2.SUCCESS:
+        try:
+          print(f"Post annotations failed, status:\n{response.annotations[0].status.details}\n")
+          continue
+        except:
+          print(f"Post annotations failed, status:\n{response.status.details}\n")
+          retry_upload.append(annot_proto)
+      else:
+        upload_count += 1
+
+    return retry_upload
+
+  def concurrent_inp_upload(self, inputs, chunks):
+    """
+    Upload images concurrently.
+    """
+    inp_threads = []
+    retry_upload = []
+
+    with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+      for inp_batch in tqdm(inputs, total=chunks + 1, desc="uploading inputs..."):
+        inp_threads.append(executor.submit(self._upload_inputs, inp_batch))
+        time.sleep(0.1)
+
+    for job in tqdm(
+        as_completed(inp_threads), total=chunks + 1, desc="retry uploading failed protos..."):
+      if job.result():
+        retry_upload.extend(job.result())
+
+    if len(
+        list(retry_upload)) > 0:  ## TODO: use api_with_retries functionality via upload_inputs()
+      _ = self._upload_inputs(retry_upload)
+
+  def concurrent_annot_upload(self, inputs, chunks):
+    """
+    Uploads annotations concurrently.
+    """
+    annot_threads = []
+    retry_annot_upload = []
+
+    with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
+      for annot_batch in tqdm(inputs, total=chunks + 1, desc="uploading..."):
+        annot_threads.append(executor.submit(self.upload_annotations, annot_batch))
+        time.sleep(0.2)
+
+    for job in tqdm(
+        as_completed(annot_threads), total=chunks + 1, desc="retry uploading failed protos..."):
+      if job.result():
+        retry_annot_upload.extend(job.result())
+    if len(retry_annot_upload) > 0:
+      ## TODO: use api_with_retries functionality via upload_annotations()
+      _ = self.upload_annotations(retry_annot_upload)
+
+  def upload_to_clarifai(self):
+    """
+    Execute data upload.
+    """
+    datagen_object = None
+    if self.module_dir is None and self.zoo_dataset is None:
+      raise Exception("One of `from_module` and `from_zoo` must be \
+          specified. Both can't be None or defined at the same time.")
+    elif self.module_dir is not None and self.zoo_dataset is not None:
+      raise Exception("Use either of `from_module` or `from_zoo` \
+          but NOT both.")
+    elif self.module_dir is not None:
+      datagen_object = load_dataset(self.module_dir, self.split)
+    else:
+      datagen_object = load_zoo_dataset(self.zoo_dataset, self.split)
 
-
+    if self.task == "text_clf":
+      dataset_obj = TextClassificationDataset(datagen_object, self.dataset_id, self.split)
+      text_protos = dataset_obj._get_input_protos()
+      text_protos = dataset_obj._to_list(text_protos)
 
+      # Upload text
+      chunks = len(text_protos) // self.num_workers
+      chunked_text_protos = Chunker(text_protos, self.chunk_size).chunk()
 
-
-  """
-  Upload images concurrently for efficiency.
-  """
-  inp_threads = []
-  retry_upload = []
+      self.concurrent_inp_upload(chunked_text_protos, chunks)
 
-
-
-
-
+    elif self.task == "visual_detection":
+      dataset_obj = VisualDetectionDataset(datagen_object, self.dataset_id, self.split)
+      img_protos, annotation_protos = dataset_obj._get_input_protos()
+      img_protos = dataset_obj._to_list(img_protos)
 
-
-
-
-      retry_upload.extend(job.result())
-  if len(list(retry_upload)) > 0:  ## TODO: use api_with_retries functionality via upload_data()
-    _ = upload_data(config, retry_upload, stub)
+      # Upload images
+      chunks = len(img_protos) // self.num_workers
+      chunked_img_protos = Chunker(img_protos, self.chunk_size).chunk()
 
+      self.concurrent_inp_upload(chunked_img_protos, chunks)
 
-
-
-
-
-
-  retry_annot_upload = []
+      # Upload annotations:
+      print("Uploading annotations.......")
+      annotation_protos = dataset_obj._to_list(annotation_protos)
+      chunks_ = len(annotation_protos) // self.num_workers
+      chunked_annot_protos = Chunker(annotation_protos, self.chunk_size).chunk()
 
-
-    for annot_batch in tqdm(inputs, total=chunks + 1, desc="uploading..."):
-      annot_threads.append(executor.submit(upload_annotations, config, annot_batch, stub))
-      time.sleep(0.2)
+      self.concurrent_annot_upload(chunked_annot_protos, chunks_)
 
-
-
-
-
-
-  ## TODO: use api_with_retries functionality via upload_annotations()
-  _ = upload_annotations(config, retry_annot_upload, stub)
+    elif self.task == "visual_segmentation":
+      dataset_obj = VisualSegmentationDataset(datagen_object, self.dataset_id, self.split)
+      img_protos, mask_protos = dataset_obj._get_input_protos()
+      img_protos = dataset_obj._to_list(img_protos)
+      mask_protos = dataset_obj._to_list(mask_protos)
 
+      # Upload images
+      chunks = len(img_protos) // self.num_workers
+      chunked_img_protos = Chunker(img_protos, self.chunk_size).chunk()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-  # Upload text
-  chunks = len(text_protos) // workers
-  chunked_text_protos = Chunker(text_protos, config["chunk_size"]).chunk()
-
-  concurrent_inp_upload(config, chunked_text_protos, workers, chunks, STUB)
-
-  elif task == "visual_det":
-    dataset_obj = VisualDetectionDataset(
-        config.data["visual_det_image_dir"],
-        config.data["visual_det_labels_dir"],
-        config.data["dataset_id"],
-        config["split"],
-        labels_from_text_file=False)
-    img_protos, annotation_protos = dataset_obj._get_input_protos()
-    img_protos = dataset_obj.to_list(img_protos)
-
-    # Upload images
-    chunks = len(img_protos) // workers
-    chunked_img_protos = Chunker(img_protos, config["chunk_size"]).chunk()
-
-    concurrent_inp_upload(config, chunked_img_protos, workers, chunks, STUB)
-
-    # Upload annotations:
-    print("Uploading annotations.......")
-    annotation_protos = dataset_obj.to_list(annotation_protos)
-    chunks_ = len(annotation_protos) // workers
-    chunked_annot_protos = Chunker(annotation_protos, config["chunk_size"]).chunk()
-
-    concurrent_annot_upload(config, chunked_annot_protos, workers, chunks_, STUB)
-
-  elif task == "visual_seg":
-    dataset_obj = VisualSegmentationDataset(config.data["visual_seg_image_dir"],
-                                            config.data["visual_seg_masks_dir"],
-                                            config.data["dataset_id"], config["split"])
-    img_protos, mask_protos = dataset_obj._get_input_protos()
-    img_protos = dataset_obj.to_list(img_protos)
-    mask_protos = dataset_obj.to_list(mask_protos)
-
-    # Upload images
-    chunks = len(img_protos) // workers
-    chunked_img_protos = Chunker(img_protos, config["chunk_size"]).chunk()
-
-    concurrent_inp_upload(config, chunked_img_protos, workers, chunks, STUB)
-
-    # Upload masks:
-    print("Uploading masks.......")
-    chunks_ = len(mask_protos) // workers
-    chunked_mask_protos = Chunker(mask_protos, config["chunk_size"]).chunk()
-
-    concurrent_annot_upload(config, chunked_mask_protos, workers, chunks_, STUB)
-
-  else:
-    dataset_obj = ImageClassificationDataset(config.data["clf_image_dir"],
-                                             config.data["dataset_id"], config["split"])
-    img_protos = dataset_obj._get_input_protos()
-    img_protos = dataset_obj.to_list(img_protos)
-
-    # Upload images
-    chunks = len(img_protos) // workers
-    chunked_img_protos = Chunker(img_protos, config["chunk_size"]).chunk()
-
-    concurrent_inp_upload(config, chunked_img_protos, workers, chunks, STUB)
-
-
-if __name__ == "__main__":
-  yaml_path = "./config.yaml"
-  config = OmegaConf.load(yaml_path)
-  upload_to_clarifai(config, task=config["task"])
+      #self.concurrent_inp_upload(chunked_img_protos, chunks)
+      # Upload masks:
+      print("Uploading masks.......")
+      chunks_ = len(mask_protos) // self.num_workers
+      chunked_mask_protos = Chunker(mask_protos, self.chunk_size).chunk()
+
+      self.concurrent_annot_upload(chunked_mask_protos, chunks_)
+    else:  # visual-classification & visual-captioning
+      dataset_obj = VisualClassificationDataset(datagen_object, self.dataset_id, self.split)
+      img_protos = dataset_obj._get_input_protos()
+      img_protos = dataset_obj._to_list(img_protos)
+
+      # Upload images
+      chunks = len(img_protos) // self.num_workers
+      chunked_img_protos = Chunker(img_protos, self.chunk_size).chunk()
+
+      self.concurrent_inp_upload(chunked_img_protos, chunks)
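For the `from_module` path, `load_dataset` above imports a `dataset.py` from the given directory and instantiates its `<name>Dataset` class. A minimal sketch of such a module (the directory layout and label values are hypothetical; the yield mirrors the positional `VisualSegmentationFeatures(image_path, class_names, annots, _id)` call in the COCO zoo module above):

#! dataset.py -- hypothetical from_module dataset following the documented convention
import os

from clarifai.data_upload.datasets.features import VisualSegmentationFeatures


class MySegmentationDataset:

  def __init__(self, split: str = "train"):
    self.split = split
    self.image_dir = os.path.join(os.curdir, split, "images")  # hypothetical layout

  def dataloader(self):
    """Yield one VisualSegmentationFeatures per image."""
    for i, fname in enumerate(sorted(os.listdir(self.image_dir))):
      image_path = os.path.join(self.image_dir, fname)
      # a single dummy full-image polygon with normalized (x, y) points;
      # real code would read polygons from your annotation files
      polygons = [[[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0]]]
      class_names = ["object"]
      yield VisualSegmentationFeatures(image_path, class_names, polygons, i)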
clarifai/listing/installed_module_versions.py
CHANGED
@@ -24,12 +24,6 @@ def installed_module_versions_generator(stub: V2Stub,
 
   imv_success_status = {status_code_pb2.SUCCESS}
 
-  # HACK(zeiler): this is the number of default installed module versions every app has.
-  # so with pagination
-  seen = {
-      "module_manager_install": False,
-  }
-
   page = 1
   while True:
     response = stub.ListInstalledModuleVersions(
@@ -39,13 +33,8 @@ def installed_module_versions_generator(stub: V2Stub,
     if response.status.code not in imv_success_status:
       raise Exception("ListInstalledModuleVersions failed with response %r" % response)
     for item in response.installed_module_versions:
-
-      if not seen[item.id]:  # yield it once.
-        seen[item.id] = True
-        yield item
-      else:
-        yield item
+      yield item
     page += 1
-    # if we don't get a full page back
-    if len(response.installed_module_versions) < page_size:
+    # if we don't get a full page back we know we're done.
+    if len(response.installed_module_versions) < page_size:
       break