clarifai 9.1.0__py3-none-any.whl → 9.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. clarifai/data_upload/datasets/__init__.py +0 -0
  2. clarifai/data_upload/datasets/base.py +67 -0
  3. clarifai/data_upload/datasets/features.py +45 -0
  4. clarifai/data_upload/datasets/image.py +236 -0
  5. clarifai/data_upload/datasets/text.py +62 -0
  6. clarifai/data_upload/datasets/zoo/__init__.py +0 -0
  7. clarifai/data_upload/datasets/zoo/coco_captions.py +99 -0
  8. clarifai/data_upload/datasets/zoo/coco_detection.py +129 -0
  9. clarifai/data_upload/datasets/zoo/coco_segmentation.py +158 -0
  10. clarifai/data_upload/examples.py +19 -0
  11. clarifai/data_upload/upload.py +269 -168
  12. clarifai/listing/installed_module_versions.py +3 -14
  13. clarifai/listing/lister.py +40 -0
  14. clarifai/listing/module_versions.py +42 -0
  15. clarifai/listing/modules.py +36 -0
  16. clarifai/modules/style.css +7 -4
  17. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/METADATA +3 -3
  18. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/RECORD +37 -13
  19. clarifai_utils/data_upload/datasets/__init__.py +0 -0
  20. clarifai_utils/data_upload/datasets/base.py +67 -0
  21. clarifai_utils/data_upload/datasets/features.py +45 -0
  22. clarifai_utils/data_upload/datasets/image.py +236 -0
  23. clarifai_utils/data_upload/datasets/text.py +62 -0
  24. clarifai_utils/data_upload/datasets/zoo/__init__.py +0 -0
  25. clarifai_utils/data_upload/datasets/zoo/coco_captions.py +99 -0
  26. clarifai_utils/data_upload/datasets/zoo/coco_detection.py +129 -0
  27. clarifai_utils/data_upload/datasets/zoo/coco_segmentation.py +158 -0
  28. clarifai_utils/data_upload/examples.py +19 -0
  29. clarifai_utils/data_upload/upload.py +269 -168
  30. clarifai_utils/listing/installed_module_versions.py +3 -14
  31. clarifai_utils/listing/lister.py +40 -0
  32. clarifai_utils/listing/module_versions.py +42 -0
  33. clarifai_utils/listing/modules.py +36 -0
  34. clarifai_utils/modules/style.css +7 -4
  35. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/LICENSE +0 -0
  36. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/WHEEL +0 -0
  37. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,158 @@
1
+ #! COCO 2017 Image Segmentation dataset
2
+
3
+ import gc
4
+ import os
5
+ import zipfile
6
+ from functools import reduce
7
+ from glob import glob
8
+
9
+ import cv2
10
+ import numpy as np
11
+ import requests
12
+ from pycocotools import mask as maskUtils
13
+ from pycocotools.coco import COCO
14
+ from tqdm import tqdm
15
+
16
+ from ..features import VisualSegmentationFeatures
17
+
18
+
19
class COCOSegmentationDataset:
    """COCO 2017 Image Segmentation Dataset.

    Downloads and extracts the COCO 2017 archives into `./.data` when they are
    not already present, then yields one `VisualSegmentationFeatures` per image
    of the requested split.
    """

    def __init__(self, split: str = "train"):
        """
        Initialize coco dataset.
        Args:
          split: "train" or "val"
        """
        # archive name per extracted directory prefix
        self.filenames = {
            "train": "train2017.zip",
            "val": "val2017.zip",
            "annotations": "annotations_trainval2017.zip"
        }
        self.split = split
        self.url = "http://images.cocodataset.org/zips/"  # coco base image-zip url
        self.data_dir = os.path.join(os.curdir, ".data")  # data storage dir
        self.extracted_coco_dirs = {"train": None, "val": None, "annotations": None}

    @staticmethod
    def _normalized_polygon(flat_coords, image_width, image_height):
        """Turn a flat [x0, y0, x1, y1, ...] list into [[x, y], ...] scaled to [0, 1]."""
        # dtype=float: integer coordinates would otherwise be truncated by the
        # in-place division below.
        points = np.asarray(flat_coords, dtype=float).reshape(-1, 2)
        points[:, 0] /= image_width
        points[:, 1] /= image_height
        return points.tolist()

    def _find_extracted_dir(self, key):
        """Return the first entry of `self.data_dir` whose name contains `key`."""
        for entry in os.listdir(self.data_dir):
            if key in entry:
                return os.path.join(self.data_dir, entry)
        raise FileNotFoundError(f"No extracted coco `{key}` directory found in {self.data_dir}")

    def coco_download(self, save_dir):
        """Download and extract every coco archive that is missing from `save_dir`.

        Args:
          save_dir: directory the archives are downloaded to and extracted in.
        """
        os.makedirs(save_dir, exist_ok=True)

        #check if train, val and annotation dirs exist
        #so that the coco2017 data isn't downloaded
        for key, filename in self.filenames.items():
            # glob() returns [] when nothing matches, so test the list itself;
            # indexing it with [0] raised IndexError on a fresh directory.
            if glob(f"{save_dir}/{key}*"):
                print("dataset already downloded and extracted")
                continue

            print("-" * 80)
            print(f"Downloading {filename}")
            print("-" * 80)

            # The annotations archive lives under a different base url. Use a
            # local so `self.url` is not clobbered for later image downloads.
            if "annotations" in filename:
                base_url = "http://images.cocodataset.org/annotations/"
            else:
                base_url = self.url

            archive_path = os.path.join(save_dir, filename)
            with requests.get(base_url + filename, stream=True) as response:
                response.raise_for_status()
                with open(archive_path, "wb") as _file:
                    for chunk in tqdm(response.iter_content(chunk_size=5124000)):
                        if chunk:
                            _file.write(chunk)
            print("Coco data download complete...")

            #extract files
            with zipfile.ZipFile(archive_path) as zf:
                print(f" Extracting {filename} file")
                zf.extractall(path=save_dir)
            # Delete coco zip
            print(f" Deleting {filename}")
            os.remove(path=archive_path)

    def dataloader(self):
        """
        Transform coco data into clarifai proto compatible format for upload.
        Returns:
          VisualSegmentationFeatures type generator.
        Raises:
          Exception: if `self.filenames` does not hold the 3 coco archives.
        """
        # `filenames` is a dict (see __init__); the previous `list` check could
        # never pass, so the loader always raised.
        if not (isinstance(self.filenames, dict) and len(self.filenames) == 3):
            raise Exception("`filenames` must map the 3 coco archives; "
                            "train, val and annotations. "
                            f"Found {len(self.filenames)} items instead.")

        self.coco_download(self.data_dir)
        for key in ("train", "val", "annotations"):
            self.extracted_coco_dirs[key] = self._find_extracted_dir(key)

        annot_file = glob(
            os.path.join(self.extracted_coco_dirs["annotations"], f"instances_{self.split}*"))[0]
        coco = COCO(annot_file)
        categories = coco.loadCats(coco.getCatIds())
        cat_id_map = {category["id"]: category["name"] for category in categories}

        # image id -> category ids present in it. Built once so every image is
        # yielded a single time (it used to be yielded once per category it
        # belongs to) and without a linear membership scan per image.
        img_to_cat_ids = {}
        for cat_id in cat_id_map:
            for img_id in coco.getImgIds(catIds=[cat_id]):
                cats = img_to_cat_ids.setdefault(img_id, [])
                if not cats or cats[-1] != cat_id:
                    cats.append(cat_id)

        #get annotations for each image id
        for _id, labels in img_to_cat_ids.items():
            annots = []  # polygons
            class_names = []
            image_path = glob(
                os.path.join(self.extracted_coco_dirs[self.split], f"{str(_id).zfill(12)}*"))[0]

            image_height, image_width = cv2.imread(image_path).shape[:2]
            for cat_id in labels:
                annot_ids = coco.getAnnIds(imgIds=_id, catIds=[cat_id])
                if len(annot_ids) == 0:  # no annotations for given image_id-cat_id pair
                    continue
                for ann in coco.loadAnns(annot_ids):
                    segmentation = ann['segmentation']
                    # A class name is recorded together with each polygon so
                    # both lists stay the same length even when one object has
                    # several polygons or a mask yields none.
                    if isinstance(segmentation, list):  # polygons
                        for seg in segmentation:
                            annots.append(
                                self._normalized_polygon(seg, image_width,
                                                         image_height))  #[[x=col, y=row],...]
                            class_names.append(cat_id_map[cat_id])
                        continue

                    # seg: {"counts":[...]}
                    if isinstance(segmentation['counts'], list):
                        rle = maskUtils.frPyObjects([segmentation], image_height, image_width)
                    else:
                        rle = segmentation
                    mask = maskUtils.decode(rle)  #binary mask
                    #convert mask to polygons and add to annots
                    contours, _ = cv2.findContours(mask, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
                    # store polygons in (x,y) pairs; contours with fewer than
                    # 3 points are dropped
                    polygons_flattened = []
                    for cont in contours:
                        if cont.size >= 6:
                            polygons_flattened.extend(cont.astype(float).flatten().tolist())
                    del contours
                    del mask
                    gc.collect()

                    if not polygons_flattened:
                        # nothing usable in this mask; previously an empty
                        # reduce() raised TypeError here
                        continue
                    annots.append(
                        self._normalized_polygon(polygons_flattened, image_width,
                                                 image_height))  #[[x=col, y=row],...,[x=col, y=row]]
                    class_names.append(cat_id_map[cat_id])

            yield VisualSegmentationFeatures(image_path, class_names, annots, _id)
@@ -0,0 +1,19 @@
1
#! Execute dataset upload using the `from_module` upload feature

from clarifai.data_upload.upload import UploadConfig

# Upload settings for the example cifar10 image-classification module.
# Fill in the empty credentials before running.
_UPLOAD_SETTINGS = {
    "user_id": "",
    "app_id": "",
    "pat": "",
    "dataset_id": "",
    "task": "visual_clf",
    "from_module": "./examples/image_classification/cifar10",
    "split": "train",
    "portal": "clarifai",  #clarifai(prod), dev or staging
}
## change the task and from_module settings above to upload
## example food-101 dataset
text_upload_obj = UploadConfig(**_UPLOAD_SETTINGS)

if __name__ == "__main__":
    text_upload_obj.upload_to_clarifai()
@@ -1,201 +1,302 @@
1
1
  #! Clarifai data upload
2
2
 
3
+ import importlib
4
+ import inspect
5
+ import os
6
+ import sys
3
7
  import time
4
8
  from concurrent.futures import ThreadPoolExecutor, as_completed
5
9
  from multiprocessing import cpu_count
10
+ from typing import Iterator, Optional, Tuple, Union
6
11
 
7
- from base import Chunker
8
- from clarifai_grpc.grpc.api import resources_pb2, service_pb2
12
+ from clarifai_grpc.grpc.api import resources_pb2, service_pb2, service_pb2_grpc
9
13
  from clarifai_grpc.grpc.api.status import status_code_pb2
10
- from datasets import (ImageClassificationDataset, TextClassificationDataset,
11
- VisualDetectionDataset, VisualSegmentationDataset)
12
- from omegaconf import OmegaConf
13
14
  from tqdm import tqdm
14
15
 
15
16
  from clarifai.client import create_stub
17
+ from clarifai.data_upload.datasets.base import Chunker
18
+ from clarifai.data_upload.datasets.image import (VisualClassificationDataset,
19
+ VisualDetectionDataset, VisualSegmentationDataset)
20
+ from clarifai.data_upload.datasets.text import TextClassificationDataset
16
21
 
17
22
 
18
- def upload_data(config, inputs, inp_stub):
23
def load_dataset(module_dir: Union[str, os.PathLike], split: str) -> Iterator:
    """
    Validate and import dataset module data generator.
    Args:
      `module_dir`: relative path to the module directory
      The directory must contain a `dataset.py` script and the data itself.
      `split`: "train" or "val"/"test" dataset split
    Returns:
      The generator returned by the dataset class's `dataloader()`.
    Raises:
      ModuleNotFoundError: if `module_dir` has no `dataset.py`.
      ImportError: if `dataset.py` cannot be loaded or defines no class with
      "Dataset" in its name.
    Module Directory Structure:
    ---------------------------
        <folder_name>/
        ├──__init__.py
        ├──<Your local dir dataset>/
        └──dataset.py
    dataset.py must implement a class named following the convention,
    <dataset_name>Dataset and this class must have a dataloader()
    generator method
    """
    import importlib.util  # only the top-level `importlib` is imported by the file

    module_dir = str(module_dir)
    # Keep the directory importable so dataset.py can import sibling files,
    # but do not grow sys.path on every call.
    if module_dir not in sys.path:
        sys.path.append(module_dir)

    init_file = os.path.join(module_dir, "__init__.py")
    if not os.path.exists(init_file):
        with open(init_file, "w"):
            pass

    script_path = os.path.join(module_dir, "dataset.py")
    if not os.path.isfile(script_path):
        raise ModuleNotFoundError(f"No `dataset.py` found in {module_dir}", name="dataset")

    # Load from the explicit file path. A plain `import dataset` is cached in
    # sys.modules, so a second call with another directory silently returned
    # the first directory's module.
    spec = importlib.util.spec_from_file_location("dataset", script_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load dataset module from {script_path}")
    dataset = importlib.util.module_from_spec(spec)
    sys.modules["dataset"] = dataset
    try:
        spec.loader.exec_module(dataset)
    except BaseException:
        sys.modules.pop("dataset", None)  # do not leave a half-initialised module behind
        raise

    # get main module class: prefer classes defined in dataset.py itself over
    # ones it merely imported (e.g. a base class with "Dataset" in its name)
    candidates = [
        obj for attr_name, obj in vars(dataset).items()
        if inspect.isclass(obj) and "Dataset" in attr_name
    ]
    own_classes = [cls for cls in candidates if cls.__module__ == dataset.__name__]
    chosen = own_classes or candidates
    if not chosen:
        raise ImportError(f"{script_path} must define a class named <dataset_name>Dataset")
    main_module_cls = chosen[-1]

    return main_module_cls(split).dataloader()
47
57
 
48
58
 
49
- def upload_annotations(config, inputs, inp_stub):
59
def load_zoo_dataset(name: str, split: str) -> Iterator:
    """
    Get dataset generator object from dataset zoo.
    Args:
      `name`: dataset module name in datasets/zoo/.
      `split`: "train" or "val"/"test" dataset split
    Returns:
      Data generator object
    Raises:
      ModuleNotFoundError: if no zoo module called `name` can be found.
      ImportError: if the zoo module defines no class with "Dataset" in its name.
    """
    import importlib.util  # only the top-level `importlib` is imported by the file

    # Resolve the zoo relative to the package this module lives in, so the
    # lookup does not depend on the current working directory. The bare
    # `datasets.zoo.<name>` path is kept as a fallback for when this file is
    # executed directly from its own directory.
    targets = []
    if __package__:
        targets.append(importlib.util.resolve_name(f".datasets.zoo.{name}", __package__))
    targets.append(f"datasets.zoo.{name}")

    zoo_dataset = None
    not_found = None
    for target in targets:
        try:
            zoo_dataset = importlib.import_module(target)
            break
        except ModuleNotFoundError as err:
            # Only fall through when the zoo module (or one of its parent
            # packages) is what is missing; a missing dependency inside the
            # zoo module must surface unchanged.
            missing = err.name
            if missing is None or not (target == missing or target.startswith(missing + ".")):
                raise
            not_found = err
    if zoo_dataset is None:
        raise not_found

    # get main module class (loop variable renamed so it no longer shadows `name`)
    candidates = [
        obj for attr_name, obj in vars(zoo_dataset).items()
        if inspect.isclass(obj) and "Dataset" in attr_name
    ]
    own_classes = [cls for cls in candidates if cls.__module__ == zoo_dataset.__name__]
    chosen = own_classes or candidates
    if not chosen:
        raise ImportError(f"Zoo module `{name}` must define a class named <dataset_name>Dataset")
    main_module_cls = chosen[-1]

    return main_module_cls(split).dataloader()
78
+
79
+
80
class UploadConfig:
    """Configure and run the upload of a dataset split to a Clarifai dataset."""

    # per-input status detail returned for an input that already exists
    _MESSAGE_DUPLICATE_ID = "Input has a duplicate ID."

    def __init__(
            self,
            user_id: str,
            app_id: str,
            pat: str,
            dataset_id: str,
            task: str,
            from_module: Optional[Union[str, os.PathLike]] = None,
            from_zoo: Optional[str] = None,  # load dataset from zoo
            split: str = "train",  # train or test/val
            chunk_size: int = 16,
            portal: str = "clarifai"):
        """
        Initialize upload configs.
        Args:
          `user_id`: Clarifai user id.
          `app_id`: Clarifai app id.
          `pat`: Clarifai PAT(Personal Access Token).
          `dataset_id`: Clarifai dataset id (where data is to be uploaded).
          `task`: either of `visual_clf`, `visual_detection`, `visual_segmentation` or `text_clf`.
          `from_module`: Path to dataset module directory.
            Should be left as None if `from_zoo` is to be used.
          `from_zoo`: Name of dataset to upload from the zoo.
            The name must match the dataset module name excluding the file extension.
            Should be left as None if `from_module` is to be used.
          `split`: Dataset split to upload. Either of train or test/val
          `chunk_size`: size of chunks for parallel data upload.
          `portal`: "dev" or "staging" select those API bases; any other value
            (default "clarifai") selects production.
        """
        self.USER_ID = user_id
        self.APP_ID = app_id
        self.PAT = pat
        self.dataset_id = dataset_id
        self.task = task
        self.module_dir = from_module
        self.zoo_dataset = from_zoo
        self.split = split
        self.chunk_size = chunk_size
        self.num_workers: int = cpu_count()
        self.__base: str = ""
        if portal == "dev":
            self.__base = "https://api-dev.clarifai.com"
        elif portal == "staging":
            self.__base = "https://api-staging.clarifai.com"
        else:  #prod
            self.__base = "https://api.clarifai.com"

        # Set auth vars as env variables
        # NOTE(review): presumably create_stub() reads these — confirm in clarifai.client
        os.environ["CLARIFAI_USER_ID"] = self.USER_ID
        os.environ["CLARIFAI_APP_ID"] = self.APP_ID
        os.environ["CLARIFAI_API_BASE"] = self.__base
        os.environ["CLARIFAI_PAT"] = self.PAT

        self.STUB: service_pb2_grpc.V2Stub = create_stub()
        self.metadata: Tuple = (('authorization', 'Key ' + self.PAT),)
        self.user_app_id = resources_pb2.UserAppIDSet(user_id=self.USER_ID, app_id=self.APP_ID)

    def _upload_inputs(self, inputs):
        """
        Upload inputs to clarifai platform dataset.
        Args:
          inputs: input protos
        Returns:
          List of input protos whose upload failed and should be retried.
          Inputs rejected as duplicates are not included.
        """
        retry_upload = []  # those that fail to upload are stored for retries

        for inp_proto in inputs:
            response = self.STUB.PostInputs(
                service_pb2.PostInputsRequest(user_app_id=self.user_app_id, inputs=[inp_proto]),)
            if response.status.code == status_code_pb2.SUCCESS:
                continue

            if len(response.inputs) > 0:
                details = response.inputs[0].status.details
                if details == self._MESSAGE_DUPLICATE_ID:
                    continue  # already uploaded; retrying cannot succeed
            else:
                # Request-level failure with no per-input status. This case was
                # previously swallowed by a bare `except:` and never retried.
                details = response.status.details
            retry_upload.append(inp_proto)
            print(f"Post inputs failed, status: {details}\n")

        return retry_upload

    def upload_annotations(self, inputs):
        """
        Upload image annotations to clarifai detection dataset
        Args:
          inputs: annotation protos
        Returns:
          List of annotation protos whose upload failed and should be retried.
        """
        retry_upload = []  # those that fail to upload are stored for retries

        for annot_proto in inputs:
            response = self.STUB.PostAnnotations(
                service_pb2.PostAnnotationsRequest(
                    user_app_id=self.user_app_id, annotations=[annot_proto]),)
            if response.status.code == status_code_pb2.SUCCESS:
                continue

            if len(response.annotations) > 0:
                details = response.annotations[0].status.details
            else:
                details = response.status.details
            print(f"Post annotations failed, status:\n{details}\n")
            # Every failure is queued; a stray `continue` used to skip this for
            # ordinary failures so they were never retried.
            retry_upload.append(annot_proto)

        return retry_upload

    def _run_concurrently(self, upload_fn, batches, chunks, submit_desc, submit_delay):
        """Submit each batch to `upload_fn` on a thread pool, then retry the failures once."""
        futures = []
        failed_protos = []

        with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
            # NOTE(review): `chunks` is derived by the caller from num_workers,
            # not chunk_size, so this progress total is only approximate.
            for batch in tqdm(batches, total=chunks + 1, desc=submit_desc):
                futures.append(executor.submit(upload_fn, batch))
                time.sleep(submit_delay)  # throttle submissions

            for job in tqdm(
                    as_completed(futures), total=len(futures),
                    desc="retry uploading failed protos..."):
                failed = job.result()
                if failed:
                    failed_protos.extend(failed)

        if failed_protos:
            ## TODO: use api_with_retries functionality
            _ = upload_fn(failed_protos)

    def concurrent_inp_upload(self, inputs, chunks):
        """
        Upload images concurrently.
        Args:
          inputs: iterable of input-proto batches
          chunks: expected number of batches minus one (progress bar only)
        """
        self._run_concurrently(self._upload_inputs, inputs, chunks, "uploading inputs...", 0.1)

    def concurrent_annot_upload(self, inputs, chunks):
        """
        Uploads annotations concurrently.
        Args:
          inputs: iterable of annotation-proto batches
          chunks: expected number of batches minus one (progress bar only)
        """
        self._run_concurrently(self.upload_annotations, inputs, chunks, "uploading...", 0.2)

    def _upload_in_chunks(self, protos, concurrent_upload):
        """Split `protos` with Chunker and hand the batches to `concurrent_upload`."""
        chunks = len(protos) // self.num_workers
        chunked_protos = Chunker(protos, self.chunk_size).chunk()
        concurrent_upload(chunked_protos, chunks)

    def upload_to_clarifai(self):
        """
        Execute data upload.
        Raises:
          Exception: if neither or both of `from_module` and `from_zoo` were given.
        """
        if self.module_dir is None and self.zoo_dataset is None:
            raise Exception("One of `from_module` and `from_zoo` must be specified. "
                            "Both can't be None or defined at the same time.")
        if self.module_dir is not None and self.zoo_dataset is not None:
            raise Exception("Use either of `from_module` or `from_zoo` but NOT both.")

        if self.module_dir is not None:
            datagen_object = load_dataset(self.module_dir, self.split)
        else:
            datagen_object = load_zoo_dataset(self.zoo_dataset, self.split)

        if self.task == "text_clf":
            dataset_obj = TextClassificationDataset(datagen_object, self.dataset_id, self.split)
            text_protos = dataset_obj._get_input_protos()
            text_protos = dataset_obj._to_list(text_protos)

            # Upload text
            self._upload_in_chunks(text_protos, self.concurrent_inp_upload)

        elif self.task == "visual_detection":
            dataset_obj = VisualDetectionDataset(datagen_object, self.dataset_id, self.split)
            img_protos, annotation_protos = dataset_obj._get_input_protos()
            img_protos = dataset_obj._to_list(img_protos)

            # Upload images
            self._upload_in_chunks(img_protos, self.concurrent_inp_upload)

            # Upload annotations:
            print("Uploading annotations.......")
            annotation_protos = dataset_obj._to_list(annotation_protos)
            self._upload_in_chunks(annotation_protos, self.concurrent_annot_upload)

        elif self.task == "visual_segmentation":
            dataset_obj = VisualSegmentationDataset(datagen_object, self.dataset_id, self.split)
            img_protos, mask_protos = dataset_obj._get_input_protos()
            img_protos = dataset_obj._to_list(img_protos)
            mask_protos = dataset_obj._to_list(mask_protos)

            # Upload images. This call had been left commented out, so masks
            # were posted for inputs that were never uploaded.
            self._upload_in_chunks(img_protos, self.concurrent_inp_upload)

            # Upload masks:
            print("Uploading masks.......")
            self._upload_in_chunks(mask_protos, self.concurrent_annot_upload)

        else:  # visual-classification & visual-captioning
            dataset_obj = VisualClassificationDataset(datagen_object, self.dataset_id, self.split)
            img_protos = dataset_obj._get_input_protos()
            img_protos = dataset_obj._to_list(img_protos)

            # Upload images
            self._upload_in_chunks(img_protos, self.concurrent_inp_upload)
@@ -24,12 +24,6 @@ def installed_module_versions_generator(stub: V2Stub,
24
24
 
25
25
  imv_success_status = {status_code_pb2.SUCCESS}
26
26
 
27
- # HACK(zeiler): this is the number of default installed module versions every app has.
28
- # so with pagination
29
- seen = {
30
- "module_manager_install": False,
31
- }
32
-
33
27
  page = 1
34
28
  while True:
35
29
  response = stub.ListInstalledModuleVersions(
@@ -39,13 +33,8 @@ def installed_module_versions_generator(stub: V2Stub,
39
33
  if response.status.code not in imv_success_status:
40
34
  raise Exception("ListInstalledModuleVersions failed with response %r" % response)
41
35
  for item in response.installed_module_versions:
42
- if item.id in seen:
43
- if not seen[item.id]: # yield it once.
44
- seen[item.id] = True
45
- yield item
46
- else:
47
- yield item
36
+ yield item
48
37
  page += 1
49
- # if we don't get a full page back (plus the hard coded ones) we know we're done.
50
- if len(response.installed_module_versions) < page_size + len(seen):
38
+ # if we don't get a full page back we know we're done.
39
+ if len(response.installed_module_versions) < page_size:
51
40
  break