clarifai 9.1.0__py3-none-any.whl → 9.3.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (37)
  1. clarifai/data_upload/datasets/__init__.py +0 -0
  2. clarifai/data_upload/datasets/base.py +67 -0
  3. clarifai/data_upload/datasets/features.py +45 -0
  4. clarifai/data_upload/datasets/image.py +236 -0
  5. clarifai/data_upload/datasets/text.py +62 -0
  6. clarifai/data_upload/datasets/zoo/__init__.py +0 -0
  7. clarifai/data_upload/datasets/zoo/coco_captions.py +99 -0
  8. clarifai/data_upload/datasets/zoo/coco_detection.py +129 -0
  9. clarifai/data_upload/datasets/zoo/coco_segmentation.py +158 -0
  10. clarifai/data_upload/examples.py +19 -0
  11. clarifai/data_upload/upload.py +269 -168
  12. clarifai/listing/installed_module_versions.py +3 -14
  13. clarifai/listing/lister.py +40 -0
  14. clarifai/listing/module_versions.py +42 -0
  15. clarifai/listing/modules.py +36 -0
  16. clarifai/modules/style.css +7 -4
  17. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/METADATA +3 -3
  18. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/RECORD +37 -13
  19. clarifai_utils/data_upload/datasets/__init__.py +0 -0
  20. clarifai_utils/data_upload/datasets/base.py +67 -0
  21. clarifai_utils/data_upload/datasets/features.py +45 -0
  22. clarifai_utils/data_upload/datasets/image.py +236 -0
  23. clarifai_utils/data_upload/datasets/text.py +62 -0
  24. clarifai_utils/data_upload/datasets/zoo/__init__.py +0 -0
  25. clarifai_utils/data_upload/datasets/zoo/coco_captions.py +99 -0
  26. clarifai_utils/data_upload/datasets/zoo/coco_detection.py +129 -0
  27. clarifai_utils/data_upload/datasets/zoo/coco_segmentation.py +158 -0
  28. clarifai_utils/data_upload/examples.py +19 -0
  29. clarifai_utils/data_upload/upload.py +269 -168
  30. clarifai_utils/listing/installed_module_versions.py +3 -14
  31. clarifai_utils/listing/lister.py +40 -0
  32. clarifai_utils/listing/module_versions.py +42 -0
  33. clarifai_utils/listing/modules.py +36 -0
  34. clarifai_utils/modules/style.css +7 -4
  35. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/LICENSE +0 -0
  36. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/WHEEL +0 -0
  37. {clarifai-9.1.0.dist-info → clarifai-9.3.1.dist-info}/top_level.txt +0 -0
clarifai/data_upload/datasets/__init__.py (file without changes)
clarifai/data_upload/datasets/base.py
@@ -0,0 +1,67 @@
+ from typing import Iterator, List
+
+ from clarifai_grpc.grpc.api import resources_pb2
+ from google.protobuf.struct_pb2 import Struct
+
+
+ class ClarifaiDataset:
+   """
+   Clarifai datasets base class.
+   """
+
+   def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+     self.datagen_object = datagen_object
+     self.dataset_id = dataset_id
+     self.split = split
+     self._all_input_protos = []
+
+   def __len__(self) -> int:
+     """
+     Get the number of input protos.
+     """
+     return len(self._all_input_protos)
+
+   def _to_list(self, input_protos: Iterator) -> List:
+     """
+     Parse a protos iterator into a list.
+     """
+     return list(input_protos)
+
+   def create_input_protos(self, image_path: str, label: str, input_id: str, dataset_id: str,
+                           metadata: Struct) -> resources_pb2.Input:
+     """
+     Create input protos for each image, label input pair.
+     Args:
+       `image_path`: path to image.
+       `label`: image label
+       `input_id`: unique input id
+       `dataset_id`: Clarifai dataset id
+       `metadata`: input metadata
+     Returns:
+       An input proto representing a single row input
+     """
+     raise NotImplementedError()
+
+   def _get_input_protos(self) -> Iterator:
+     """
+     Create input protos for each row of the dataset.
+     Returns:
+       An iterator of input protos
+     """
+     raise NotImplementedError()
+
+
+ class Chunker:
+   """
+   Split an input sequence into small chunks.
+   """
+
+   def __init__(self, seq: List, size: int) -> None:
+     self.seq = seq
+     self.size = size
+
+   def chunk(self) -> List[List]:
+     """
+     Chunk the input sequence.
+     """
+     return [self.seq[pos:pos + self.size] for pos in range(0, len(self.seq), self.size)]
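`Chunker` is the batching helper: a materialized list of input protos is sliced into fixed-size batches. A minimal sketch of that pattern, assuming the module path from the file list above (the sample data is a hypothetical stand-in for real protos):

```python
from clarifai.data_upload.datasets.base import Chunker

# Hypothetical stand-ins for input protos; Chunker is agnostic to element type.
protos = [f"proto-{i}" for i in range(10)]

# Slice into batches of 4; the final batch carries the remainder.
batches = Chunker(protos, size=4).chunk()
assert batches == [
    ["proto-0", "proto-1", "proto-2", "proto-3"],
    ["proto-4", "proto-5", "proto-6", "proto-7"],
    ["proto-8", "proto-9"],
]
```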
clarifai/data_upload/datasets/features.py
@@ -0,0 +1,45 @@
+ #! dataset output features (output from preprocessing & input to clarifai data proto builders)
+ from dataclasses import dataclass
+ from typing import List, Optional, Union
+
+
+ @dataclass
+ class TextFeatures:
+   """
+   Text classification datasets preprocessing output features.
+   """
+   text: str
+   labels: List[Union[str, int]]  # List[str or int] to cater for multi-class tasks
+   id: Optional[int] = None  # text_id
+
+
+ @dataclass
+ class VisualClassificationFeatures:
+   """
+   Image classification datasets preprocessing output features.
+   """
+   image_path: str
+   label: Union[str, int]
+   id: Optional[int] = None  # image_id
+
+
+ @dataclass
+ class VisualDetectionFeatures:
+   """
+   Image detection datasets preprocessing output features.
+   """
+   image_path: str
+   classes: List[Union[str, int]]
+   bboxes: List[List[float]]
+   id: Optional[int] = None  # image_id
+
+
+ @dataclass
+ class VisualSegmentationFeatures:
+   """
+   Image segmentation datasets preprocessing output features.
+   """
+   image_path: str
+   classes: List[str]
+   polygons: List[List[List[float]]]
+   id: Optional[int] = None  # image_id
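These dataclasses are the contract between dataset preprocessing and the proto builders below. A minimal sketch of constructing them, assuming the module path from the file list above (all paths, labels and ids are made up):

```python
from clarifai.data_upload.datasets.features import (TextFeatures,
                                                    VisualDetectionFeatures)

# One multi-label text row; `id` is optional and defaults to None.
text_row = TextFeatures(text="great sound quality", labels=["positive", "audio"], id=42)

# One detection row: parallel lists of class labels and
# [xmin, ymin, xmax, ymax] boxes normalized to [0, 1].
detection_row = VisualDetectionFeatures(
    image_path="/data/images/000000000001.jpg",
    classes=["person", "dog"],
    bboxes=[[0.10, 0.20, 0.55, 0.90], [0.60, 0.40, 0.95, 0.85]])
```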
clarifai/data_upload/datasets/image.py
@@ -0,0 +1,236 @@
+ from typing import Iterator, List, Tuple, Union
+
+ from clarifai_grpc.grpc.api import resources_pb2
+ from google.protobuf.struct_pb2 import Struct
+ from tqdm import tqdm
+
+ from .base import ClarifaiDataset
+
+
+ class VisualClassificationDataset(ClarifaiDataset):
+
+   def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+     super().__init__(datagen_object, dataset_id, split)
+
+   def create_input_protos(self, image_path: str, labels: List[Union[str, int]], input_id: str,
+                           dataset_id: str, metadata: Struct) -> resources_pb2.Input:
+     """
+     Create input protos for each image, label input pair.
+     Args:
+       `image_path`: image path.
+       `labels`: image label(s)
+       `input_id`: unique input id
+       `dataset_id`: Clarifai dataset id
+       `metadata`: image metadata
+     Returns:
+       An input proto representing a single row input
+     """
+     input_proto = resources_pb2.Input(
+         id=input_id,
+         dataset_ids=[dataset_id],
+         data=resources_pb2.Data(
+             image=resources_pb2.Image(base64=open(image_path, 'rb').read()),
+             concepts=[
+                 resources_pb2.Concept(
+                     id=f"id-{''.join(_label.split(' '))}", name=_label, value=1.)
+                 for _label in labels
+             ],
+             metadata=metadata))
+
+     return input_proto
+
+   def _get_input_protos(self) -> Iterator:
+     """
+     Create input image protos for each data generator item.
+     Returns:
+       Input proto iterator
+     """
+     for i, item in tqdm(enumerate(self.datagen_object), desc="Creating input protos..."):
+       metadata = Struct()
+       image_path = item.image_path
+       label = item.label if isinstance(item.label, list) else [item.label]  # clarifai concept
+       input_id = f"{self.dataset_id}-{self.split}-{i}" if item.id is None else f"{self.split}-{str(item.id)}"
+
+       input_proto = self.create_input_protos(image_path, label, input_id, self.dataset_id,
+                                              metadata)
+       self._all_input_protos.append(input_proto)
+
+     return iter(self._all_input_protos)
+
+
+ class VisualDetectionDataset(ClarifaiDataset):
+   """
+   Visual detection dataset proto class.
+   """
+
+   def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+     super().__init__(datagen_object, dataset_id, split)
+     self._annotation_protos = []
+
+   def create_input_protos(self, image_path: str, input_id: str, dataset_id: str,
+                           metadata: Struct) -> resources_pb2.Input:
+     """
+     Create an input proto for each image.
+     Args:
+       `image_path`: file path to image
+       `input_id`: unique input id
+       `dataset_id`: Clarifai dataset id
+       `metadata`: image metadata
+     Returns:
+       An input proto representing a single row input
+     """
+     input_image_proto = resources_pb2.Input(
+         id=input_id,
+         dataset_ids=[dataset_id],
+         data=resources_pb2.Data(
+             image=resources_pb2.Image(base64=open(image_path, 'rb').read()), metadata=metadata))
+
+     return input_image_proto
+
+   def create_annotation_proto(self, label: str, annotations: List, input_id: str,
+                               dataset_id: str) -> resources_pb2.Annotation:
+     """
+     Create an annotation proto for each bounding box, label pair.
+     Args:
+       `label`: annotation label
+       `annotations`: a list of a single bbox's coordinates.
+       `input_id`: unique input id
+       `dataset_id`: Clarifai dataset id
+     Returns:
+       An annotation proto representing a single bounding box
+     """
+     input_annot_proto = resources_pb2.Annotation(
+         input_id=input_id,
+         data=resources_pb2.Data(regions=[
+             resources_pb2.Region(
+                 region_info=resources_pb2.RegionInfo(bounding_box=resources_pb2.BoundingBox(
+                     # Annotations ordering: [xmin, ymin, xmax, ymax]
+                     # top_row must be less than bottom_row
+                     # left_col must be less than right_col
+                     top_row=annotations[1],  # y_min
+                     left_col=annotations[0],  # x_min
+                     bottom_row=annotations[3],  # y_max
+                     right_col=annotations[2]  # x_max
+                 )),
+                 data=resources_pb2.Data(concepts=[
+                     resources_pb2.Concept(
+                         id=f"id-{''.join(label.split(' '))}", name=label, value=1.)
+                 ]))
+         ]))
+
+     return input_annot_proto
+
+   def _get_input_protos(self) -> Tuple[Iterator, Iterator]:
+     """
+     Create input image and annotation protos for each data generator item.
+     Returns:
+       Input and annotation proto iterators.
+     """
+     for i, item in tqdm(enumerate(self.datagen_object), desc="Creating input protos..."):
+       metadata = Struct()
+       image = item.image_path
+       labels = item.classes  # list: [l1,...,ln]
+       bboxes = item.bboxes  # [[xmin,ymin,xmax,ymax],...,[xmin,ymin,xmax,ymax]]
+       input_id = f"{self.dataset_id}-{self.split}-{i}" if item.id is None else f"{self.split}-{str(item.id)}"
+       metadata.update({"label": labels, "split": self.split})
+
+       input_image_proto = self.create_input_protos(image, input_id, self.dataset_id, metadata)
+       self._all_input_protos.append(input_image_proto)
+
+       # Iterate over bboxes and classes:
+       # one id could have more than one bbox and label.
+       for j in range(len(bboxes)):
+         input_annot_proto = self.create_annotation_proto(labels[j], bboxes[j], input_id,
+                                                          self.dataset_id)
+         self._annotation_protos.append(input_annot_proto)
+
+     return iter(self._all_input_protos), iter(self._annotation_protos)
+
+
+ class VisualSegmentationDataset(ClarifaiDataset):
+   """
+   Visual segmentation dataset proto class.
+   """
+
+   def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+     super().__init__(datagen_object, dataset_id, split)
+     self._mask_protos = []  # mask or polygon protos
+
+   def create_input_protos(self, image_path: str, input_id: str, dataset_id: str,
+                           metadata: Struct) -> resources_pb2.Input:
+     """
+     Create an input proto for each image.
+     Args:
+       `image_path`: absolute image file path
+       `input_id`: unique input id
+       `dataset_id`: Clarifai dataset id
+       `metadata`: image metadata
+     Returns:
+       An input proto representing a single input item
+     """
+     input_image_proto = resources_pb2.Input(
+         id=input_id,
+         dataset_ids=[dataset_id],
+         data=resources_pb2.Data(
+             image=resources_pb2.Image(base64=open(image_path, 'rb').read()), metadata=metadata))
+
+     return input_image_proto
+
+   def create_mask_proto(self, label: str, polygons: List[List[float]], input_id: str,
+                         dataset_id: str) -> resources_pb2.Annotation:
+     """
+     Create a mask proto for an input polygon/mask and label.
+     Args:
+       `label`: image label
+       `polygons`: iterable of polygon x,y points
+       `input_id`: unique input id
+       `dataset_id`: Clarifai dataset id
+     Returns:
+       An annotation proto corresponding to a single polygon
+     """
+     input_mask_proto = resources_pb2.Annotation(
+         input_id=input_id,
+         data=resources_pb2.Data(regions=[
+             resources_pb2.Region(
+                 region_info=resources_pb2.RegionInfo(polygon=resources_pb2.Polygon(
+                     points=[
+                         resources_pb2.Point(
+                             row=_point[1],  # row is the y coordinate
+                             col=_point[0],  # col is the x coordinate
+                             visibility="VISIBLE") for _point in polygons
+                     ])),
+                 data=resources_pb2.Data(concepts=[
+                     resources_pb2.Concept(
+                         id=f"id-{''.join(label.split(' '))}", name=label, value=1.)
+                 ]))
+         ]))
+
+     return input_mask_proto
+
+   def _get_input_protos(self) -> Tuple[Iterator, Iterator]:
+     """
+     Create input image and annotation protos for each data generator item.
+     Returns:
+       Input and annotation proto iterators.
+     """
+     for i, item in tqdm(enumerate(self.datagen_object), desc="Creating input protos..."):
+       metadata = Struct()
+       image = item.image_path  # image path
+       labels = item.classes  # list of class labels
+       _polygons = item.polygons  # list of polygons: [[[x,y],...,[x,y]],...]
+       input_id = f"{self.dataset_id}-{self.split}-{i}" if item.id is None else f"{self.split}-{str(item.id)}"
+       metadata.update({"label": labels, "split": self.split})
+
+       input_image_proto = self.create_input_protos(image, input_id, self.dataset_id, metadata)
+       self._all_input_protos.append(input_image_proto)
+
+       ## Iterate over each polygon and create a mask proto for upload to clarifai.
+       ## The lengths of the polygons list and the labels list must be equal.
+       for j, _polygon in enumerate(_polygons):
+         try:
+           input_mask_proto = self.create_mask_proto(labels[j], _polygon, input_id,
+                                                     self.dataset_id)
+           self._mask_protos.append(input_mask_proto)
+         except IndexError:
+           continue
+
+     return iter(self._all_input_protos), iter(self._mask_protos)
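Each dataset class consumes a generator of the matching feature dataclass and turns it into input protos (plus annotation protos for detection and segmentation). A minimal classification sketch, assuming the module paths from the file list above; the paths and ids are hypothetical, and the image files must exist locally because `create_input_protos` reads them with `open(image_path, 'rb')`:

```python
from clarifai.data_upload.datasets.features import VisualClassificationFeatures
from clarifai.data_upload.datasets.image import VisualClassificationDataset

def datagen():
  # Hypothetical two-image dataset; label may be a single value or a list.
  yield VisualClassificationFeatures("/data/cats/1.jpg", "cat")
  yield VisualClassificationFeatures("/data/dogs/1.jpg", ["dog", "puppy"], id=7)

dataset = VisualClassificationDataset(datagen(), dataset_id="pets", split="train")
input_protos = dataset._get_input_protos()  # iterator of resources_pb2.Input
print(len(dataset))  # protos are built eagerly, so the count is now known
```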
clarifai/data_upload/datasets/text.py
@@ -0,0 +1,62 @@
+ from typing import Iterator, List
+
+ from clarifai_grpc.grpc.api import resources_pb2
+ from google.protobuf.struct_pb2 import Struct
+ from tqdm import tqdm
+
+ from .base import ClarifaiDataset
+
+
+ class TextClassificationDataset(ClarifaiDataset):
+   """
+   Upload text classification datasets to clarifai datasets.
+   """
+
+   def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+     super().__init__(datagen_object, dataset_id, split)
+
+   def create_input_protos(self, text_input: str, labels: List[str], input_id: str, dataset_id: str,
+                           metadata: Struct) -> resources_pb2.Input:
+     """
+     Create input protos for each text, label input pair.
+     Args:
+       `text_input`: text string.
+       `labels`: text labels
+       `input_id`: unique input id
+       `dataset_id`: Clarifai dataset id
+       `metadata`: input metadata
+     Returns:
+       An input proto representing a single row input
+     """
+     input_proto = resources_pb2.Input(
+         id=input_id,
+         dataset_ids=[dataset_id],
+         data=resources_pb2.Data(
+             text=resources_pb2.Text(raw=text_input),
+             concepts=[
+                 resources_pb2.Concept(
+                     id=f"id-{''.join(_label.split(' '))}", name=_label, value=1.)
+                 for _label in labels
+             ],
+             metadata=metadata))
+
+     return input_proto
+
+   def _get_input_protos(self) -> Iterator:
+     """
+     Create input protos for each data generator item.
+     Returns:
+       An iterator of input protos
+     """
+     for i, item in tqdm(enumerate(self.datagen_object), desc="Loading text data"):
+       metadata = Struct()
+       text = item.text
+       labels = item.labels if isinstance(item.labels, list) else [item.labels]  # clarifai concept
+       input_id = f"{self.dataset_id}-{self.split}-{i}" if item.id is None else f"{self.split}-{str(item.id)}"
+       metadata.update({"label": labels, "split": self.split})
+
+       input_proto = self.create_input_protos(text, labels, input_id, self.dataset_id, metadata)
+
+       self._all_input_protos.append(input_proto)
+
+     return iter(self._all_input_protos)
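The text path mirrors the image one, with `TextFeatures` rows in and `resources_pb2.Input` protos out. A minimal sketch, assuming the module paths from the file list above (texts, labels and ids are hypothetical):

```python
from clarifai.data_upload.datasets.features import TextFeatures
from clarifai.data_upload.datasets.text import TextClassificationDataset

def datagen():
  # labels may be a bare string or a list; _get_input_protos normalizes both.
  yield TextFeatures(text="terrible battery life", labels="negative")
  yield TextFeatures(text="fast and reliable", labels=["positive"], id=3)

dataset = TextClassificationDataset(datagen(), dataset_id="reviews", split="train")
protos = list(dataset._get_input_protos())
```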
clarifai/data_upload/datasets/zoo/__init__.py (file without changes)
clarifai/data_upload/datasets/zoo/coco_captions.py
@@ -0,0 +1,99 @@
+ #! COCO 2017 image captioning dataset
+
+ import os
+ import zipfile
+ from glob import glob
+
+ import requests
+ from pycocotools.coco import COCO
+ from tqdm import tqdm
+
+ from ..features import VisualClassificationFeatures
+
+
+ class COCOCaptionsDataset:
+   """COCO 2017 Image Captioning Dataset."""
+
+   def __init__(self, split: str = "train"):
+     """
+     Initialize the coco dataset.
+     Args:
+       split: "train" or "val"
+     """
+     self.filenames = {
+         "train": "train2017.zip",
+         "val": "val2017.zip",
+         "annotations": "annotations_trainval2017.zip"
+     }
+     self.split = split
+     self.url = "http://images.cocodataset.org/zips/"  # coco base image-zip url
+     self.data_dir = os.path.join(os.curdir, ".data")  # data storage directory
+     self.extracted_coco_dirs = {"train": None, "val": None, "annotations": None}
+
+   def coco_download(self, save_dir):
+     """Download the coco dataset."""
+     if not os.path.exists(save_dir):
+       os.mkdir(save_dir)
+
+     # Check if the train, val and annotation dirs already exist
+     # so that the coco2017 data isn't downloaded again.
+     for key, filename in self.filenames.items():
+       if glob(f"{save_dir}/{key}*"):
+         print("Dataset already downloaded and extracted")
+         continue
+
+       print("-" * 80)
+       print(f"Downloading {filename}")
+       print("-" * 80)
+
+       if "annotations" in filename:
+         self.url = "http://images.cocodataset.org/annotations/"
+
+       response = requests.get(self.url + filename, stream=True)
+       response.raise_for_status()
+       with open(os.path.join(save_dir, filename), "wb") as _file:
+         for chunk in tqdm(response.iter_content(chunk_size=5124000)):
+           if chunk:
+             _file.write(chunk)
+       print("Data download complete...")
+
+       # Extract the archive.
+       zf = zipfile.ZipFile(os.path.join(save_dir, filename))
+       print(f" Extracting {filename} file")
+       zf.extractall(path=save_dir)
+       # Delete the coco zip.
+       print(f" Deleting {filename}")
+       os.remove(path=os.path.join(save_dir, filename))
+
+   def dataloader(self):
+     """
+     Transform coco image captioning data into a clarifai proto compatible
+     format for upload.
+     Returns:
+       VisualClassificationFeatures type generator.
+     """
+     if isinstance(self.filenames, dict) and len(self.filenames) == 3:  # train, val, annotations
+       self.coco_download(self.data_dir)
+       self.extracted_coco_dirs["train"] = [os.path.join(self.data_dir, i)
+                                            for i in os.listdir(self.data_dir) if "train" in i][0]
+       self.extracted_coco_dirs["val"] = [os.path.join(self.data_dir, i)
+                                          for i in os.listdir(self.data_dir) if "val" in i][0]
+
+       self.extracted_coco_dirs["annotations"] = [
+           os.path.join(self.data_dir, i) for i in os.listdir(self.data_dir) if "annotations" in i
+       ][0]
+     else:
+       raise Exception(f"`filenames` must contain the 3 coco zip file names: "
+                       f"train, val and annotations. Found {len(self.filenames)} items instead.")
+
+     annot_file = glob(self.extracted_coco_dirs["annotations"] + f"/captions_{self.split}*")[0]
+     coco = COCO(annot_file)
+     annot_ids = coco.getAnnIds()
+     annotations = coco.loadAnns(annot_ids)
+     for annot in annotations:
+       image_path = glob(self.extracted_coco_dirs[self.split] +
+                         f"/{str(annot['image_id']).zfill(12)}*")[0]
+       # image_captioning and image classification datasets have the same
+       # image-label input feature formats
+       yield VisualClassificationFeatures(image_path, annot["caption"], annot["image_id"])
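A minimal usage sketch for the captions loader, assuming the module path from the file list above. Note that the first call downloads and extracts the full COCO 2017 archives into ./.data, which is tens of gigabytes for the train split:

```python
from clarifai.data_upload.datasets.zoo.coco_captions import COCOCaptionsDataset

# Yields VisualClassificationFeatures(image_path, caption, image_id) rows,
# with the caption carried in the `label` field.
captions = COCOCaptionsDataset(split="val")
for features in captions.dataloader():
  print(features.id, features.image_path, features.label)
  break  # peek at the first row only
```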
clarifai/data_upload/datasets/zoo/coco_detection.py
@@ -0,0 +1,129 @@
+ #! COCO 2017 detection dataset
+
+ import os
+ import zipfile
+ from glob import glob
+
+ import cv2
+ import requests
+ from pycocotools.coco import COCO
+ from tqdm import tqdm
+
+ from ..features import VisualDetectionFeatures
+
+
+ class COCODetectionDataset:
+   """COCO 2017 Image Detection Dataset."""
+
+   def __init__(self, split: str = "train"):
+     """
+     Initialize the coco dataset.
+     Args:
+       split: "train" or "val"
+     """
+     self.filenames = {
+         "train": "train2017.zip",
+         "val": "val2017.zip",
+         "annotations": "annotations_trainval2017.zip"
+     }
+     self.split = split
+     self.url = "http://images.cocodataset.org/zips/"  # coco base image-zip url
+     self.data_dir = os.path.join(os.curdir, ".data")  # data storage directory
+     self.extracted_coco_dirs = {"train": None, "val": None, "annotations": None}
+
+   def coco_download(self, save_dir):
+     """Download the coco dataset."""
+     if not os.path.exists(save_dir):
+       os.mkdir(save_dir)
+
+     # Check if the train*, val* and annotation* dirs already exist
+     # so that the coco2017 data isn't downloaded again.
+     for key, filename in self.filenames.items():
+       if glob(f"{save_dir}/{key}*"):
+         print("Dataset already downloaded and extracted")
+         continue
+
+       print("-" * 80)
+       print(f"Downloading {filename}")
+       print("-" * 80)
+
+       if "annotations" in filename:
+         self.url = "http://images.cocodataset.org/annotations/"
+
+       response = requests.get(self.url + filename, stream=True)
+       response.raise_for_status()
+       with open(os.path.join(save_dir, filename), "wb") as _file:
+         for chunk in tqdm(response.iter_content(chunk_size=5124000)):
+           if chunk:
+             _file.write(chunk)
+       print("Coco data download complete...")
+
+       # Extract the archive.
+       zf = zipfile.ZipFile(os.path.join(save_dir, filename))
+       print(f" Extracting {filename} file")
+       zf.extractall(path=save_dir)
+       # Delete the coco zip.
+       print(f" Deleting {filename}")
+       os.remove(path=os.path.join(save_dir, filename))
+
+   def dataloader(self):
+     """
+     Transform coco object detection data into a clarifai proto compatible
+     format for upload.
+     Returns:
+       VisualDetectionFeatures type generator.
+     """
+     if isinstance(self.filenames, dict) and len(self.filenames) == 3:  # train, val, annotations
+       self.coco_download(self.data_dir)
+       self.extracted_coco_dirs["train"] = [os.path.join(self.data_dir, i)
+                                            for i in os.listdir(self.data_dir) if "train" in i][0]
+       self.extracted_coco_dirs["val"] = [os.path.join(self.data_dir, i)
+                                          for i in os.listdir(self.data_dir) if "val" in i][0]
+
+       self.extracted_coco_dirs["annotations"] = [
+           os.path.join(self.data_dir, i) for i in os.listdir(self.data_dir) if "annotations" in i
+       ][0]
+     else:
+       raise Exception(f"`filenames` must contain the 3 coco zip file names: "
+                       f"train, val and annotations. Found {len(self.filenames)} items instead.")
+
+     annot_file = glob(self.extracted_coco_dirs["annotations"] + f"/instances_{self.split}*")[0]
+     coco = COCO(annot_file)
+     categories = coco.loadCats(coco.getCatIds())
+     cat_id_map = {category["id"]: category["name"] for category in categories}
+     cat_img_ids = {}
+     for cat_id in list(cat_id_map.keys()):
+       cat_img_ids[cat_id] = coco.getImgIds(catIds=[cat_id])
+
+     # Deduplicate image ids: an image that holds several categories
+     # appears once per category in cat_img_ids.
+     img_ids = set()
+     for ids in cat_img_ids.values():
+       img_ids.update(ids)
+
+     # Get annotations for each image id.
+     for _id in img_ids:
+       annots = []  # bboxes
+       class_names = []
+       labels = [cat_id for cat_id in cat_img_ids if _id in cat_img_ids[cat_id]]
+       image_path = glob(self.extracted_coco_dirs[self.split] + f"/{str(_id).zfill(12)}*")[0]
+
+       image_height, image_width = cv2.imread(image_path).shape[:2]
+       for cat_id in labels:
+         annot_ids = coco.getAnnIds(imgIds=_id, catIds=[cat_id])
+         if len(annot_ids) > 0:
+           img_annotations = coco.loadAnns(annot_ids)
+           for ann in img_annotations:
+             class_names.append(cat_id_map[cat_id])
+             # Normalize bbox coordinates to [0, 1] relative to the image size.
+             x_min = ann['bbox'][0] / image_width  # left_col
+             y_min = ann['bbox'][1] / image_height  # top_row
+             x_max = (ann['bbox'][0] + ann['bbox'][2]) / image_width  # right_col
+             y_max = (ann['bbox'][1] + ann['bbox'][3]) / image_height  # bottom_row
+             annots.append([x_min, y_min, x_max, y_max])
+         else:  # no annotations for the given image_id-cat_id pair
+           continue
+       assert len(class_names) == len(annots), (
+           f"Num classes must match num bbox annotations for a single image. "
+           f"Found {len(class_names)} classes and {len(annots)} bboxes.")
+
+       yield VisualDetectionFeatures(image_path, class_names, annots, _id)
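And the matching sketch for the detection loader, under the same assumptions (module path taken from the file list, COCO 2017 downloaded into ./.data on first use):

```python
from clarifai.data_upload.datasets.zoo.coco_detection import COCODetectionDataset

detection = COCODetectionDataset(split="val")
for features in detection.dataloader():
  # classes and bboxes are parallel lists; each box is a normalized
  # [xmin, ymin, xmax, ymax], the ordering create_annotation_proto expects.
  print(features.id, list(zip(features.classes, features.bboxes))[:2])
  break
```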