PyPI - clarifai - Versions diffs - 9.7.0__py3-none-any.whl → 9.7.2__py3-none-any.whl - Mend

clarifai 9.7.0py3-none-any.whl → 9.7.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (336) hide show

clarifai_utils/datasets/upload/base.py ADDED Viewed

@@ -0,0 +1,66 @@
+from collections import defaultdict
+from typing import Iterator, List, Tuple, TypeVar, Union
+from clarifai_grpc.grpc.api import resources_pb2
+from clarifai.client.input import Inputs
+from clarifai.datasets.upload.features import (TextFeatures, VisualClassificationFeatures,
+                                               VisualDetectionFeatures, VisualSegmentationFeatures)
+# Type variable bound to the union of the four feature classes imported above;
+# it is used as the return annotation of `ClarifaiDataLoader.__getitem__`.
+OutputFeaturesType = TypeVar(
+    'OutputFeaturesType',
+    bound=Union[TextFeatures, VisualClassificationFeatures, VisualDetectionFeatures,
+                VisualSegmentationFeatures])
+class ClarifaiDataset:
+  """Clarifai datasets base class."""
+  def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+    self.datagen_object = datagen_object
+    self.dataset_id = dataset_id
+    self.split = split
+    self.all_input_ids = {}
+    self._all_input_protos = {}
+    self._all_annotation_protos = defaultdict(list)
+    self.input_object = Inputs()
+  def __len__(self) -> int:
+    """Get size of all input protos"""
+    return len(self.datagen_object)
+  def _to_list(self, input_protos: Iterator) -> List:
+    """Parse protos iterator to list."""
+    return list(input_protos)
+  def _extract_protos(self) -> None:
+    """Create input image protos for each data generator item."""
+    raise NotImplementedError()
+  def get_protos(self, input_ids: List[int]
+                ) -> Tuple[List[resources_pb2.Input], List[resources_pb2.Annotation]]:
+    """Get input and annotation protos based on input_ids.
+    Args:
+      input_ids: List of input IDs to retrieve the protos for.
+    Returns:
+      Input and Annotation proto iterators for the specified input IDs.
+    """
+    input_protos, annotation_protos = self._extract_protos(input_ids)
+    return input_protos, annotation_protos
+class ClarifaiDataLoader:
+  """Clarifai data loader base class."""
+  def __init__(self, split: str) -> None:
+    pass
+  def load_data(self) -> None:
+    raise NotImplementedError()
+  def __len__(self) -> int:
+    raise NotImplementedError()
+  def __getitem__(self, index: int) -> OutputFeaturesType:
+    raise NotImplementedError()

clarifai_utils/datasets/upload/examples/README.md ADDED Viewed

@@ -0,0 +1,31 @@
+## Dataset upload from local directory
+Examples of how to upload datasets from a local directory into a Clarifai app using the `module_dir` feature of `Dataset`.
+**Note:**
+- Ensure that the `CLARIFAI_PAT` environment variable is set.
+- Ensure that the appropriate base workflow is set for indexing the respective input type.
+## Image Classification - Cifar10
+```python
+from clarifai.client.dataset import Dataset
+dataset = Dataset(user_id="user_id", app_id="app_id", dataset_id="dataset_id")
+dataset.upload_dataset(task="visual_classification", split="train", module_dir="path_to_cifar10_module")
+```
+## Image Classification - [Food-101](https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/)
+```python
+from clarifai.client.dataset import Dataset
+dataset = Dataset(user_id="user_id", app_id="app_id", dataset_id="dataset_id")
+dataset.upload_dataset(task="visual_classification", split="train", module_dir="path_to_food-101_module")
+```
+## Text Classification - IMDB Reviews
+```python
+from clarifai.client.dataset import Dataset
+dataset = Dataset(user_id="user_id", app_id="app_id", dataset_id="dataset_id")
+dataset.upload_dataset(task="text_clf", split="train", module_dir="path_to_imdb_reviews_module")
+```

clarifai_utils/datasets/upload/examples/image_classification/cifar10/dataset.py ADDED Viewed

@@ -0,0 +1,42 @@
+#! Cifar10 Dataset
+import csv
+import os
+from clarifai.datasets.upload.base import ClarifaiDataLoader
+from clarifai.datasets.upload.features import VisualClassificationFeatures
+class Cifar10DataLoader(ClarifaiDataLoader):
+  """Cifar10 Dataset."""
+  def __init__(self, split: str = "train"):
+    """Initialize dataset params.
+    Args:
+      split: "train" or "test"
+    """
+    self.split = split
+    self.data_dirs = {
+        "train": os.path.join(os.path.dirname(__file__), "cifar_small_train.csv"),
+        "test": os.path.join(os.path.dirname(__file__), "cifar_small_test.csv")
+    }
+    self.data = self.load_data()
+  def load_data(self):
+    data = []
+    with open(self.data_dirs[self.split]) as _file:
+      reader = csv.reader(_file)
+      next(reader, None)  # skip header
+      for review in reader:
+        data.append((review[0], review[1]))
+    return data
+  def __getitem__(self, index):
+    item = self.data[index]
+    return VisualClassificationFeatures(
+        image_path=os.path.join(os.path.dirname(__file__), item[0]),
+        label=item[1],
+        id=os.path.basename(item[0]).split(".")[0])
+  def __len__(self):
+    return len(self.data)

clarifai_utils/datasets/upload/examples/image_classification/food-101/dataset.py ADDED Viewed

@@ -0,0 +1,39 @@
+import os
+from clarifai.datasets.upload.base import ClarifaiDataLoader
+from clarifai.datasets.upload.features import VisualClassificationFeatures
+class Food101DataLoader(ClarifaiDataLoader):
+  """Food-101 Image Classification Dataset."""
+  def __init__(self, split: str = "train"):
+    """Initialize dataset params.
+    Args:
+      split: "train" or "test"
+    """
+    self.split = split
+    self.image_dir = {"train": os.path.join(os.path.dirname(__file__), "images")}
+    self.load_data()
+  def load_data(self):
+    """Load data for the food-101 dataset."""
+    self.data = []
+    class_names = os.listdir(self.image_dir[self.split])
+    for class_name in class_names:
+      for image in os.listdir(os.path.join(self.image_dir[self.split], class_name)):
+        image_path = os.path.join(self.image_dir[self.split], class_name, image)
+        self.data.append({
+            "image_path": image_path,
+            "class_name": class_name,
+        })
+  def __getitem__(self, idx):
+    data_item = self.data[idx]
+    image_path = data_item["image_path"]
+    class_name = data_item["class_name"]
+    return VisualClassificationFeatures(
+        image_path=image_path, label=class_name, id=os.path.basename(image_path).split(".")[0])
+  def __len__(self):
+    return len(self.data)

clarifai_utils/datasets/upload/examples/text_classification/imdb_dataset/dataset.py ADDED Viewed

@@ -0,0 +1,37 @@
+import csv
+import os
+from clarifai.datasets.upload.base import ClarifaiDataLoader
+from clarifai.datasets.upload.features import TextFeatures
+class IMDBMovieReviewsDataLoader(ClarifaiDataLoader):
+  """IMDB 50K Movie Reviews Dataset."""
+  def __init__(self, split: str = "train"):
+    """Initialize dataset params.
+    Args:
+        split: "train" or "test"
+    """
+    self.split = split
+    self.data_dirs = {
+        "train": os.path.join(os.path.dirname(__file__), "train.csv"),
+        "test": os.path.join(os.path.dirname(__file__), "test.csv")
+    }
+    self.data = []
+    self.load_data()
+  def load_data(self):
+    with open(self.data_dirs[self.split]) as _file:
+      reader = csv.reader(_file)
+      next(reader, None)  # skip header
+      for review in reader:
+        self.data.append({"text": review[0], "labels": review[1], "id": None})
+  def __getitem__(self, idx):
+    item = self.data[idx]
+    return TextFeatures(text=item["text"], labels=item["labels"], id=item["id"])
+  def __len__(self):
+    return len(self.data)

clarifai_utils/{data_upload/datasets → datasets/upload}/features.py RENAMED Viewed

@@ -5,9 +5,7 @@ from typing import List, Optional, Union
 @dataclass
 class TextFeatures:
-  """
-  Text classification datasets preprocessing output features.
-  """
+  """Text classification datasets preprocessing output features."""
   text: str
   labels: List[Union[str, int]]  # List[str or int] to cater for multi-class tasks
   id: Optional[int] = None  # text_id
@@ -15,9 +13,7 @@ class TextFeatures:
 @dataclass
 class VisualClassificationFeatures:
-  """
-  Image classification datasets preprocessing output features.
-  """
+  """Image classification datasets preprocessing output features."""
   image_path: str
   label: Union[str, int]
   geo_info: Optional[List[float]] = None  #[Longitude, Latitude]
@@ -26,9 +22,7 @@ class VisualClassificationFeatures:
 @dataclass
 class VisualDetectionFeatures:
-  """
-  Image Detection datasets preprocessing output features.
-  """
+  """Image Detection datasets preprocessing output features."""
   image_path: str
   classes: List[Union[str, int]]
   bboxes: List[List[float]]
@@ -38,9 +32,7 @@ class VisualDetectionFeatures:
 @dataclass
 class VisualSegmentationFeatures:
-  """
-  Image Segmentation datasets preprocessing output features.
-  """
+  """Image Segmentation datasets preprocessing output features."""
   image_path: str
   classes: List[Union[str, int]]
   polygons: List[List[List[float]]]

clarifai_utils/datasets/upload/image.py ADDED Viewed

@@ -0,0 +1,156 @@
+import os
+from concurrent.futures import ThreadPoolExecutor
+from typing import Iterator, List, Tuple
+from clarifai_grpc.grpc.api import resources_pb2
+from google.protobuf.struct_pb2 import Struct
+from .base import ClarifaiDataset
+class VisualClassificationDataset(ClarifaiDataset):
+  def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+    super().__init__(datagen_object, dataset_id, split)
+  def _extract_protos(self, batch_input_ids: List[str]
+                     ) -> Tuple[List[resources_pb2.Input], List[resources_pb2.Annotation]]:
+    """Create input image and annotation protos for batch of input ids.
+    Args:
+      batch_input_ids: List of input IDs to retrieve the protos for.
+    Returns:
+      input_protos: List of input protos.
+      annotation_protos: List of annotation protos.
+    """
+    input_protos, annotation_protos = [], []
+    def process_datagen_item(id):
+      datagen_item = self.datagen_object[id]
+      metadata = Struct()
+      image_path = datagen_item.image_path
+      label = datagen_item.label if isinstance(datagen_item.label,
+                                               list) else [datagen_item.label]  # clarifai concept
+      input_id = f"{self.dataset_id}-{self.split}-{id}" if datagen_item.id is None else f"{self.split}-{str(datagen_item.id)}"
+      geo_info = datagen_item.geo_info
+      metadata.update({"filename": os.path.basename(image_path), "split": self.split})
+      self.all_input_ids[id] = input_id
+      input_protos.append(
+          self.input_object.get_input_from_file(
+              input_id=input_id,
+              image_file=image_path,
+              dataset_id=self.dataset_id,
+              labels=label,
+              geo_info=geo_info,
+              metadata=metadata))
+    with ThreadPoolExecutor(max_workers=4) as executor:
+      futures = [executor.submit(process_datagen_item, id) for id in batch_input_ids]
+      for job in futures:
+        job.result()
+    return input_protos, annotation_protos
+class VisualDetectionDataset(ClarifaiDataset):
+  """Visual detection dataset proto class."""
+  def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+    super().__init__(datagen_object, dataset_id, split)
+  def _extract_protos(self, batch_input_ids: List[int]
+                     ) -> Tuple[List[resources_pb2.Input], List[resources_pb2.Annotation]]:
+    """Create input image protos for each data generator item.
+    Args:
+      batch_input_ids: List of input IDs to retrieve the protos for.
+    Returns:
+      input_protos: List of input protos.
+      annotation_protos: List of annotation protos.
+    """
+    input_protos, annotation_protos = [], []
+    def process_datagen_item(id):
+      datagen_item = self.datagen_object[id]
+      metadata = Struct()
+      image = datagen_item.image_path
+      labels = datagen_item.classes  # list:[l1,...,ln]
+      bboxes = datagen_item.bboxes  # [[xmin,ymin,xmax,ymax],...,[xmin,ymin,xmax,ymax]]
+      input_id = f"{self.dataset_id}-{self.split}-{i}" if datagen_item.id is None else f"{self.split}-{str(datagen_item.id)}"
+      metadata.update({"filename": os.path.basename(image), "split": self.split})
+      geo_info = datagen_item.geo_info
+      self.all_input_ids[id] = input_id
+      input_protos.append(
+          self.input_object.get_input_from_file(
+              input_id=input_id,
+              image_file=image,
+              dataset_id=self.dataset_id,
+              geo_info=geo_info,
+              metadata=metadata))
+      # iter over bboxes and classes
+      # one id could have more than one bbox and label
+      for i in range(len(bboxes)):
+        annotation_protos.append(
+            self.input_object.get_annotation_proto(
+                input_id=input_id, label=labels[i], annotations=bboxes[i]))
+    with ThreadPoolExecutor(max_workers=4) as executor:
+      futures = [executor.submit(process_datagen_item, id) for id in batch_input_ids]
+      for job in futures:
+        job.result()
+    return input_protos, annotation_protos
+class VisualSegmentationDataset(ClarifaiDataset):
+  """Visual segmentation dataset proto class."""
+  def __init__(self, datagen_object: Iterator, dataset_id: str, split: str) -> None:
+    super().__init__(datagen_object, dataset_id, split)
+  def _extract_protos(self, batch_input_ids: List[str]
+                     ) -> Tuple[List[resources_pb2.Input], List[resources_pb2.Annotation]]:
+    """Create input image and annotation protos for batch of input ids.
+    Args:
+      batch_input_ids: List of input IDs to retrieve the protos for.
+    Returns:
+      input_protos: List of input protos.
+      annotation_protos: List of annotation protos.
+    """
+    input_protos, annotation_protos = [], []
+    def process_datagen_item(id):
+      datagen_item = self.datagen_object[id]
+      metadata = Struct()
+      image = datagen_item.image_path
+      labels = datagen_item.classes
+      _polygons = datagen_item.polygons  # list of polygons: [[[x,y],...,[x,y]],...]
+      input_id = f"{self.dataset_id}-{self.split}-{i}" if datagen_item.id is None else f"{self.split}-{str(datagen_item.id)}"
+      metadata.update({"filename": os.path.basename(image), "split": self.split})
+      geo_info = datagen_item.geo_info
+      self.all_input_ids[id] = input_id
+      input_protos.append(
+          self.input_object.get_input_from_file(
+              input_id=input_id,
+              image_file=image,
+              dataset_id=self.dataset_id,
+              geo_info=geo_info,
+              metadata=metadata))
+      ## Iterate over each masked image and create a proto for upload to clarifai
+      ## The length of masks/polygons-list and labels must be equal
+      for i, _polygon in enumerate(_polygons):
+        try:
+          annotation_protos.append(
+              self.input_object.get_mask_proto(
+                  input_id=input_id, label=labels[i], polygons=_polygon))
+        except IndexError:
+          continue
+    with ThreadPoolExecutor(max_workers=4) as executor:
+      futures = [executor.submit(process_datagen_item, id) for id in batch_input_ids]
+      for job in futures:
+        job.result()
+    return input_protos, annotation_protos

clarifai_utils/datasets/upload/loaders/README.md ADDED Viewed

@@ -0,0 +1,49 @@
+## Dataset Loaders
+A collection of data preprocessing modules for popular public datasets to allow for compatible upload into Clarifai user app datasets.
+## Usage
+If a dataset module exists in the zoo, the dataset can be uploaded by creating a Python script (or via the command line) and passing the dataset module name as the `dataset_loader` parameter of the `Dataset` class's `upload_dataset` method, i.e.:
+```python
+from clarifai.client.app import App
+app = App(app_id="", user_id="")
+# Create a dataset in Clarifai App
+dataset = app.create_dataset(dataset_id="")
+# execute data upload to Clarifai app dataset
+dataset.upload_dataset(task='visual_segmentation', split="train", dataset_loader='coco_segmentation')
+```
+## Dataset Loaders
+ | dataset name | task | module name (.py) | splits |
+ | --- | --- | --- | --- |
+ | [COCO 2017](https://cocodataset.org/#download) | Detection | `coco_detection` | `train`, `val` |
+ |        | Segmentation | `coco_segmentation` | `train`, `val` |
+ |       | Captions | `coco_captions` | `train`, `val` |
+ | [xVIEW](http://xviewdataset.org/) | Detection | `xview_detection` | `train` |
+ | [ImageNet](https://www.image-net.org/) | Classification | `imagenet_classification` | `train` |
+## Contributing Modules
+A dataloader (preprocessing) module is a Python script that contains a dataloader class which implements data download (to download the dataset from a source to a local disk directory), extraction and dataloader methods.
+The class naming convention is `<datasetname>DataLoader`. The dataloader class must accept `split` as the only argument in the `__init__` method, and the `__getitem__` method must return one of `VisualClassificationFeatures()`, `VisualDetectionFeatures()`, `VisualSegmentationFeatures()` or `TextFeatures()` as defined in [clarifai/datasets/upload/features.py](../features.py). Other methods can be added as needed, but the class must inherit from the parent `ClarifaiDataLoader` base class in [clarifai/datasets/upload/base.py](../base.py).
+Reference can be taken from the existing dataset modules in the zoo for development.
+## Notes
+* Dataloaders in the zoo by default first create a `data` directory in the zoo directory then download the data into this `data` directory, preprocess the data and finally execute upload to a Clarifai app dataset. For instance with the COCO dataset modules above, the coco2017 dataset is by default downloaded first into a `data` directory, extracted and then preprocessing is performed on it and finally uploaded to Clarifai.
+* Taking the above into consideration, to avoid the scripts re-downloading data you already have locally, create a `data` directory in the loaders directory and move your extracted data there. **Ensure that the extracted folder/file names and file structure MATCH those when the downloaded zips are extracted.**
+* COCO Format: To reuse the coco modules above on your own COCO-format data, first ensure that the criteria in the two points above are met. If so, pass the name of any of the coco modules listed above to the `dataset_loader=` parameter in `upload_dataset()`.
+* xVIEW Dataset: To upload, you have to register and download the images and labels from [xviewdataset](http://xviewdataset.org/#dataset), then follow the steps mentioned above to place the extracted folder in the `data` directory. Finally, pass the xview module name to the `dataset_loader=` parameter in `upload_dataset()`.
+* ImageNet Dataset: ImageNet Dataset should be downloaded and placed in the 'data' folder along with the [label mapping file](https://www.kaggle.com/competitions/imagenet-object-localization-challenge/data?select=LOC_synset_mapping.txt).
+		<data>/
+      	├── train/
+      	├── LOC_synset_mapping.txt

clarifai_utils/{data_upload/datasets/zoo → datasets/upload/loaders}/coco_captions.py RENAMED Viewed

@@ -8,15 +8,16 @@ import requests
 from pycocotools.coco import COCO
 from tqdm import tqdm
+from clarifai.datasets.upload.base import ClarifaiDataLoader
 from ..features import VisualClassificationFeatures
-class COCOCaptionsDataset:
+class COCOCaptionsDataLoader(ClarifaiDataLoader):
   """COCO 2017 Image Captioning Dataset."""
   def __init__(self, split: str = "train"):
-    """
-    Initialize coco dataset.
+    """Initialize coco dataset.
     Args:
       filenames: the coco zip filenames: Dict[str, str] to be downloaded if download=True,
       data_dir: the local coco dataset directory.
@@ -29,9 +30,12 @@ class COCOCaptionsDataset:
     }
     self.split = split
     self.url = "http://images.cocodataset.org/zips/"  # coco base image-zip url
-    self.data_dir = os.path.join(os.curdir, "data")  # data storage directory
+    self.data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                                 "data")  # data storage directory
     self.extracted_coco_dirs = {"train": None, "val": None, "annotations": None}
+    self.load_data()
   def coco_download(self, save_dir):
     """Download coco dataset."""
     if not os.path.exists(save_dir):
@@ -68,14 +72,8 @@ class COCOCaptionsDataset:
       print(f" Deleting {filename}")
       os.remove(path=os.path.join(save_dir, filename))
-  def dataloader(self):
-    """
-    Transform coco image captioning data into clarifai proto compatible
-    format for upload.
-    Returns:
-      VisualClassificationFeatures type generator.
-    """
-    if isinstance(self.filenames, dict) and len(self.filenames) == 3:  #train, val, annotations
+  def load_data(self):
+    if isinstance(self.filenames, dict) and len(self.filenames) == 3:
       self.coco_download(self.data_dir)
       self.extracted_coco_dirs["train"] = [os.path.join(self.data_dir, i) \
       for i in os.listdir(self.data_dir) if "train" in i][0]
@@ -85,16 +83,21 @@ class COCOCaptionsDataset:
       self.extracted_coco_dirs["annotations"] = [os.path.join(self.data_dir, i) \
       for i in os.listdir(self.data_dir) if "annotations" in i][0]
     else:
-      raise Exception(f"`filenames` must be a dict of atleast 3 coco zip file names; \
-      train, val and annotations. Found {len(self.filenames)} items instead.")
+      raise Exception(f"`filenames` must be a dict of atleast 2 coco zip file names; \
+  train, val and annotations. Found {len(self.filenames)} items instead.")
     annot_file = glob(self.extracted_coco_dirs["annotations"] + "/" + f"captions_{self.split}*")[0]
     coco = COCO(annot_file)
     annot_ids = coco.getAnnIds()
-    annotations = coco.loadAnns(annot_ids)
-    for annot in annotations:
-      image_path = glob(self.extracted_coco_dirs[self.split]+"/"+\
-      f"{str(annot['image_id']).zfill(12)}*")[0]
-      # image_captioning and image classification datasets have the same
-      # image-label input feature formats
-      yield VisualClassificationFeatures(image_path, annot["caption"], id=annot["image_id"])
+    self.annotations = coco.loadAnns(annot_ids)
+  def __len__(self):
+    return len(self.annotations)
+  def __getitem__(self, idx):
+    annot = self.annotations[idx]
+    image_path = glob(
+        os.path.join(self.extracted_coco_dirs[self.split],
+                     f"{str(annot['image_id']).zfill(12)}*"))[0]
+    return VisualClassificationFeatures(image_path, annot["caption"], id=annot["image_id"])

clarifai 9.7.0__py3-none-any.whl → 9.7.2__py3-none-any.whl

clarifai 9.7.0py3-none-any.whl → 9.7.2py3-none-any.whl