clarifai 11.3.0rc2__py3-none-any.whl → 11.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clarifai/__init__.py +1 -1
- clarifai/cli/__main__.py +1 -1
- clarifai/cli/base.py +144 -136
- clarifai/cli/compute_cluster.py +45 -31
- clarifai/cli/deployment.py +93 -76
- clarifai/cli/model.py +578 -180
- clarifai/cli/nodepool.py +100 -82
- clarifai/client/__init__.py +12 -2
- clarifai/client/app.py +973 -911
- clarifai/client/auth/helper.py +345 -342
- clarifai/client/auth/register.py +7 -7
- clarifai/client/auth/stub.py +107 -106
- clarifai/client/base.py +185 -178
- clarifai/client/compute_cluster.py +214 -180
- clarifai/client/dataset.py +793 -698
- clarifai/client/deployment.py +55 -50
- clarifai/client/input.py +1223 -1088
- clarifai/client/lister.py +47 -45
- clarifai/client/model.py +1939 -1717
- clarifai/client/model_client.py +525 -502
- clarifai/client/module.py +82 -73
- clarifai/client/nodepool.py +358 -213
- clarifai/client/runner.py +58 -0
- clarifai/client/search.py +342 -309
- clarifai/client/user.py +419 -414
- clarifai/client/workflow.py +294 -274
- clarifai/constants/dataset.py +11 -17
- clarifai/constants/model.py +8 -2
- clarifai/datasets/export/inputs_annotations.py +233 -217
- clarifai/datasets/upload/base.py +63 -51
- clarifai/datasets/upload/features.py +43 -38
- clarifai/datasets/upload/image.py +237 -207
- clarifai/datasets/upload/loaders/coco_captions.py +34 -32
- clarifai/datasets/upload/loaders/coco_detection.py +72 -65
- clarifai/datasets/upload/loaders/imagenet_classification.py +57 -53
- clarifai/datasets/upload/loaders/xview_detection.py +274 -132
- clarifai/datasets/upload/multimodal.py +55 -46
- clarifai/datasets/upload/text.py +55 -47
- clarifai/datasets/upload/utils.py +250 -234
- clarifai/errors.py +51 -50
- clarifai/models/api.py +260 -238
- clarifai/modules/css.py +50 -50
- clarifai/modules/pages.py +33 -33
- clarifai/rag/rag.py +312 -288
- clarifai/rag/utils.py +91 -84
- clarifai/runners/models/model_builder.py +906 -802
- clarifai/runners/models/model_class.py +370 -331
- clarifai/runners/models/model_run_locally.py +459 -419
- clarifai/runners/models/model_runner.py +170 -162
- clarifai/runners/models/model_servicer.py +78 -70
- clarifai/runners/server.py +111 -101
- clarifai/runners/utils/code_script.py +225 -187
- clarifai/runners/utils/const.py +4 -1
- clarifai/runners/utils/data_types/__init__.py +12 -0
- clarifai/runners/utils/data_types/data_types.py +598 -0
- clarifai/runners/utils/data_utils.py +387 -440
- clarifai/runners/utils/loader.py +247 -227
- clarifai/runners/utils/method_signatures.py +411 -386
- clarifai/runners/utils/openai_convertor.py +108 -109
- clarifai/runners/utils/serializers.py +175 -179
- clarifai/runners/utils/url_fetcher.py +35 -35
- clarifai/schema/search.py +56 -63
- clarifai/urls/helper.py +125 -102
- clarifai/utils/cli.py +129 -123
- clarifai/utils/config.py +127 -87
- clarifai/utils/constants.py +49 -0
- clarifai/utils/evaluation/helpers.py +503 -466
- clarifai/utils/evaluation/main.py +431 -393
- clarifai/utils/evaluation/testset_annotation_parser.py +154 -144
- clarifai/utils/logging.py +324 -306
- clarifai/utils/misc.py +60 -56
- clarifai/utils/model_train.py +165 -146
- clarifai/utils/protobuf.py +126 -103
- clarifai/versions.py +3 -1
- clarifai/workflows/export.py +48 -50
- clarifai/workflows/utils.py +39 -36
- clarifai/workflows/validate.py +55 -43
- {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info}/METADATA +16 -6
- clarifai-11.4.0.dist-info/RECORD +109 -0
- {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info}/WHEEL +1 -1
- clarifai/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/__pycache__/errors.cpython-310.pyc +0 -0
- clarifai/__pycache__/errors.cpython-311.pyc +0 -0
- clarifai/__pycache__/versions.cpython-310.pyc +0 -0
- clarifai/__pycache__/versions.cpython-311.pyc +0 -0
- clarifai/cli/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/cli/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/cli/__pycache__/base.cpython-310.pyc +0 -0
- clarifai/cli/__pycache__/base.cpython-311.pyc +0 -0
- clarifai/cli/__pycache__/base_cli.cpython-310.pyc +0 -0
- clarifai/cli/__pycache__/compute_cluster.cpython-310.pyc +0 -0
- clarifai/cli/__pycache__/compute_cluster.cpython-311.pyc +0 -0
- clarifai/cli/__pycache__/deployment.cpython-310.pyc +0 -0
- clarifai/cli/__pycache__/deployment.cpython-311.pyc +0 -0
- clarifai/cli/__pycache__/model.cpython-310.pyc +0 -0
- clarifai/cli/__pycache__/model.cpython-311.pyc +0 -0
- clarifai/cli/__pycache__/model_cli.cpython-310.pyc +0 -0
- clarifai/cli/__pycache__/nodepool.cpython-310.pyc +0 -0
- clarifai/cli/__pycache__/nodepool.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/client/__pycache__/app.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/app.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/app.cpython-39.pyc +0 -0
- clarifai/client/__pycache__/base.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/base.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/compute_cluster.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/compute_cluster.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/dataset.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/dataset.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/deployment.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/deployment.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/input.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/input.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/lister.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/lister.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/model.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/model.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/module.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/module.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/nodepool.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/nodepool.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/search.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/search.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/user.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/user.cpython-311.pyc +0 -0
- clarifai/client/__pycache__/workflow.cpython-310.pyc +0 -0
- clarifai/client/__pycache__/workflow.cpython-311.pyc +0 -0
- clarifai/client/auth/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/client/auth/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/client/auth/__pycache__/helper.cpython-310.pyc +0 -0
- clarifai/client/auth/__pycache__/helper.cpython-311.pyc +0 -0
- clarifai/client/auth/__pycache__/register.cpython-310.pyc +0 -0
- clarifai/client/auth/__pycache__/register.cpython-311.pyc +0 -0
- clarifai/client/auth/__pycache__/stub.cpython-310.pyc +0 -0
- clarifai/client/auth/__pycache__/stub.cpython-311.pyc +0 -0
- clarifai/client/cli/__init__.py +0 -0
- clarifai/client/cli/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/client/cli/__pycache__/base_cli.cpython-310.pyc +0 -0
- clarifai/client/cli/__pycache__/model_cli.cpython-310.pyc +0 -0
- clarifai/client/cli/base_cli.py +0 -88
- clarifai/client/cli/model_cli.py +0 -29
- clarifai/constants/__pycache__/base.cpython-310.pyc +0 -0
- clarifai/constants/__pycache__/base.cpython-311.pyc +0 -0
- clarifai/constants/__pycache__/dataset.cpython-310.pyc +0 -0
- clarifai/constants/__pycache__/dataset.cpython-311.pyc +0 -0
- clarifai/constants/__pycache__/input.cpython-310.pyc +0 -0
- clarifai/constants/__pycache__/input.cpython-311.pyc +0 -0
- clarifai/constants/__pycache__/model.cpython-310.pyc +0 -0
- clarifai/constants/__pycache__/model.cpython-311.pyc +0 -0
- clarifai/constants/__pycache__/rag.cpython-310.pyc +0 -0
- clarifai/constants/__pycache__/rag.cpython-311.pyc +0 -0
- clarifai/constants/__pycache__/search.cpython-310.pyc +0 -0
- clarifai/constants/__pycache__/search.cpython-311.pyc +0 -0
- clarifai/constants/__pycache__/workflow.cpython-310.pyc +0 -0
- clarifai/constants/__pycache__/workflow.cpython-311.pyc +0 -0
- clarifai/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/datasets/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/datasets/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/datasets/export/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/datasets/export/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/datasets/export/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/datasets/export/__pycache__/inputs_annotations.cpython-310.pyc +0 -0
- clarifai/datasets/export/__pycache__/inputs_annotations.cpython-311.pyc +0 -0
- clarifai/datasets/upload/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/datasets/upload/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/datasets/upload/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/datasets/upload/__pycache__/base.cpython-310.pyc +0 -0
- clarifai/datasets/upload/__pycache__/base.cpython-311.pyc +0 -0
- clarifai/datasets/upload/__pycache__/features.cpython-310.pyc +0 -0
- clarifai/datasets/upload/__pycache__/features.cpython-311.pyc +0 -0
- clarifai/datasets/upload/__pycache__/image.cpython-310.pyc +0 -0
- clarifai/datasets/upload/__pycache__/image.cpython-311.pyc +0 -0
- clarifai/datasets/upload/__pycache__/multimodal.cpython-310.pyc +0 -0
- clarifai/datasets/upload/__pycache__/multimodal.cpython-311.pyc +0 -0
- clarifai/datasets/upload/__pycache__/text.cpython-310.pyc +0 -0
- clarifai/datasets/upload/__pycache__/text.cpython-311.pyc +0 -0
- clarifai/datasets/upload/__pycache__/utils.cpython-310.pyc +0 -0
- clarifai/datasets/upload/__pycache__/utils.cpython-311.pyc +0 -0
- clarifai/datasets/upload/loaders/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/datasets/upload/loaders/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/datasets/upload/loaders/__pycache__/coco_detection.cpython-311.pyc +0 -0
- clarifai/datasets/upload/loaders/__pycache__/imagenet_classification.cpython-311.pyc +0 -0
- clarifai/models/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/modules/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/rag/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/rag/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/rag/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/rag/__pycache__/rag.cpython-310.pyc +0 -0
- clarifai/rag/__pycache__/rag.cpython-311.pyc +0 -0
- clarifai/rag/__pycache__/rag.cpython-39.pyc +0 -0
- clarifai/rag/__pycache__/utils.cpython-310.pyc +0 -0
- clarifai/rag/__pycache__/utils.cpython-311.pyc +0 -0
- clarifai/runners/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/runners/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/runners/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/runners/dockerfile_template/Dockerfile.cpu.template +0 -31
- clarifai/runners/dockerfile_template/Dockerfile.cuda.template +0 -42
- clarifai/runners/dockerfile_template/Dockerfile.nim +0 -71
- clarifai/runners/models/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/runners/models/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/runners/models/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/runners/models/__pycache__/base_typed_model.cpython-310.pyc +0 -0
- clarifai/runners/models/__pycache__/base_typed_model.cpython-311.pyc +0 -0
- clarifai/runners/models/__pycache__/base_typed_model.cpython-39.pyc +0 -0
- clarifai/runners/models/__pycache__/model_builder.cpython-311.pyc +0 -0
- clarifai/runners/models/__pycache__/model_class.cpython-310.pyc +0 -0
- clarifai/runners/models/__pycache__/model_class.cpython-311.pyc +0 -0
- clarifai/runners/models/__pycache__/model_run_locally.cpython-310-pytest-7.1.2.pyc +0 -0
- clarifai/runners/models/__pycache__/model_run_locally.cpython-310.pyc +0 -0
- clarifai/runners/models/__pycache__/model_run_locally.cpython-311.pyc +0 -0
- clarifai/runners/models/__pycache__/model_runner.cpython-310.pyc +0 -0
- clarifai/runners/models/__pycache__/model_runner.cpython-311.pyc +0 -0
- clarifai/runners/models/__pycache__/model_upload.cpython-310.pyc +0 -0
- clarifai/runners/models/base_typed_model.py +0 -238
- clarifai/runners/models/model_class_refract.py +0 -80
- clarifai/runners/models/model_upload.py +0 -607
- clarifai/runners/models/temp.py +0 -25
- clarifai/runners/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/runners/utils/__pycache__/__init__.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/buffered_stream.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/buffered_stream.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/buffered_stream.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/const.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/const.cpython-311.pyc +0 -0
- clarifai/runners/utils/__pycache__/constants.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/constants.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/constants.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/data_handler.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/data_handler.cpython-311.pyc +0 -0
- clarifai/runners/utils/__pycache__/data_handler.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/data_handler.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/data_utils.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/data_utils.cpython-311.pyc +0 -0
- clarifai/runners/utils/__pycache__/data_utils.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/data_utils.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/grpc_server.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/grpc_server.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/grpc_server.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/health.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/health.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/health.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/loader.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/loader.cpython-311.pyc +0 -0
- clarifai/runners/utils/__pycache__/logging.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/logging.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/logging.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/stream_source.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/stream_source.cpython-39.pyc +0 -0
- clarifai/runners/utils/__pycache__/url_fetcher.cpython-310.pyc +0 -0
- clarifai/runners/utils/__pycache__/url_fetcher.cpython-311.pyc +0 -0
- clarifai/runners/utils/__pycache__/url_fetcher.cpython-38.pyc +0 -0
- clarifai/runners/utils/__pycache__/url_fetcher.cpython-39.pyc +0 -0
- clarifai/runners/utils/data_handler.py +0 -231
- clarifai/runners/utils/data_handler_refract.py +0 -213
- clarifai/runners/utils/data_types.py +0 -469
- clarifai/runners/utils/logger.py +0 -0
- clarifai/runners/utils/openai_format.py +0 -87
- clarifai/schema/__pycache__/search.cpython-310.pyc +0 -0
- clarifai/schema/__pycache__/search.cpython-311.pyc +0 -0
- clarifai/urls/__pycache__/helper.cpython-310.pyc +0 -0
- clarifai/urls/__pycache__/helper.cpython-311.pyc +0 -0
- clarifai/utils/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/utils/__pycache__/cli.cpython-310.pyc +0 -0
- clarifai/utils/__pycache__/cli.cpython-311.pyc +0 -0
- clarifai/utils/__pycache__/config.cpython-311.pyc +0 -0
- clarifai/utils/__pycache__/constants.cpython-310.pyc +0 -0
- clarifai/utils/__pycache__/constants.cpython-311.pyc +0 -0
- clarifai/utils/__pycache__/logging.cpython-310.pyc +0 -0
- clarifai/utils/__pycache__/logging.cpython-311.pyc +0 -0
- clarifai/utils/__pycache__/misc.cpython-310.pyc +0 -0
- clarifai/utils/__pycache__/misc.cpython-311.pyc +0 -0
- clarifai/utils/__pycache__/model_train.cpython-310.pyc +0 -0
- clarifai/utils/__pycache__/model_train.cpython-311.pyc +0 -0
- clarifai/utils/__pycache__/protobuf.cpython-311.pyc +0 -0
- clarifai/utils/evaluation/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/utils/evaluation/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/utils/evaluation/__pycache__/helpers.cpython-311.pyc +0 -0
- clarifai/utils/evaluation/__pycache__/main.cpython-311.pyc +0 -0
- clarifai/utils/evaluation/__pycache__/main.cpython-39.pyc +0 -0
- clarifai/workflows/__pycache__/__init__.cpython-310.pyc +0 -0
- clarifai/workflows/__pycache__/__init__.cpython-311.pyc +0 -0
- clarifai/workflows/__pycache__/__init__.cpython-39.pyc +0 -0
- clarifai/workflows/__pycache__/export.cpython-310.pyc +0 -0
- clarifai/workflows/__pycache__/export.cpython-311.pyc +0 -0
- clarifai/workflows/__pycache__/utils.cpython-310.pyc +0 -0
- clarifai/workflows/__pycache__/utils.cpython-311.pyc +0 -0
- clarifai/workflows/__pycache__/validate.cpython-310.pyc +0 -0
- clarifai/workflows/__pycache__/validate.cpython-311.pyc +0 -0
- clarifai-11.3.0rc2.dist-info/RECORD +0 -322
- {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info}/entry_points.txt +0 -0
- {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info/licenses}/LICENSE +0 -0
- {clarifai-11.3.0rc2.dist-info → clarifai-11.4.0.dist-info}/top_level.txt +0 -0
clarifai/client/dataset.py
CHANGED
@@ -20,11 +20,16 @@ from clarifai.client.base import BaseClient
|
|
20
20
|
from clarifai.client.input import Inputs
|
21
21
|
from clarifai.client.lister import Lister
|
22
22
|
from clarifai.constants.dataset import DATASET_UPLOAD_TASKS, MAX_RETRIES
|
23
|
-
from clarifai.datasets.export.inputs_annotations import (
|
24
|
-
|
23
|
+
from clarifai.datasets.export.inputs_annotations import (
|
24
|
+
DatasetExportReader,
|
25
|
+
InputAnnotationDownloader,
|
26
|
+
)
|
25
27
|
from clarifai.datasets.upload.base import ClarifaiDataLoader
|
26
|
-
from clarifai.datasets.upload.image import (
|
27
|
-
|
28
|
+
from clarifai.datasets.upload.image import (
|
29
|
+
VisualClassificationDataset,
|
30
|
+
VisualDetectionDataset,
|
31
|
+
VisualSegmentationDataset,
|
32
|
+
)
|
28
33
|
from clarifai.datasets.upload.multimodal import MultiModalDataset
|
29
34
|
from clarifai.datasets.upload.text import TextClassificationDataset
|
30
35
|
from clarifai.datasets.upload.utils import DisplayUploadStatus
|
@@ -33,709 +38,799 @@ from clarifai.urls.helper import ClarifaiUrlHelper
|
|
33
38
|
from clarifai.utils.logging import add_file_handler, logger, process_log_files
|
34
39
|
from clarifai.utils.misc import BackoffIterator, Chunker
|
35
40
|
|
36
|
-
ClarifaiDatasetType = TypeVar(
|
37
|
-
|
38
|
-
|
41
|
+
ClarifaiDatasetType = TypeVar(
|
42
|
+
'ClarifaiDatasetType',
|
43
|
+
VisualClassificationDataset,
|
44
|
+
VisualDetectionDataset,
|
45
|
+
VisualSegmentationDataset,
|
46
|
+
TextClassificationDataset,
|
47
|
+
)
|
39
48
|
|
40
49
|
|
41
50
|
class Dataset(Lister, BaseClient):
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
url: str = None,
|
46
|
-
dataset_id: str = None,
|
47
|
-
dataset_version_id: str = None,
|
48
|
-
base_url: str = "https://api.clarifai.com",
|
49
|
-
pat: str = None,
|
50
|
-
token: str = None,
|
51
|
-
root_certificates_path: str = None,
|
52
|
-
**kwargs):
|
53
|
-
"""Initializes a Dataset object.
|
54
|
-
|
55
|
-
Args:
|
56
|
-
url (str): The URL to initialize the dataset object.
|
57
|
-
dataset_id (str): The Dataset ID within the App to interact with.
|
58
|
-
dataset_version_id (str): The Dataset Version ID within the Dataset to interact with.
|
59
|
-
base_url (str): Base API url. Default "https://api.clarifai.com"
|
60
|
-
pat (str): A personal access token for authentication. Can be set as env var CLARIFAI_PAT
|
61
|
-
token (str): A session token for authentication. Accepts either a session token or a pat. Can be set as env var CLARIFAI_SESSION_TOKEN
|
62
|
-
root_certificates_path (str): Path to the SSL root certificates file, used to establish secure gRPC connections.
|
63
|
-
**kwargs: Additional keyword arguments to be passed to the Dataset.
|
64
|
-
"""
|
65
|
-
if url and dataset_id:
|
66
|
-
raise UserError("You can only specify one of url or dataset_id.")
|
67
|
-
if url:
|
68
|
-
user_id, app_id, _, dataset_id, dataset_version_id = ClarifaiUrlHelper.split_clarifai_url(
|
69
|
-
url)
|
70
|
-
kwargs = {'user_id': user_id, 'app_id': app_id}
|
71
|
-
dataset_version = {
|
72
|
-
'id': dataset_version_id
|
73
|
-
} if dataset_version_id else kwargs['version'] if 'version' in kwargs else None
|
74
|
-
self.kwargs = {**kwargs, 'id': dataset_id, 'version': dataset_version}
|
75
|
-
self.dataset_info = resources_pb2.Dataset(**self.kwargs)
|
76
|
-
# Related to Dataset Upload
|
77
|
-
self.num_workers: int = min(10, cpu_count()) #15 req/sec rate limit
|
78
|
-
self.annot_num_workers = 4
|
79
|
-
self.max_retires = 10
|
80
|
-
self.batch_size = 128 # limit max protos in a req
|
81
|
-
self.task = None # Upload dataset type
|
82
|
-
self.input_object = Inputs(
|
83
|
-
user_id=self.user_id,
|
84
|
-
app_id=self.app_id,
|
85
|
-
pat=pat,
|
86
|
-
token=token,
|
87
|
-
base_url=base_url,
|
88
|
-
root_certificates_path=root_certificates_path)
|
89
|
-
self.logger = logger
|
90
|
-
BaseClient.__init__(
|
51
|
+
"""Dataset is a class that provides access to Clarifai API endpoints related to Dataset information."""
|
52
|
+
|
53
|
+
def __init__(
|
91
54
|
self,
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
'version':
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
for inp_batch in annots
|
233
|
-
]
|
234
|
-
|
235
|
-
for job in as_completed(annot_threads):
|
236
|
-
result = job.result()
|
237
|
-
if result:
|
238
|
-
retry_annot_upload.extend(result)
|
239
|
-
|
240
|
-
return retry_annot_upload
|
241
|
-
|
242
|
-
def _delete_failed_inputs(self,
|
243
|
-
batch_input_ids: List[int],
|
244
|
-
dataset_obj: ClarifaiDatasetType,
|
245
|
-
upload_response: MultiInputResponse = None,
|
246
|
-
batch_no: Optional[int] = None) -> Tuple[List[int], List[int]]:
|
247
|
-
"""Delete failed input ids from clarifai platform dataset.
|
248
|
-
|
249
|
-
Args:
|
250
|
-
batch_input_ids: batch input ids
|
251
|
-
dataset_obj: ClarifaiDataset object
|
252
|
-
upload_response: upload response proto
|
253
|
-
|
254
|
-
Returns:
|
255
|
-
success_inputs: upload success input ids
|
256
|
-
failed_inputs: upload failed input ids
|
257
|
-
"""
|
258
|
-
success_status = status_pb2.Status(code=status_code_pb2.INPUT_DOWNLOAD_SUCCESS)
|
259
|
-
input_ids = {dataset_obj.all_input_ids[id]: id for id in batch_input_ids}
|
260
|
-
response = self._grpc_request(
|
261
|
-
self.STUB.ListInputs,
|
262
|
-
service_pb2.ListInputsRequest(
|
263
|
-
ids=list(input_ids.keys()),
|
264
|
-
per_page=len(input_ids),
|
55
|
+
url: str = None,
|
56
|
+
dataset_id: str = None,
|
57
|
+
dataset_version_id: str = None,
|
58
|
+
base_url: str = "https://api.clarifai.com",
|
59
|
+
pat: str = None,
|
60
|
+
token: str = None,
|
61
|
+
root_certificates_path: str = None,
|
62
|
+
**kwargs,
|
63
|
+
):
|
64
|
+
"""Initializes a Dataset object.
|
65
|
+
|
66
|
+
Args:
|
67
|
+
url (str): The URL to initialize the dataset object.
|
68
|
+
dataset_id (str): The Dataset ID within the App to interact with.
|
69
|
+
dataset_version_id (str): The Dataset Version ID within the Dataset to interact with.
|
70
|
+
base_url (str): Base API url. Default "https://api.clarifai.com"
|
71
|
+
pat (str): A personal access token for authentication. Can be set as env var CLARIFAI_PAT
|
72
|
+
token (str): A session token for authentication. Accepts either a session token or a pat. Can be set as env var CLARIFAI_SESSION_TOKEN
|
73
|
+
root_certificates_path (str): Path to the SSL root certificates file, used to establish secure gRPC connections.
|
74
|
+
**kwargs: Additional keyword arguments to be passed to the Dataset.
|
75
|
+
"""
|
76
|
+
if url and dataset_id:
|
77
|
+
raise UserError("You can only specify one of url or dataset_id.")
|
78
|
+
if url:
|
79
|
+
user_id, app_id, _, dataset_id, dataset_version_id = (
|
80
|
+
ClarifaiUrlHelper.split_clarifai_url(url)
|
81
|
+
)
|
82
|
+
kwargs = {'user_id': user_id, 'app_id': app_id}
|
83
|
+
dataset_version = (
|
84
|
+
{'id': dataset_version_id}
|
85
|
+
if dataset_version_id
|
86
|
+
else kwargs['version']
|
87
|
+
if 'version' in kwargs
|
88
|
+
else None
|
89
|
+
)
|
90
|
+
self.kwargs = {**kwargs, 'id': dataset_id, 'version': dataset_version}
|
91
|
+
self.dataset_info = resources_pb2.Dataset(**self.kwargs)
|
92
|
+
# Related to Dataset Upload
|
93
|
+
self.num_workers: int = min(10, cpu_count()) # 15 req/sec rate limit
|
94
|
+
self.annot_num_workers = 4
|
95
|
+
self.max_retires = 10
|
96
|
+
self.batch_size = 128 # limit max protos in a req
|
97
|
+
self.task = None # Upload dataset type
|
98
|
+
self.input_object = Inputs(
|
99
|
+
user_id=self.user_id,
|
100
|
+
app_id=self.app_id,
|
101
|
+
pat=pat,
|
102
|
+
token=token,
|
103
|
+
base_url=base_url,
|
104
|
+
root_certificates_path=root_certificates_path,
|
105
|
+
)
|
106
|
+
self.logger = logger
|
107
|
+
BaseClient.__init__(
|
108
|
+
self,
|
109
|
+
user_id=self.user_id,
|
110
|
+
app_id=self.app_id,
|
111
|
+
base=base_url,
|
112
|
+
pat=pat,
|
113
|
+
token=token,
|
114
|
+
root_certificates_path=root_certificates_path,
|
115
|
+
)
|
116
|
+
Lister.__init__(self)
|
117
|
+
|
118
|
+
def create_version(self, **kwargs) -> 'Dataset':
|
119
|
+
"""Creates a dataset version for the Dataset.
|
120
|
+
|
121
|
+
Args:
|
122
|
+
**kwargs: Additional keyword arguments to be passed to Dataset Version.
|
123
|
+
- description (str): The description of the dataset version.
|
124
|
+
- metadata (dict): The metadata of the dataset version.
|
125
|
+
|
126
|
+
Returns:
|
127
|
+
Dataset: A Dataset object for the specified dataset ID.
|
128
|
+
|
129
|
+
Example:
|
130
|
+
>>> from clarifai.client.dataset import Dataset
|
131
|
+
>>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
|
132
|
+
>>> dataset_version = dataset.create_version(description='dataset_version_description')
|
133
|
+
"""
|
134
|
+
request = service_pb2.PostDatasetVersionsRequest(
|
135
|
+
user_app_id=self.user_app_id,
|
136
|
+
dataset_id=self.id,
|
137
|
+
dataset_versions=[resources_pb2.DatasetVersion(**kwargs)],
|
138
|
+
)
|
139
|
+
|
140
|
+
response = self._grpc_request(self.STUB.PostDatasetVersions, request)
|
141
|
+
if response.status.code != status_code_pb2.SUCCESS:
|
142
|
+
raise Exception(response.status)
|
143
|
+
self.logger.info("\nDataset Version created\n%s", response.status)
|
144
|
+
kwargs.update(
|
145
|
+
{
|
146
|
+
'dataset_id': self.id,
|
147
|
+
'version': response.dataset_versions[0],
|
148
|
+
}
|
149
|
+
)
|
150
|
+
|
151
|
+
return Dataset.from_auth_helper(self.auth_helper, **kwargs)
|
152
|
+
|
153
|
+
def delete_version(self, version_id: str) -> None:
|
154
|
+
"""Deletes a dataset version for the Dataset.
|
155
|
+
|
156
|
+
Args:
|
157
|
+
version_id (str): The version ID to delete.
|
158
|
+
|
159
|
+
Example:
|
160
|
+
>>> from clarifai.client.dataset import Dataset
|
161
|
+
>>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
|
162
|
+
>>> dataset.delete_version(version_id='version_id')
|
163
|
+
"""
|
164
|
+
request = service_pb2.DeleteDatasetVersionsRequest(
|
165
|
+
user_app_id=self.user_app_id, dataset_id=self.id, dataset_version_ids=[version_id]
|
166
|
+
)
|
167
|
+
|
168
|
+
response = self._grpc_request(self.STUB.DeleteDatasetVersions, request)
|
169
|
+
if response.status.code != status_code_pb2.SUCCESS:
|
170
|
+
raise Exception(response.status)
|
171
|
+
self.logger.info("\nDataset Version Deleted\n%s", response.status)
|
172
|
+
|
173
|
+
def list_versions(
|
174
|
+
self, page_no: int = None, per_page: int = None
|
175
|
+
) -> Generator['Dataset', None, None]:
|
176
|
+
"""Lists all the versions for the dataset.
|
177
|
+
|
178
|
+
Args:
|
179
|
+
page_no (int): The page number to list.
|
180
|
+
per_page (int): The number of items per page.
|
181
|
+
|
182
|
+
Yields:
|
183
|
+
Dataset: Dataset objects for the versions of the dataset.
|
184
|
+
|
185
|
+
Example:
|
186
|
+
>>> from clarifai.client.dataset import Dataset
|
187
|
+
>>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
|
188
|
+
>>> all_dataset_versions = list(dataset.list_versions())
|
189
|
+
|
190
|
+
Note:
|
191
|
+
Defaults to 16 per page if page_no is specified and per_page is not specified.
|
192
|
+
If both page_no and per_page are None, then lists all the resources.
|
193
|
+
"""
|
194
|
+
request_data = dict(
|
265
195
|
user_app_id=self.user_app_id,
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
duplicate_input_ids = [
|
275
|
-
input.id for input in upload_response.inputs
|
276
|
-
if input.status.details == 'Input has a duplicate ID.'
|
277
|
-
] #handling duplicte ID failures.
|
278
|
-
if duplicate_input_ids:
|
279
|
-
success_input_ids = list(set(success_input_ids.copy()) - set(duplicate_input_ids.copy()))
|
280
|
-
failed_input_ids = list(set(failed_input_ids) - set(duplicate_input_ids))
|
281
|
-
duplicate_details = [[
|
282
|
-
input_ids[id], id, "Input has a duplicate ID.",
|
283
|
-
getattr(dataset_obj.data_generator[input_ids[id]], 'image_path', None),
|
284
|
-
getattr(dataset_obj.data_generator[input_ids[id]], 'labels', None),
|
285
|
-
getattr(dataset_obj.data_generator[input_ids[id]], 'metadata', None)
|
286
|
-
] for id in duplicate_input_ids]
|
287
|
-
duplicate_table = tabulate(
|
288
|
-
duplicate_details,
|
289
|
-
headers=["Index", "Input ID", "Status", "Image Path", "Labels", "Metadata"],
|
290
|
-
tablefmt="grid")
|
291
|
-
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
292
|
-
self.logger.warning(
|
293
|
-
f"{timestamp}\nFailed to upload {len(duplicate_input_ids)} inputs due to duplicate IDs in current batch {batch_no}:\n{duplicate_table}\n\n"
|
294
|
-
)
|
295
|
-
|
296
|
-
#delete failed inputs
|
297
|
-
self._grpc_request(
|
298
|
-
self.STUB.DeleteInputs,
|
299
|
-
service_pb2.DeleteInputsRequest(user_app_id=self.user_app_id, ids=failed_input_ids),
|
300
|
-
)
|
301
|
-
return [input_ids[id] for id in success_input_ids], [input_ids[id] for id in failed_input_ids]
|
302
|
-
|
303
|
-
def _upload_inputs_annotations(
|
304
|
-
self,
|
305
|
-
batch_input_ids: List[int],
|
306
|
-
dataset_obj: ClarifaiDatasetType,
|
307
|
-
batch_no: Optional[int] = None,
|
308
|
-
is_retry_duplicates: bool = False,
|
309
|
-
) -> Tuple[List[int], List[resources_pb2.Annotation], MultiInputResponse]:
|
310
|
-
"""Uploads batch of inputs and annotations concurrently to clarifai platform dataset.
|
311
|
-
|
312
|
-
Args:
|
313
|
-
batch_input_ids: batch input ids
|
314
|
-
dataset_obj: ClarifaiDataset object
|
315
|
-
|
316
|
-
Returns:
|
317
|
-
failed_input_ids: failed input ids
|
318
|
-
retry_annot_protos: failed annot protos
|
319
|
-
response: upload response proto
|
320
|
-
"""
|
321
|
-
input_protos, _ = dataset_obj.get_protos(batch_input_ids)
|
322
|
-
if is_retry_duplicates:
|
323
|
-
for inp in input_protos:
|
324
|
-
inp.id = uuid.uuid4().hex
|
325
|
-
|
326
|
-
input_job_id, _response = self.input_object.upload_inputs(inputs=input_protos, show_log=False)
|
327
|
-
retry_annot_protos = []
|
328
|
-
|
329
|
-
self.input_object._wait_for_inputs(input_job_id)
|
330
|
-
success_input_ids, failed_input_ids = self._delete_failed_inputs(batch_input_ids, dataset_obj,
|
331
|
-
_response, batch_no)
|
332
|
-
|
333
|
-
if self.task in ["visual_detection", "visual_segmentation"] and success_input_ids:
|
334
|
-
_, annotation_protos = dataset_obj.get_protos(success_input_ids)
|
335
|
-
chunked_annotation_protos = Chunker(annotation_protos, self.batch_size).chunk()
|
336
|
-
retry_annot_protos.extend(self._concurrent_annot_upload(chunked_annotation_protos))
|
337
|
-
|
338
|
-
return failed_input_ids, retry_annot_protos, _response
|
339
|
-
|
340
|
-
def _retry_uploads(self, failed_input_ids: List[int],
|
341
|
-
retry_annot_protos: List[resources_pb2.Annotation],
|
342
|
-
dataset_obj: ClarifaiDatasetType, batch_no: Optional[int]) -> None:
|
343
|
-
"""Retry failed uploads.
|
344
|
-
|
345
|
-
Args:
|
346
|
-
failed_input_ids: failed input ids
|
347
|
-
retry_annot_protos: failed annot protos
|
348
|
-
dataset_obj: ClarifaiDataset object
|
349
|
-
"""
|
350
|
-
|
351
|
-
for _retry in range(MAX_RETRIES):
|
352
|
-
if not failed_input_ids and not retry_annot_protos:
|
353
|
-
break
|
354
|
-
if failed_input_ids:
|
355
|
-
retry_input_ids = [dataset_obj.all_input_ids[id] for id in failed_input_ids]
|
356
|
-
logger.warning(
|
357
|
-
f"Retrying upload for {len(failed_input_ids)} inputs in current batch: {retry_input_ids}\n"
|
196
|
+
dataset_id=self.id,
|
197
|
+
)
|
198
|
+
all_dataset_versions_info = self.list_pages_generator(
|
199
|
+
self.STUB.ListDatasetVersions,
|
200
|
+
service_pb2.ListDatasetVersionsRequest,
|
201
|
+
request_data,
|
202
|
+
per_page=per_page,
|
203
|
+
page_no=page_no,
|
358
204
|
)
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
411
|
-
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
for
|
205
|
+
|
206
|
+
for dataset_version_info in all_dataset_versions_info:
|
207
|
+
dataset_version_info['id'] = dataset_version_info['dataset_version_id']
|
208
|
+
del dataset_version_info['dataset_version_id']
|
209
|
+
dataset_version_info.pop('metrics', None)
|
210
|
+
dataset_version_info.pop('export_info', None)
|
211
|
+
kwargs = {
|
212
|
+
'dataset_id': self.id,
|
213
|
+
'version': resources_pb2.DatasetVersion(**dataset_version_info),
|
214
|
+
}
|
215
|
+
yield Dataset.from_auth_helper(self.auth_helper, **kwargs)
|
216
|
+
|
217
|
+
def list_inputs(
|
218
|
+
self, page_no: int = None, per_page: int = None, input_type: str = None
|
219
|
+
) -> Generator[Input, None, None]:
|
220
|
+
"""Lists all the inputs for the dataset.
|
221
|
+
|
222
|
+
Args:
|
223
|
+
page_no (int): The page number to list.
|
224
|
+
per_page (int): The number of items per page.
|
225
|
+
input_type (str): The type of input to list. Options: 'image', 'video', 'audio', 'text'.
|
226
|
+
|
227
|
+
Yields:
|
228
|
+
Input: Input objects in the dataset.
|
229
|
+
|
230
|
+
Example:
|
231
|
+
>>> from clarifai.client.dataset import Dataset
|
232
|
+
>>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
|
233
|
+
>>> all_dataset_inputs = list(dataset.list_inputs())
|
234
|
+
"""
|
235
|
+
return self.input_object.list_inputs(
|
236
|
+
dataset_id=self.id, page_no=page_no, per_page=per_page, input_type=input_type
|
237
|
+
)
|
238
|
+
|
239
|
+
def __iter__(self):
|
240
|
+
return iter(DatasetExportReader(archive_url=self.archive_zip()))
|
241
|
+
|
242
|
+
def _concurrent_annot_upload(
|
243
|
+
self, annots: List[List[resources_pb2.Annotation]]
|
244
|
+
) -> Union[List[resources_pb2.Annotation], List[None]]:
|
245
|
+
"""Uploads annotations concurrently.
|
246
|
+
|
247
|
+
Args:
|
248
|
+
annots: annot protos
|
249
|
+
|
250
|
+
Returns:
|
251
|
+
retry_annot_upload: All failed annot protos during upload
|
252
|
+
"""
|
253
|
+
annot_threads = []
|
254
|
+
retry_annot_upload = []
|
255
|
+
|
256
|
+
with ThreadPoolExecutor(
|
257
|
+
max_workers=self.annot_num_workers
|
258
|
+
) as executor: # limit annot workers
|
259
|
+
annot_threads = [
|
260
|
+
executor.submit(self.input_object.upload_annotations, inp_batch, False)
|
261
|
+
for inp_batch in annots
|
262
|
+
]
|
263
|
+
|
264
|
+
for job in as_completed(annot_threads):
|
265
|
+
result = job.result()
|
266
|
+
if result:
|
267
|
+
retry_annot_upload.extend(result)
|
268
|
+
|
269
|
+
return retry_annot_upload
|
270
|
+
|
271
|
+
def _delete_failed_inputs(
|
272
|
+
self,
|
273
|
+
batch_input_ids: List[int],
|
274
|
+
dataset_obj: ClarifaiDatasetType,
|
275
|
+
upload_response: MultiInputResponse = None,
|
276
|
+
batch_no: Optional[int] = None,
|
277
|
+
) -> Tuple[List[int], List[int]]:
|
278
|
+
"""Delete failed input ids from clarifai platform dataset.
|
279
|
+
|
280
|
+
Args:
|
281
|
+
batch_input_ids: batch input ids
|
282
|
+
dataset_obj: ClarifaiDataset object
|
283
|
+
upload_response: upload response proto
|
284
|
+
|
285
|
+
Returns:
|
286
|
+
success_inputs: upload success input ids
|
287
|
+
failed_inputs: upload failed input ids
|
288
|
+
"""
|
289
|
+
success_status = status_pb2.Status(code=status_code_pb2.INPUT_DOWNLOAD_SUCCESS)
|
290
|
+
input_ids = {dataset_obj.all_input_ids[id]: id for id in batch_input_ids}
|
291
|
+
response = self._grpc_request(
|
292
|
+
self.STUB.ListInputs,
|
293
|
+
service_pb2.ListInputsRequest(
|
294
|
+
ids=list(input_ids.keys()),
|
295
|
+
per_page=len(input_ids),
|
296
|
+
user_app_id=self.user_app_id,
|
297
|
+
status=success_status,
|
298
|
+
),
|
299
|
+
)
|
300
|
+
response_dict = MessageToDict(response)
|
301
|
+
success_inputs = response_dict.get('inputs', [])
|
302
|
+
|
303
|
+
success_input_ids = [input.get('id') for input in success_inputs]
|
304
|
+
failed_input_ids = list(set(input_ids) - set(success_input_ids.copy()))
|
305
|
+
# check duplicate input ids
|
306
|
+
duplicate_input_ids = [
|
307
|
+
input.id
|
308
|
+
for input in upload_response.inputs
|
309
|
+
if input.status.details == 'Input has a duplicate ID.'
|
310
|
+
] # handling duplicte ID failures.
|
311
|
+
if duplicate_input_ids:
|
312
|
+
success_input_ids = list(
|
313
|
+
set(success_input_ids.copy()) - set(duplicate_input_ids.copy())
|
314
|
+
)
|
315
|
+
failed_input_ids = list(set(failed_input_ids) - set(duplicate_input_ids))
|
316
|
+
duplicate_details = [
|
317
|
+
[
|
318
|
+
input_ids[id],
|
319
|
+
id,
|
320
|
+
"Input has a duplicate ID.",
|
321
|
+
getattr(dataset_obj.data_generator[input_ids[id]], 'image_path', None),
|
322
|
+
getattr(dataset_obj.data_generator[input_ids[id]], 'labels', None),
|
323
|
+
getattr(dataset_obj.data_generator[input_ids[id]], 'metadata', None),
|
324
|
+
]
|
325
|
+
for id in duplicate_input_ids
|
326
|
+
]
|
327
|
+
duplicate_table = tabulate(
|
328
|
+
duplicate_details,
|
329
|
+
headers=["Index", "Input ID", "Status", "Image Path", "Labels", "Metadata"],
|
330
|
+
tablefmt="grid",
|
331
|
+
)
|
332
|
+
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
333
|
+
self.logger.warning(
|
334
|
+
f"{timestamp}\nFailed to upload {len(duplicate_input_ids)} inputs due to duplicate IDs in current batch {batch_no}:\n{duplicate_table}\n\n"
|
335
|
+
)
|
336
|
+
|
337
|
+
# delete failed inputs
|
338
|
+
self._grpc_request(
|
339
|
+
self.STUB.DeleteInputs,
|
340
|
+
service_pb2.DeleteInputsRequest(user_app_id=self.user_app_id, ids=failed_input_ids),
|
341
|
+
)
|
342
|
+
return [input_ids[id] for id in success_input_ids], [
|
343
|
+
input_ids[id] for id in failed_input_ids
|
419
344
|
]
|
420
345
|
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
346
|
+
def _upload_inputs_annotations(
|
347
|
+
self,
|
348
|
+
batch_input_ids: List[int],
|
349
|
+
dataset_obj: ClarifaiDatasetType,
|
350
|
+
batch_no: Optional[int] = None,
|
351
|
+
is_retry_duplicates: bool = False,
|
352
|
+
) -> Tuple[List[int], List[resources_pb2.Annotation], MultiInputResponse]:
|
353
|
+
"""Uploads batch of inputs and annotations concurrently to clarifai platform dataset.
|
354
|
+
|
355
|
+
Args:
|
356
|
+
batch_input_ids: batch input ids
|
357
|
+
dataset_obj: ClarifaiDataset object
|
358
|
+
|
359
|
+
Returns:
|
360
|
+
failed_input_ids: failed input ids
|
361
|
+
retry_annot_protos: failed annot protos
|
362
|
+
response: upload response proto
|
363
|
+
"""
|
364
|
+
input_protos, _ = dataset_obj.get_protos(batch_input_ids)
|
365
|
+
if is_retry_duplicates:
|
366
|
+
for inp in input_protos:
|
367
|
+
inp.id = uuid.uuid4().hex
|
368
|
+
|
369
|
+
input_job_id, _response = self.input_object.upload_inputs(
|
370
|
+
inputs=input_protos, show_log=False
|
371
|
+
)
|
372
|
+
retry_annot_protos = []
|
373
|
+
|
374
|
+
self.input_object._wait_for_inputs(input_job_id)
|
375
|
+
success_input_ids, failed_input_ids = self._delete_failed_inputs(
|
376
|
+
batch_input_ids, dataset_obj, _response, batch_no
|
377
|
+
)
|
378
|
+
|
379
|
+
if self.task in ["visual_detection", "visual_segmentation"] and success_input_ids:
|
380
|
+
_, annotation_protos = dataset_obj.get_protos(success_input_ids)
|
381
|
+
chunked_annotation_protos = Chunker(annotation_protos, self.batch_size).chunk()
|
382
|
+
retry_annot_protos.extend(self._concurrent_annot_upload(chunked_annotation_protos))
|
383
|
+
|
384
|
+
return failed_input_ids, retry_annot_protos, _response
|
385
|
+
|
386
|
+
def _retry_uploads(
|
387
|
+
self,
|
388
|
+
failed_input_ids: List[int],
|
389
|
+
retry_annot_protos: List[resources_pb2.Annotation],
|
390
|
+
dataset_obj: ClarifaiDatasetType,
|
391
|
+
batch_no: Optional[int],
|
392
|
+
) -> None:
|
393
|
+
"""Retry failed uploads.
|
394
|
+
|
395
|
+
Args:
|
396
|
+
failed_input_ids: failed input ids
|
397
|
+
retry_annot_protos: failed annot protos
|
398
|
+
dataset_obj: ClarifaiDataset object
|
399
|
+
"""
|
400
|
+
|
401
|
+
for _retry in range(MAX_RETRIES):
|
402
|
+
if not failed_input_ids and not retry_annot_protos:
|
403
|
+
break
|
404
|
+
if failed_input_ids:
|
405
|
+
retry_input_ids = [dataset_obj.all_input_ids[id] for id in failed_input_ids]
|
406
|
+
logger.warning(
|
407
|
+
f"Retrying upload for {len(failed_input_ids)} inputs in current batch: {retry_input_ids}\n"
|
408
|
+
)
|
409
|
+
failed_retrying_inputs, _, retry_response = self._upload_inputs_annotations(
|
410
|
+
failed_input_ids, dataset_obj, batch_no
|
411
|
+
)
|
412
|
+
failed_input_ids = failed_retrying_inputs
|
413
|
+
if retry_annot_protos:
|
414
|
+
chunked_annotation_protos = Chunker(retry_annot_protos, self.batch_size).chunk()
|
415
|
+
_ = self._concurrent_annot_upload(chunked_annotation_protos)
|
416
|
+
|
417
|
+
# Log failed inputs
|
418
|
+
if failed_input_ids:
|
419
|
+
failed_inputs_logs = []
|
420
|
+
input_map = {input.id: input for input in retry_response.inputs}
|
421
|
+
for index in failed_retrying_inputs:
|
422
|
+
failed_id = dataset_obj.all_input_ids[index]
|
423
|
+
input_details = input_map.get(failed_id)
|
424
|
+
if input_details:
|
425
|
+
failed_input_details = [
|
426
|
+
index,
|
427
|
+
failed_id,
|
428
|
+
input_details.status.details,
|
429
|
+
getattr(dataset_obj.data_generator[index], 'image_path', None)
|
430
|
+
or getattr(dataset_obj.data_generator[index], 'text', None),
|
431
|
+
dataset_obj.data_generator[index].labels,
|
432
|
+
dataset_obj.data_generator[index].metadata,
|
433
|
+
]
|
434
|
+
failed_inputs_logs.append(failed_input_details)
|
435
|
+
|
436
|
+
failed_table = tabulate(
|
437
|
+
failed_inputs_logs,
|
438
|
+
headers=["Index", "Input ID", "Status", "Input", "Labels", "Metadata"],
|
439
|
+
tablefmt="grid",
|
440
|
+
)
|
441
|
+
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
442
|
+
self.logger.warning(
|
443
|
+
f"{timestamp}\nFailed to upload {len(failed_retrying_inputs)} inputs in current batch {batch_no} due to {retry_response}:\n{failed_table}\n\n"
|
444
|
+
)
|
445
|
+
|
446
|
+
def _data_upload(
|
447
|
+
self,
|
448
|
+
dataset_obj: ClarifaiDatasetType,
|
449
|
+
is_log_retry: bool = False,
|
450
|
+
log_retry_ids: List[int] = None,
|
451
|
+
**kwargs,
|
452
|
+
) -> None:
|
453
|
+
"""Uploads inputs and annotations to clarifai platform dataset.
|
454
|
+
|
455
|
+
Args:
|
456
|
+
dataset_obj: ClarifaiDataset object,
|
457
|
+
is_log_retry: True if the iteration is to retry uploads from logs.
|
458
|
+
**kwargs: Additional keyword arguments for retry uploading functionality..
|
459
|
+
|
460
|
+
Returns:
|
461
|
+
None
|
462
|
+
"""
|
463
|
+
if is_log_retry:
|
464
|
+
input_ids = log_retry_ids
|
465
|
+
else:
|
466
|
+
input_ids = list(range(len(dataset_obj)))
|
467
|
+
|
468
|
+
chunk_input_ids = Chunker(input_ids, self.batch_size).chunk()
|
469
|
+
with ThreadPoolExecutor(max_workers=self.num_workers) as executor:
|
470
|
+
with tqdm(total=len(chunk_input_ids), desc='Uploading Dataset') as progress:
|
471
|
+
# Submit all jobs to the executor and store the returned futures
|
472
|
+
futures = [
|
473
|
+
executor.submit(
|
474
|
+
self._upload_inputs_annotations,
|
475
|
+
batch_input_ids,
|
476
|
+
dataset_obj,
|
477
|
+
batch_no,
|
478
|
+
**kwargs,
|
479
|
+
)
|
480
|
+
for batch_no, batch_input_ids in enumerate(chunk_input_ids)
|
481
|
+
]
|
482
|
+
|
483
|
+
for batch_no, job in enumerate(as_completed(futures)):
|
484
|
+
retry_input_ids, retry_annot_protos, _ = job.result()
|
485
|
+
self._retry_uploads(retry_input_ids, retry_annot_protos, dataset_obj, batch_no)
|
486
|
+
progress.update()
|
487
|
+
|
488
|
+
def upload_dataset(
|
489
|
+
self,
|
490
|
+
dataloader: Type[ClarifaiDataLoader],
|
491
|
+
batch_size: int = 32,
|
492
|
+
get_upload_status: bool = False,
|
493
|
+
log_warnings: bool = False,
|
494
|
+
**kwargs,
|
495
|
+
) -> None:
|
496
|
+
"""Uploads a dataset to the app.
|
497
|
+
|
498
|
+
Args:
|
499
|
+
dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
|
500
|
+
batch_size (int): batch size for concurrent upload of inputs and annotations (max: 128)
|
501
|
+
get_upload_status (bool): True if you want to get the upload status of the dataset
|
502
|
+
log_warnings (bool): True if you want to save log warnings in a file
|
503
|
+
kwargs: Additional keyword arguments for retry uploading functionality..
|
504
|
+
"""
|
505
|
+
# set batch size and task
|
506
|
+
self.batch_size = min(self.batch_size, batch_size)
|
507
|
+
self.task = dataloader.task
|
508
|
+
if self.task not in DATASET_UPLOAD_TASKS:
|
509
|
+
raise UserError(
|
510
|
+
"Task should be one of \
|
446
511
|
'text_classification', 'visual_classification', \
|
447
512
|
'visual_detection', 'visual_segmentation', 'visual_captioning', 'multimodal_dataset'"
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
time.
|
634
|
-
|
635
|
-
|
636
|
-
|
637
|
-
|
638
|
-
|
639
|
-
|
640
|
-
|
641
|
-
|
642
|
-
|
643
|
-
|
644
|
-
|
645
|
-
|
646
|
-
|
647
|
-
|
648
|
-
|
649
|
-
|
650
|
-
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
|
656
|
-
|
657
|
-
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
663
|
-
|
664
|
-
|
665
|
-
|
666
|
-
|
667
|
-
|
668
|
-
|
669
|
-
|
670
|
-
|
671
|
-
self
|
672
|
-
|
673
|
-
|
674
|
-
|
675
|
-
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
683
|
-
|
684
|
-
|
685
|
-
|
686
|
-
|
687
|
-
|
688
|
-
|
689
|
-
|
690
|
-
|
691
|
-
|
513
|
+
)
|
514
|
+
|
515
|
+
if self.task == "text_classification":
|
516
|
+
dataset_obj = TextClassificationDataset(dataloader, self.id)
|
517
|
+
|
518
|
+
elif self.task == "visual_detection":
|
519
|
+
dataset_obj = VisualDetectionDataset(dataloader, self.id)
|
520
|
+
|
521
|
+
elif self.task == "visual_segmentation":
|
522
|
+
dataset_obj = VisualSegmentationDataset(dataloader, self.id)
|
523
|
+
|
524
|
+
elif self.task == "multimodal_dataset":
|
525
|
+
dataset_obj = MultiModalDataset(dataloader, self.id)
|
526
|
+
|
527
|
+
else: # visual_classification & visual_captioning
|
528
|
+
dataset_obj = VisualClassificationDataset(dataloader, self.id)
|
529
|
+
|
530
|
+
if get_upload_status:
|
531
|
+
pre_upload_stats = self.get_upload_status(pre_upload=True)
|
532
|
+
|
533
|
+
# add file handler to log warnings
|
534
|
+
if log_warnings:
|
535
|
+
add_file_handler(
|
536
|
+
self.logger, f"Dataset_Upload{str(int(datetime.now().timestamp()))}.log"
|
537
|
+
)
|
538
|
+
self._data_upload(dataset_obj, **kwargs)
|
539
|
+
|
540
|
+
if get_upload_status:
|
541
|
+
self.get_upload_status(dataloader=dataloader, pre_upload_stats=pre_upload_stats)
|
542
|
+
|
543
|
+
def retry_upload_from_logs(
|
544
|
+
self,
|
545
|
+
log_file_path: str,
|
546
|
+
dataloader: Type[ClarifaiDataLoader],
|
547
|
+
retry_duplicates: bool = False,
|
548
|
+
log_warnings: bool = False,
|
549
|
+
**kwargs,
|
550
|
+
) -> None:
|
551
|
+
"""Retries failed uploads from the log file.
|
552
|
+
|
553
|
+
Args:
|
554
|
+
log_file_path (str): path to the log file
|
555
|
+
dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
|
556
|
+
retry_duplicate (bool): True if you want to retry duplicate inputs
|
557
|
+
kwargs: Additional keyword arguments for retry uploading functionality..
|
558
|
+
"""
|
559
|
+
|
560
|
+
duplicate_input_ids, failed_input_ids = process_log_files(log_file_path)
|
561
|
+
if log_warnings:
|
562
|
+
add_file_handler(
|
563
|
+
self.logger, f"Dataset_Upload{str(int(datetime.now().timestamp()))}.log"
|
564
|
+
)
|
565
|
+
|
566
|
+
if retry_duplicates and duplicate_input_ids:
|
567
|
+
logger.warning(f"Retrying upload for {len(duplicate_input_ids)} duplicate inputs...\n")
|
568
|
+
duplicate_inputs_indexes = [input["Index"] for input in duplicate_input_ids]
|
569
|
+
self.upload_dataset(
|
570
|
+
dataloader=dataloader,
|
571
|
+
log_retry_ids=duplicate_inputs_indexes,
|
572
|
+
is_retry_duplicates=True,
|
573
|
+
is_log_retry=True,
|
574
|
+
**kwargs,
|
575
|
+
)
|
576
|
+
|
577
|
+
if failed_input_ids:
|
578
|
+
# failed_inputs= ([input["Input_ID"] for input in failed_input_ids])
|
579
|
+
logger.warning(f"Retrying upload for {len(failed_input_ids)} failed inputs...\n")
|
580
|
+
failed_input_indexes = [input["Index"] for input in failed_input_ids]
|
581
|
+
self.upload_dataset(
|
582
|
+
dataloader=dataloader,
|
583
|
+
log_retry_ids=failed_input_indexes,
|
584
|
+
is_log_retry=True,
|
585
|
+
**kwargs,
|
586
|
+
)
|
587
|
+
|
588
|
+
def upload_from_csv(
|
589
|
+
self,
|
590
|
+
csv_path: str,
|
591
|
+
input_type: str = 'text',
|
592
|
+
csv_type: str = None,
|
593
|
+
labels: bool = True,
|
594
|
+
batch_size: int = 128,
|
595
|
+
) -> None:
|
596
|
+
"""Uploads dataset from a csv file.
|
597
|
+
|
598
|
+
Args:
|
599
|
+
csv_path (str): path to the csv file
|
600
|
+
input_type (str): type of the dataset(text, image)
|
601
|
+
csv_type (str): type of the csv file(raw, url, file_path)
|
602
|
+
labels (bool): True if csv file has labels column
|
603
|
+
batch_size (int): batch size for concurrent upload of inputs and annotations
|
604
|
+
|
605
|
+
Example:
|
606
|
+
>>> from clarifai.client.dataset import Dataset
|
607
|
+
>>> dataset = Dataset(user_id = 'user_id', app_id = 'demo_app', dataset_id = 'demo_dataset')
|
608
|
+
>>> dataset.upload_from_csv(csv_path='csv_path', input_type='text', csv_type='raw, labels=True)
|
609
|
+
|
610
|
+
Note:
|
611
|
+
CSV file supports 'inputid', 'input', 'concepts', 'metadata', 'geopoints' columns.
|
612
|
+
All the data in the CSV should be in double quotes.
|
613
|
+
metadata should be in single quotes format. Example: "{'key': 'value'}"
|
614
|
+
geopoints should be in "long,lat" format.
|
615
|
+
"""
|
616
|
+
if input_type not in ['image', 'text', 'video', 'audio']:
|
617
|
+
raise UserError('Invalid input type, it should be image,text,audio or video')
|
618
|
+
if csv_type not in ['raw', 'url', 'file_path']:
|
619
|
+
raise UserError('Invalid csv type, it should be raw, url or file_path')
|
620
|
+
assert csv_path.endswith('.csv'), 'csv_path should be a csv file'
|
621
|
+
if csv_type == 'raw' and input_type != 'text':
|
622
|
+
raise UserError('Only text input type is supported for raw csv type')
|
623
|
+
batch_size = min(128, batch_size)
|
624
|
+
input_protos = self.input_object.get_inputs_from_csv(
|
625
|
+
csv_path=csv_path,
|
626
|
+
input_type=input_type,
|
627
|
+
csv_type=csv_type,
|
628
|
+
dataset_id=self.id,
|
629
|
+
labels=labels,
|
630
|
+
)
|
631
|
+
self.input_object._bulk_upload(inputs=input_protos, batch_size=batch_size)
|
632
|
+
|
633
|
+
```python
    def upload_from_folder(
        self, folder_path: str, input_type: str, labels: bool = False, batch_size: int = 128
    ) -> None:
        """Upload dataset from folder.

        Args:
            folder_path (str): Path to the folder containing images.
            input_type (str): type of the dataset (text, image)
            labels (bool): True if folder name is the label for the inputs
            batch_size (int): batch size for concurrent upload of inputs and annotations

        Example:
            >>> from clarifai.client.dataset import Dataset
            >>> dataset = Dataset(user_id='user_id', app_id='demo_app', dataset_id='demo_dataset')
            >>> dataset.upload_from_folder(folder_path='folder_path', input_type='text', labels=True)

        Note: The filename is used as the input_id.
        """
        if input_type not in ['image', 'text']:
            raise UserError('Invalid input type it should be image or text')
        if input_type == 'image':
            input_protos = self.input_object.get_image_inputs_from_folder(
                folder_path=folder_path, dataset_id=self.id, labels=labels
            )
        if input_type == 'text':
            input_protos = self.input_object.get_text_inputs_from_folder(
                folder_path=folder_path, dataset_id=self.id, labels=labels
            )
        self.input_object._bulk_upload(inputs=input_protos, batch_size=batch_size)
```
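The docstring above implies that, with `labels=True`, the folder name is used as the label and each filename as the input_id. Below is a minimal sketch under that assumption; the `pets/cat` and `pets/dog` paths are hypothetical.

```python
from clarifai.client.dataset import Dataset

# Hypothetical layout on disk. With labels=True the folder name is used as
# the concept label, and each filename is used as the input_id:
#
#   pets/
#   |-- cat/            -> label "cat"
#   |   |-- cat_001.jpg
#   |   `-- cat_002.jpg
#   `-- dog/            -> label "dog"
#       `-- dog_001.jpg

dataset = Dataset(user_id="user_id", app_id="demo_app", dataset_id="demo_dataset")
for label_folder in ("pets/cat", "pets/dog"):
    dataset.upload_from_folder(folder_path=label_folder, input_type="image", labels=True)
```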
```python
    def get_upload_status(
        self,
        dataloader: Type[ClarifaiDataLoader] = None,
        delete_version: bool = False,
        timeout: int = 600,
        pre_upload_stats: Tuple[Dict[str, int], Dict[str, int]] = None,
        pre_upload: bool = False,
    ) -> Optional[Tuple[Dict[str, int], Dict[str, int]]]:
        """Creates a new dataset version and displays the upload status of the dataset.

        Args:
            dataloader (Type[ClarifaiDataLoader]): ClarifaiDataLoader object
            delete_version (bool): True if you want to delete the version after getting the upload status
            timeout (int): Timeout in seconds for getting the upload status. Default is 600 seconds.
            pre_upload_stats (Tuple[Dict[str, int], Dict[str, int]]): The pre upload stats for the dataset.
            pre_upload (bool): True if you want to get the pre upload stats for the dataset.

        Example:
            >>> from clarifai.client.dataset import Dataset
            >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
            >>> dataset.get_upload_status(dataloader)

        Note:
            This is a beta feature and is subject to change.
        """
        self.logger.info("Getting dataset upload status...")
        dataset_version_id = uuid.uuid4().hex
        _ = self.create_version(id=dataset_version_id, description="SDK Upload Status")

        request_data = dict(
            user_app_id=self.user_app_id,
            dataset_id=self.id,
            dataset_version_id=dataset_version_id,
        )

        start_time = time.time()
        backoff_iterator = BackoffIterator(10)
        while True:
            dataset_metrics_response = self._grpc_request(
                self.STUB.ListDatasetVersionMetricsGroups,
                service_pb2.ListDatasetVersionMetricsGroupsRequest(**request_data),
            )

            if dataset_metrics_response.status.code != status_code_pb2.SUCCESS:
                self.delete_version(dataset_version_id)
                raise Exception(
                    "Failed to get dataset metrics {}".format(dataset_metrics_response.status)
                )

            dict_response = MessageToDict(dataset_metrics_response)
            if len(dict_response.keys()) == 1 and time.time() - start_time < timeout:
                self.logger.info("Crunching the dataset metrics. Please wait...")
                time.sleep(next(backoff_iterator))
                continue
            else:
                if time.time() - start_time > timeout:
                    self.delete_version(dataset_version_id)
                    raise UserError(
                        "Dataset metrics are taking too long to process. Please try again later."
                    )
                break
        # get pre upload stats
        if pre_upload:
            return DisplayUploadStatus.get_dataset_version_stats(dataset_metrics_response)

        dataset_info_dict = dict(user_id=self.user_id, app_id=self.app_id, dataset_id=self.id)
        DisplayUploadStatus(
            dataloader, dataset_metrics_response, dataset_info_dict, pre_upload_stats
        )

        if delete_version:
            self.delete_version(dataset_version_id)
```
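Reading the parameters together, one plausible workflow is to snapshot the metrics with `pre_upload=True` before an upload, upload, and then display the status against that snapshot. In the sketch below, `MyDataLoader` is a hypothetical `ClarifaiDataLoader` subclass defined elsewhere, not something provided by this package.

```python
from clarifai.client.dataset import Dataset

# MyDataLoader is a hypothetical ClarifaiDataLoader subclass defined elsewhere.
dataloader = MyDataLoader()
dataset = Dataset(user_id="user_id", app_id="app_id", dataset_id="dataset_id")

# Snapshot the dataset metrics before uploading (beta feature, see the Note above).
pre_upload_stats = dataset.get_upload_status(pre_upload=True)

dataset.upload_dataset(dataloader=dataloader)

# Display the post-upload status relative to the snapshot and delete the
# temporary dataset version created for the status check.
dataset.get_upload_status(
    dataloader=dataloader,
    pre_upload_stats=pre_upload_stats,
    delete_version=True,
)
```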
```python
    def merge_dataset(self, merge_dataset_id: str) -> None:
        """Merges another dataset into the current dataset.

        Args:
            merge_dataset_id (str): The dataset ID of the dataset to merge.

        Example:
            >>> from clarifai.client.dataset import Dataset
            >>> dataset = Dataset(dataset_id='dataset_id', user_id='user_id', app_id='app_id')
            >>> dataset.merge_dataset(merge_dataset_id='merge_dataset_id')
        """
        dataset_filter = resources_pb2.Filter(
            input=resources_pb2.Input(dataset_ids=[merge_dataset_id])
        )
        query = resources_pb2.Search(query=resources_pb2.Query(filters=[dataset_filter]))
        request = service_pb2.PostDatasetInputsRequest(
            user_app_id=self.user_app_id, dataset_id=self.id, search=query
        )

        response = self._grpc_request(self.STUB.PostDatasetInputs, request)
        if response.status.code != status_code_pb2.SUCCESS:
            raise Exception(response.status)
        self.logger.info("\nDataset Merged\n%s", response.status)
```
```python
    def archive_zip(self, wait: bool = True) -> str:
        """Exports the dataset to a zip file URL."""
        request = service_pb2.PutDatasetVersionExportsRequest(
            user_app_id=self.user_app_id,
            dataset_id=self.id,
            dataset_version_id=self.version.id,
            exports=[
                resources_pb2.DatasetVersionExport(
                    format=resources_pb2.DatasetVersionExportFormat.CLARIFAI_DATA_PROTOBUF
                )
            ],
        )

        response = self._grpc_request(self.STUB.PutDatasetVersionExports, request)
        if response.status.code != status_code_pb2.SUCCESS:
            raise Exception(response.status)
        if wait:
            while response.exports[0].status.code in (
                status_code_pb2.DATASET_VERSION_EXPORT_PENDING,
                status_code_pb2.DATASET_VERSION_EXPORT_IN_PROGRESS,
            ):
                time.sleep(1)
                response = self._grpc_request(self.STUB.PutDatasetVersionExports, request)
                if response.status.code != status_code_pb2.SUCCESS:
                    raise Exception(response.status)
        if response.exports[0].status.code != status_code_pb2.DATASET_VERSION_EXPORT_SUCCESS:
            raise Exception(response.exports[0].status)
        return response.exports[0].url
```

(The diff also removes the previous implementation at old lines 694-741; its content is not shown in this extract.)
```python
    def export(
        self,
        save_path: str,
        archive_url: str = None,
        local_archive_path: str = None,
        split: str = 'all',
        num_workers: int = 4,
    ) -> None:
        """Exports the Clarifai protobuf dataset to a local archive.

        Args:
            save_path (str): The path to save the archive to.
            archive_url (str): The URL to the Clarifai protobuf archive.
            local_archive_path (str): The path to the local Clarifai protobuf archive.
            split (str): Export dataset inputs in the directory format {split}/{input_type}. Default is all.
            num_workers (int): Number of workers to use for downloading the archive. Default is 4.

        Example:
            >>> from clarifai.client.dataset import Dataset
            >>> Dataset().export(save_path='output.zip')
        """
        if local_archive_path and not os.path.exists(local_archive_path):
            raise UserError(f"Archive {local_archive_path} does not exist.")
        if not archive_url and not local_archive_path:
            archive_url = self.archive_zip()
        # Create a session object and set auth header
        session = requests.Session()
        retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))
        session.headers.update({'Authorization': self.metadata[0][1]})
        with DatasetExportReader(
            session=session, archive_url=archive_url, local_archive_path=local_archive_path
        ) as reader:
            InputAnnotationDownloader(session, reader, num_workers).download_archive(
                save_path=save_path, split=split
            )
```
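A minimal sketch combining the two export paths above; `output.zip` and `train_only.zip` are arbitrary file names, and the dataset is assumed to already have a current version set, since archive_zip reads `self.version.id`.

```python
from clarifai.client.dataset import Dataset

# Assumes the dataset already has a current version (archive_zip uses self.version.id).
dataset = Dataset(user_id="user_id", app_id="app_id", dataset_id="dataset_id")

# Simplest path: export() requests the archive itself when neither archive_url
# nor local_archive_path is given, then writes the zip to save_path.
dataset.export(save_path="output.zip")

# Or request the export URL explicitly (archive_zip polls until the export is
# ready when wait=True) and reuse it, organizing inputs under train/{input_type}.
archive_url = dataset.archive_zip(wait=True)
dataset.export(save_path="train_only.zip", archive_url=archive_url, split="train", num_workers=8)
```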
```python
    def __getattr__(self, name):
        return getattr(self.dataset_info, name)

    def __str__(self):
        init_params = [param for param in self.kwargs.keys()]
        attribute_strings = [
            f"{param}={getattr(self.dataset_info, param)}"
            for param in init_params
            if hasattr(self.dataset_info, param)
        ]
        return f"Dataset Details: \n{', '.join(attribute_strings)}\n"
```