PyPI - clarifai - Versions diffs - 10.11.1__py3-none-any.whl → 10.11.2rc2__py3-none-any.whl - Mend

clarifai 10.11.1 (py3-none-any.whl) → 10.11.2rc2 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (159) hide show

clarifai/runners/matt_llm_example.py ADDED Viewed

@@ -0,0 +1,129 @@
+from clarifai_grpc.grpc.api import resources_pb2, service_pb2
+from clarifai_grpc.grpc.api.status import status_code_pb2, status_pb2
+from collections.abc import Iterator
+from google.protobuf import json_format
+from clarifai.client.runner import Runner
+import time
+from threading import Thread
+import grpc
+import requests
+from transformers import (AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer)
+model_name_or_path = "TheBloke/Llama-2-7B-chat-GPTQ"  # Hugging Face repo id given to both from_pretrained calls below
+model_basename = "model"  # NOTE(review): not referenced in the code shown here
+use_triton = False  # NOTE(review): not referenced in the code shown here
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)  # runs at import time
+model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='auto')  # runs at import time
+streamer = TextIteratorStreamer(tokenizer)  # module-level streamer, created once at import time
+print("Model loaded")
+class MyRunner(Runner):
+  """A custom runner that adds "Hello World" to the end of the text and replaces the domain of the
+  image URL as an example.
+  """
+  def run_input(self, input: resources_pb2.Input, output_info: resources_pb2.OutputInfo,
+                **kwargs) -> resources_pb2.Output:
+    """This is the method that will be called when the runner is run. It takes in an input and
+    returns an output.
+    """
+    output = resources_pb2.Output()
+    data = input.data
+    # Optional use of output_info
+    params_dict = {}
+    if "params" in output_info:
+      params_dict = output_info["params"]
+    if data.text.raw != "":
+      output.data.text.raw = data.text.raw + "Hello World" + params_dict.get(
+          "hello", "") + kwargs.get("extra", "")
+    if data.image.url != "":
+      output.data.text.raw = data.image.url.replace("samples.clarifai.com",
+                                                    "newdomain.com" + params_dict.get("domain",))
+    return output
+  def generate(self, request: service_pb2.PostModelOutputsRequest
+              ) -> Iterator[service_pb2.MultiOutputResponse]:
+    """Example yielding a whole batch of streamed stuff back.
+    """
+    output_info = None
+    if request.model.model_version.id != "":
+      output_info = json_format.MessageToDict(
+          request.model.model_version.output_info, preserving_proto_field_name=True)
+    for inp in request.inputs:
+      data = inp.data
+      print('start')
+      if data.text.raw != "":
+        input_text = data.text.raw
+      elif data.text.url != "":
+        input_text = str(requests.get(data.text.url).text)
+      else:
+        raise Exception("Need to include data.text.raw or data.text.url in your inputs.")
+      st = time.time()
+      max_tokens = 1024
+      # # Method 1
+      inputs = tokenizer(input_text, return_tensors='pt') #.input_ids.cuda()
+      generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
+      thread = Thread(target=model.generate, kwargs=generation_kwargs)
+      thread.start()
+      times = []
+      st = time.time()
+      total_start = st
+      for new_text in streamer:
+        duration = time.time() - st
+        st = time.time()
+        print(f"Duration: {duration}")
+        times.append(duration)
+        # for new_text in ["hello", "world", "i'm", "streaming"]:
+        # out = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=max_tokens)
+        # out_text = tokenizer.decode(out[0], skip_special_tokens=True)
+        # output.data.text.raw = out_text.replace(input_text, '')
+        # # # Method 2
+        # print('before')
+        # pipe = pipeline(
+        #     "text-generation",
+        #     model=model,
+        #     tokenizer=tokenizer,
+        #     streamer=streamer,
+        #     max_new_tokens=max_tokens,
+        #     temperature=0.7,
+        #     top_p=0.95,
+        #     repetition_penalty=1.15,
+        #     return_full_text=False)
+        # print('pipe')
+        # a = pipe(input_text)
+        # print(a)
+        print("Posting: ", new_text)
+        output = resources_pb2.Output()
+        output.data.text.raw = new_text
+        result = service_pb2.MultiOutputResponse(
+            status=status_pb2.Status(
+                code=status_code_pb2.SUCCESS,
+                description="Success",
+            ),
+            outputs=[output],
+        )
+        yield result
+      print(f"Total time: {time.time() - total_start}")
+      print(f"Average time: {sum(times) / len(times)}")
+if __name__ == '__main__':
+  # Set these environment variables before running the example:
+  #   CLARIFAI_PAT
+  #   CLARIFAI_USER_ID
+  # Create a runner in the Clarifai API first, then pass its ID as runner_id below.
+  MyRunner(runner_id="matt-test-runner", base_url="http://q6:32013", num_parallel_polls=1).start()

clarifai/runners/matt_llm_example.py~ ADDED Viewed

@@ -0,0 +1,128 @@
+from clarifai_grpc.grpc.api import resources_pb2, service_pb2
+from clarifai_grpc.grpc.api.status import status_code_pb2, status_pb2
+from collections.abc import Iterator
+from google.protobuf import json_format
+from clarifai.client.runner import Runner
+import time
+from threading import Thread
+import grpc
+import requests
+from transformers import (AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer)
+model_name_or_path = "TheBloke/Llama-2-7B-chat-GPTQ"  # Hugging Face repo id given to both from_pretrained calls below
+model_basename = "model"  # NOTE(review): not referenced in the code shown here
+use_triton = False  # NOTE(review): not referenced in the code shown here
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)  # runs at import time
+model = AutoModelForCausalLM.from_pretrained(model_name_or_path, device_map='auto')  # runs at import time
+streamer = TextIteratorStreamer(tokenizer)  # module-level streamer, created once at import time
+print("Model loaded")
+class MyRunner(Runner):
+  """A custom runner that adds "Hello World" to the end of the text and replaces the domain of the
+  image URL as an example.
+  """
+  def run_input(self, input: resources_pb2.Input, output_info: resources_pb2.OutputInfo,
+                **kwargs) -> resources_pb2.Output:
+    """This is the method that will be called when the runner is run. It takes in an input and
+    returns an output.
+    """
+    output = resources_pb2.Output()
+    data = input.data
+    # Optional use of output_info
+    params_dict = {}
+    if "params" in output_info:
+      params_dict = output_info["params"]
+    if data.text.raw != "":
+      output.data.text.raw = data.text.raw + "Hello World" + params_dict.get(
+          "hello", "") + kwargs.get("extra", "")
+    if data.image.url != "":
+      output.data.text.raw = data.image.url.replace("samples.clarifai.com",
+                                                    "newdomain.com" + params_dict.get("domain",))
+    return output
+  def generate(self, request: service_pb2.PostModelOutputsRequest
+              ) -> Iterator[service_pb2.MultiOutputResponse]:
+    """Example yielding a whole batch of streamed stuff back.
+    """
+    output_info = None
+    if request.model.model_version.id != "":
+      output_info = json_format.MessageToDict(
+          request.model.model_version.output_info, preserving_proto_field_name=True)
+    for inp in request.inputs:
+      data = inp.data
+      print('start')
+      if data.text.raw != "":
+        input_text = data.text.raw
+      elif data.text.url != "":
+        input_text = str(requests.get(data.text.url).text)
+      else:
+        raise Exception("Need to include data.text.raw or data.text.url in your inputs.")
+      st = time.time()
+      max_tokens = 1024
+      # # Method 1
+      inputs = tokenizer(input_text, return_tensors='pt') #.input_ids.cuda()
+      generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=max_tokens)
+      thread = Thread(target=model.generate, kwargs=generation_kwargs)
+      thread.start()
+      times = []
+      st = time.time()
+      for new_text in streamer:
+        duration = time.time() - st
+        st = time.time()
+        print(f"Duration: {duration}")
+        times.append(duration)
+        # for new_text in ["hello", "world", "i'm", "streaming"]:
+        # out = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=max_tokens)
+        # out_text = tokenizer.decode(out[0], skip_special_tokens=True)
+        # output.data.text.raw = out_text.replace(input_text, '')
+        # # # Method 2
+        # print('before')
+        # pipe = pipeline(
+        #     "text-generation",
+        #     model=model,
+        #     tokenizer=tokenizer,
+        #     streamer=streamer,
+        #     max_new_tokens=max_tokens,
+        #     temperature=0.7,
+        #     top_p=0.95,
+        #     repetition_penalty=1.15,
+        #     return_full_text=False)
+        # print('pipe')
+        # a = pipe(input_text)
+        # print(a)
+        print("Posting: ", new_text)
+        output = resources_pb2.Output()
+        output.data.text.raw = new_text
+        result = service_pb2.MultiOutputResponse(
+            status=status_pb2.Status(
+                code=status_code_pb2.SUCCESS,
+                description="Success",
+            ),
+            outputs=[output],
+        )
+        yield result
+      print(f"Total time: {time.time() - st}")
+      print(f"Average time: {sum(times) / len(times)}")
+if __name__ == '__main__':
+  # Set these environment variables before running the example:
+  #   CLARIFAI_PAT
+  #   CLARIFAI_USER_ID
+  # Create a runner in the Clarifai API first, then pass its ID as runner_id below.
+  MyRunner(runner_id="matt-test-runner", base_url="http://q6:32013", num_parallel_polls=1).start()

clarifai/runners/models/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/models/__pycache__/base_typed_model.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/models/__pycache__/model_class.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/models/__pycache__/model_run_locally.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/models/__pycache__/model_runner.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/models/__pycache__/model_servicer.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/models/__pycache__/model_upload.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/models/model_upload.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import re
+import sys
 import time
 from string import Template
@@ -8,6 +9,7 @@ from clarifai_grpc.grpc.api import resources_pb2, service_pb2
 from clarifai_grpc.grpc.api.status import status_code_pb2
 from google.protobuf import json_format
 from rich import print
+from rich.markup import escape
 from clarifai.client import BaseClient
 from clarifai.runners.utils.const import (AVAILABLE_PYTHON_IMAGES, AVAILABLE_TORCH_IMAGES,
@@ -27,10 +29,17 @@ def _clear_line(n: int = 1) -> None:
 class ModelUploader:
-  def __init__(self, folder: str):
+  def __init__(self, folder: str, validate_api_ids: bool = True):
+    """
+    :param folder: The folder containing the model.py, config.yaml, requirements.txt and
+    checkpoints.
+    :param validate_api_ids: Whether to validate the user_id and app_id in the config file.
+    """
     self._client = None
     self.folder = self._validate_folder(folder)
     self.config = self._load_config(os.path.join(self.folder, 'config.yaml'))
+    self.validate_api_ids = validate_api_ids
+    self._validate_config()
     self.model_proto = self._get_model_proto()
     self.model_id = self.model_proto.id
     self.model_version_id = None
@@ -69,13 +78,64 @@ class ModelUploader:
       assert "repo_id" in self.config.get("checkpoints"), "No repo_id specified in the config file"
       repo_id = self.config.get("checkpoints").get("repo_id")
-      # prefer env var for HF_TOKEN but if not provided then use the one from config.yaml if any.
-      if 'HF_TOKEN' in os.environ:
-        hf_token = os.environ['HF_TOKEN']
-      else:
-        hf_token = self.config.get("checkpoints").get("hf_token", None)
+      # get from config.yaml otherwise fall back to HF_TOKEN env var.
+      hf_token = self.config.get("checkpoints").get("hf_token", os.environ.get("HF_TOKEN", None))
       return repo_id, hf_token
+  def _check_app_exists(self):
+    if not self.validate_api_ids:
+      return True
+    resp = self.client.STUB.GetApp(service_pb2.GetAppRequest(user_app_id=self.client.user_app_id))
+    if resp.status.code == status_code_pb2.SUCCESS:
+      return True
+    return False
+  def _validate_config_model(self):
+    assert "model" in self.config, "model section not found in the config file"
+    model = self.config.get('model')
+    assert "user_id" in model, "user_id not found in the config file"
+    assert "app_id" in model, "app_id not found in the config file"
+    assert "model_type_id" in model, "model_type_id not found in the config file"
+    assert "id" in model, "model_id not found in the config file"
+    if '.' in model.get('id'):
+      logger.error(
+          "Model ID cannot contain '.', please remove it from the model_id in the config file")
+      sys.exit(1)
+    assert model.get('user_id') != "", "user_id cannot be empty in the config file"
+    assert model.get('app_id') != "", "app_id cannot be empty in the config file"
+    assert model.get('model_type_id') != "", "model_type_id cannot be empty in the config file"
+    assert model.get('id') != "", "model_id cannot be empty in the config file"
+    if not self._check_app_exists():
+      logger.error(
+          f"App {self.client.user_app_id.app_id} not found for user {self.client.user_app_id.user_id}"
+      )
+      sys.exit(1)
+  def _validate_config(self):
+    self._validate_config_model()
+    if self.config.get("checkpoints"):
+      self._validate_config_checkpoints()
+    assert "inference_compute_info" in self.config, "inference_compute_info not found in the config file"
+    if self.config.get("concepts"):
+      model_type_id = self.config.get('model').get('model_type_id')
+      assert model_type_id in CONCEPTS_REQUIRED_MODEL_TYPE, f"Model type {model_type_id} not supported for concepts"
+    if self.config.get("checkpoints"):
+      _, hf_token = self._validate_config_checkpoints()
+      if hf_token:
+        is_valid_token = HuggingFaceLoader.validate_hftoken(hf_token)
+        if not is_valid_token:
+          logger.error(
+              "Invalid Hugging Face token provided in the config file, this might cause issues with downloading the restricted model checkpoints."
+          )
+          logger.info("Continuing without Hugging Face token")
   @property
   def client(self):
     if self._client is None:
@@ -259,6 +319,7 @@ class ModelUploader:
     if not success:
       logger.error(f"Failed to download checkpoints for model {repo_id}")
+      sys.exit(1)
     else:
       logger.info(f"Downloaded checkpoints for model {repo_id}")
     return success
@@ -353,10 +414,10 @@ class ModelUploader:
     model_version_proto = self.get_model_version_proto()
     if download_checkpoints:
-      tar_cmd = f"tar --exclude=*~ -czvf {self.tar_file} -C {self.folder} ."
+      tar_cmd = f"tar --exclude=*~ --exclude={self.tar_file} -czvf {self.tar_file} -C {self.folder} ."
     else:  # we don't want to send the checkpoints up even if they are in the folder.
       logger.info(f"Skipping {self.checkpoint_path} in the tar file that is uploaded.")
-      tar_cmd = f"tar --exclude={self.checkpoint_suffix} --exclude=*~ -czvf {self.tar_file} -C {self.folder} ."
+      tar_cmd = f"tar --exclude={self.checkpoint_suffix} --exclude=*~ --exclude={self.tar_file} -czvf {self.tar_file} -C {self.folder} ."
     # Tar the folder
     logger.debug(tar_cmd)
     os.system(tar_cmd)
@@ -366,6 +427,9 @@ class ModelUploader:
     logger.info(f"Size of the tar is: {file_size} bytes")
     self.maybe_create_model()
+    if not self.check_model_exists():
+      logger.error(f"Failed to create model: {self.model_proto.id}")
+      sys.exit(1)
     for response in self.client.STUB.PostModelVersionsUpload(
         self.model_version_stream_upload_iterator(model_version_proto, file_path),):
@@ -430,7 +494,7 @@ class ModelUploader:
     file_size = os.path.getsize(file_path)
     logger.info(f"Uploading model version of model {self.model_proto.id}")
     logger.info(f"Using file '{os.path.basename(file_path)}' of size: {file_size} bytes")
-    return service_pb2.PostModelVersionsUploadRequest(
+    result = service_pb2.PostModelVersionsUploadRequest(
         upload_config=service_pb2.PostModelVersionsUploadConfig(
             user_app_id=self.client.user_app_id,
             model_id=self.model_proto.id,
@@ -438,6 +502,7 @@ class ModelUploader:
             total_size=file_size,
             is_v3=self.is_v3,
         ))
+    return result
   def get_model_build_logs(self):
     logs_request = service_pb2.ListLogEntriesRequest(
@@ -470,7 +535,7 @@ class ModelUploader:
         for log_entry in logs.log_entries:
           if log_entry.url not in seen_logs:
             seen_logs.add(log_entry.url)
-            print(f"Model Building Logs...: {log_entry.message.strip()}")
+            print(f"Model Building Logs...: {escape(log_entry.message.strip())}")
         time.sleep(1)
       elif status_code == status_code_pb2.MODEL_TRAINED:
         logger.info(f"\nModel build complete! (elapsed {time.time() - st:.1f}s)")

clarifai/runners/utils/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/utils/__pycache__/const.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/utils/__pycache__/data_handler.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/utils/__pycache__/data_utils.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/utils/__pycache__/loader.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/utils/__pycache__/logging.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/utils/__pycache__/url_fetcher.cpython-310.pyc ADDED Viewed

Binary file

clarifai/runners/utils/const.py CHANGED Viewed

@@ -1,39 +1,41 @@
-PYTHON_BASE_IMAGE = 'public.ecr.aws/clarifai-models/python-base:{python_version}'
-TORCH_BASE_IMAGE = 'public.ecr.aws/clarifai-models/torch:{torch_version}-py{python_version}-cuda{cuda_version}'
+import os
+registry = os.environ.get('CLARIFAI_BASE_IMAGE_REGISTRY', 'public.ecr.aws/clarifai-models')  # registry prefix; CLARIFAI_BASE_IMAGE_REGISTRY overrides the default
+PYTHON_BASE_IMAGE = registry + '/python-base:{python_version}'  # template with a {python_version} placeholder
+TORCH_BASE_IMAGE = registry + '/torch:{torch_version}-py{python_version}-cuda{cuda_version}'  # template with three placeholders
 # List of available python base images
-AVAILABLE_PYTHON_IMAGES = ['3.8', '3.9', '3.10', '3.11', '3.12', '3.13']
+AVAILABLE_PYTHON_IMAGES = ['3.11', '3.12', '3.13']
-DEFAULT_PYTHON_VERSION = 3.11
+DEFAULT_PYTHON_VERSION = 3.12
 # List of available torch images
 AVAILABLE_TORCH_IMAGES = [
-    '1.13.1-py3.8-cuda117',
-    '1.13.1-py3.9-cuda117',
-    '1.13.1-py3.10-cuda117',
-    '2.1.2-py3.8-cuda121',
-    '2.1.2-py3.9-cuda121',
-    '2.1.2-py3.10-cuda121',
-    '2.1.2-py3.11-cuda121',
-    '2.2.2-py3.8-cuda121',
-    '2.2.2-py3.9-cuda121',
-    '2.2.2-py3.10-cuda121',
     '2.2.2-py3.11-cuda121',
-    '2.2.2-py3.12-cuda121',
-    '2.3.1-py3.8-cuda121',
-    '2.3.1-py3.9-cuda121',
-    '2.3.1-py3.10-cuda121',
     '2.3.1-py3.11-cuda121',
-    '2.3.1-py3.12-cuda121',
-    '2.4.1-py3.8-cuda124',
-    '2.4.1-py3.9-cuda124',
-    '2.4.1-py3.10-cuda124',
+    '2.4.0-py3.11-cuda121',
+    '2.4.0-py3.11-cuda124',
+    '2.4.1-py3.11-cuda121',
     '2.4.1-py3.11-cuda124',
-    '2.4.1-py3.12-cuda124',
-    '2.5.1-py3.9-cuda124',
-    '2.5.1-py3.10-cuda124',
+    '2.5.1-py3.11-cuda121',
     '2.5.1-py3.11-cuda124',
+    '2.2.2-py3.12-cuda121',
+    '2.3.1-py3.12-cuda121',
+    '2.4.0-py3.12-cuda121',
+    '2.4.0-py3.12-cuda124',
+    '2.4.1-py3.12-cuda121',
+    '2.4.1-py3.12-cuda124',
+    '2.5.1-py3.12-cuda121',
     '2.5.1-py3.12-cuda124',
+    # '2.2.2-py3.13-cuda121',
+    # '2.3.1-py3.13-cuda121',
+    # '2.4.0-py3.13-cuda121',
+    # '2.4.0-py3.13-cuda124',
+    # '2.4.1-py3.13-cuda121',
+    # '2.4.1-py3.13-cuda124',
+    # '2.5.1-py3.13-cuda121',
+    # '2.5.1-py3.13-cuda124',
 ]
 CONCEPTS_REQUIRED_MODEL_TYPE = [
     'visual-classifier', 'visual-detector', 'visual-segmenter', 'text-classifier'

clarifai/runners/utils/loader.py CHANGED Viewed

@@ -1,6 +1,8 @@
+import fnmatch
 import importlib.util
 import json
 import os
+import shutil
 import subprocess
 from clarifai.utils.logging import logger
@@ -14,22 +16,28 @@ class HuggingFaceLoader:
     self.repo_id = repo_id
     self.token = token
     if token:
-      try:
-        if importlib.util.find_spec("huggingface_hub") is None:
-          raise ImportError(self.HF_DOWNLOAD_TEXT)
-        os.environ['HF_TOKEN'] = token
-        from huggingface_hub import HfApi
-        api = HfApi()
-        api.whoami(token=token)
+      if self.validate_hftoken(token):
         subprocess.run(f'huggingface-cli login --token={os.environ["HF_TOKEN"]}', shell=True)
-      except Exception as e:
-        logger.error(
-            f"Error setting up Hugging Face token, please make sure you have the correct token: {e}"
-        )
+        logger.info("Hugging Face token validated")
+      else:
         logger.info("Continuing without Hugging Face token")
+  @classmethod
+  def validate_hftoken(cls, hf_token: str):
+    try:
+      if importlib.util.find_spec("huggingface_hub") is None:
+        raise ImportError(cls.HF_DOWNLOAD_TEXT)
+      os.environ['HF_TOKEN'] = hf_token
+      from huggingface_hub import HfApi
+      api = HfApi()
+      api.whoami(token=hf_token)
+      return True
+    except Exception as e:
+      logger.error(
+          f"Error setting up Hugging Face token, please make sure you have the correct token: {e}")
+      return False
   def download_checkpoints(self, checkpoint_path: str):
     # throw error if huggingface_hub wasn't installed
     try:
@@ -46,10 +54,20 @@ class HuggingFaceLoader:
         if not is_hf_model_exists:
           logger.error("Model %s not found on Hugging Face" % (self.repo_id))
           return False
+        self.ignore_patterns = self._get_ignore_patterns()
         snapshot_download(
-            repo_id=self.repo_id, local_dir=checkpoint_path, local_dir_use_symlinks=False)
+            repo_id=self.repo_id,
+            local_dir=checkpoint_path,
+            local_dir_use_symlinks=False,
+            ignore_patterns=self.ignore_patterns)
+        # Remove the `.cache` folder if it exists
+        cache_path = os.path.join(checkpoint_path, ".cache")
+        if os.path.exists(cache_path) and os.path.isdir(cache_path):
+          shutil.rmtree(cache_path)
       except Exception as e:
-        logger.exception(f"Error downloading model checkpoints {e}")
+        logger.error(f"Error downloading model checkpoints {e}")
         return False
       finally:
         is_downloaded = self.validate_download(checkpoint_path)
@@ -94,11 +112,41 @@ class HuggingFaceLoader:
       from huggingface_hub import list_repo_files
     except ImportError:
       raise ImportError(self.HF_DOWNLOAD_TEXT)
+    # Get the list of files on the repo
+    repo_files = list_repo_files(self.repo_id, token=self.token)
+    self.ignore_patterns = self._get_ignore_patterns()
+    # Get the list of files on the repo that are not ignored
+    if getattr(self, "ignore_patterns", None):
+      patterns = self.ignore_patterns
+      def should_ignore(file_path):
+        return any(fnmatch.fnmatch(file_path, pattern) for pattern in patterns)
+      repo_files = [f for f in repo_files if not should_ignore(f)]
+    # Check if downloaded files match the files we expect (ignoring ignored patterns)
     checkpoint_dir_files = [
         f for dp, dn, fn in os.walk(os.path.expanduser(checkpoint_path)) for f in fn
     ]
-    return (len(checkpoint_dir_files) >= len(list_repo_files(self.repo_id))) and len(
-        list_repo_files(self.repo_id)) > 0
+    # Validate by comparing file lists
+    return len(checkpoint_dir_files) >= len(repo_files) and not (
+        len(set(repo_files) - set(checkpoint_dir_files)) > 0) and len(repo_files) > 0
+  def _get_ignore_patterns(self):
+    # check if model exists on HF
+    try:
+      from huggingface_hub import list_repo_files
+    except ImportError:
+      raise ImportError(self.HF_DOWNLOAD_TEXT)
+    # Get the list of files on the repo that are not ignored
+    repo_files = list_repo_files(self.repo_id, token=self.token)
+    self.ignore_patterns = None
+    if any(f.endswith(".safetensors") for f in repo_files):
+      self.ignore_patterns = ["**/original/*", "**/*.pth", "**/*.bin", "*.pth", "*.bin"]
+    return self.ignore_patterns
   @staticmethod
   def validate_config(checkpoint_path: str):

clarifai/runners/utils/logging.py ADDED Viewed

@@ -0,0 +1,6 @@
+import os
+from clarifai.utils.logging import get_logger
+logger_level = os.environ.get("LOG_LEVEL", "INFO")  # level name read from LOG_LEVEL; "INFO" when unset
+logger = get_logger(logger_level, __name__)  # module-level logger built by clarifai.utils.logging.get_logger