huggingface-hub 0.18.0rc0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Files changed (45)
  1. huggingface_hub/__init__.py +31 -5
  2. huggingface_hub/_commit_api.py +7 -11
  3. huggingface_hub/_inference_endpoints.py +348 -0
  4. huggingface_hub/_login.py +9 -7
  5. huggingface_hub/_multi_commits.py +1 -1
  6. huggingface_hub/_snapshot_download.py +6 -7
  7. huggingface_hub/_space_api.py +7 -4
  8. huggingface_hub/_tensorboard_logger.py +1 -0
  9. huggingface_hub/_webhooks_payload.py +7 -7
  10. huggingface_hub/commands/lfs.py +3 -6
  11. huggingface_hub/commands/user.py +1 -4
  12. huggingface_hub/constants.py +27 -0
  13. huggingface_hub/file_download.py +142 -134
  14. huggingface_hub/hf_api.py +1058 -503
  15. huggingface_hub/hf_file_system.py +57 -12
  16. huggingface_hub/hub_mixin.py +3 -5
  17. huggingface_hub/inference/_client.py +43 -8
  18. huggingface_hub/inference/_common.py +8 -16
  19. huggingface_hub/inference/_generated/_async_client.py +41 -8
  20. huggingface_hub/inference/_text_generation.py +43 -0
  21. huggingface_hub/inference_api.py +1 -1
  22. huggingface_hub/lfs.py +32 -14
  23. huggingface_hub/repocard_data.py +7 -0
  24. huggingface_hub/repository.py +19 -3
  25. huggingface_hub/templates/datasetcard_template.md +83 -43
  26. huggingface_hub/templates/modelcard_template.md +4 -3
  27. huggingface_hub/utils/__init__.py +1 -1
  28. huggingface_hub/utils/_cache_assets.py +3 -3
  29. huggingface_hub/utils/_cache_manager.py +6 -7
  30. huggingface_hub/utils/_datetime.py +3 -1
  31. huggingface_hub/utils/_errors.py +10 -0
  32. huggingface_hub/utils/_hf_folder.py +4 -2
  33. huggingface_hub/utils/_http.py +10 -1
  34. huggingface_hub/utils/_runtime.py +4 -2
  35. huggingface_hub/utils/endpoint_helpers.py +27 -175
  36. huggingface_hub/utils/insecure_hashlib.py +34 -0
  37. huggingface_hub/utils/logging.py +4 -6
  38. huggingface_hub/utils/sha.py +2 -1
  39. {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/METADATA +16 -15
  40. huggingface_hub-0.19.0.dist-info/RECORD +74 -0
  41. {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/WHEEL +1 -1
  42. huggingface_hub-0.18.0rc0.dist-info/RECORD +0 -72
  43. {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/LICENSE +0 -0
  44. {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/entry_points.txt +0 -0
  45. {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/top_level.txt +0 -0
huggingface_hub/templates/datasetcard_template.md
@@ -1,103 +1,143 @@
  ---
- # For reference on model card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
+ # For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
  # Doc / guide: https://huggingface.co/docs/hub/datasets-cards
  {{ card_data }}
  ---

  # Dataset Card for {{ pretty_name | default("Dataset Name", true) }}

- ## Dataset Description
+ <!-- Provide a quick summary of the dataset. -->

- - **Homepage:** {{ homepage_url | default("", true)}}
- - **Repository:** {{ repo_url | default("", true)}}
- - **Paper:** {{ paper_url | default("", true)}}
- - **Leaderboard:** {{ leaderboard_url | default("", true)}}
- - **Point of Contact:** {{ point_of_contact | default("", true)}}
+ {{ dataset_summary | default("", true) }}

- ### Dataset Summary
+ ## Dataset Details

- {{ dataset_summary | default("[More Information Needed]", true)}}
+ ### Dataset Description

- ### Supported Tasks and Leaderboards
+ <!-- Provide a longer summary of what this dataset is. -->

- {{ supported_tasks_and_leaderboards_section | default("[More Information Needed]", true)}}
+ {{ dataset_description | default("", true) }}

- ### Languages
+ - **Curated by:** {{ curators | default("[More Information Needed]", true)}}
+ - **Funded by [optional]:** {{ funded_by | default("[More Information Needed]", true)}}
+ - **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}}
+ - **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}}
+ - **License:** {{ license | default("[More Information Needed]", true)}}

- {{ languages_section | default("[More Information Needed]", true)}}
+ ### Dataset Sources [optional]

- ## Dataset Structure
+ <!-- Provide the basic links for the dataset. -->
+
+ - **Repository:** {{ repo | default("[More Information Needed]", true)}}
+ - **Paper [optional]:** {{ paper | default("[More Information Needed]", true)}}
+ - **Demo [optional]:** {{ demo | default("[More Information Needed]", true)}}
+
+ ## Uses

- ### Data Instances
+ <!-- Address questions around how the dataset is intended to be used. -->

- {{ data_instances_section | default("[More Information Needed]", true)}}
+ ### Direct Use

- ### Data Fields
+ <!-- This section describes suitable use cases for the dataset. -->

- {{ data_fields_section | default("[More Information Needed]", true)}}
+ {{ direct_use | default("[More Information Needed]", true)}}

- ### Data Splits
+ ### Out-of-Scope Use

- {{ data_splits_section | default("[More Information Needed]", true)}}
+ <!-- This section addresses misuse, malicious use, and uses that the dataset will not work well for. -->
+
+ {{ out_of_scope_use | default("[More Information Needed]", true)}}
+
+ ## Dataset Structure
+
+ <!-- This section provides a description of the dataset fields, and additional information about the dataset structure such as criteria used to create the splits, relationships between data points, etc. -->
+
+ {{ dataset_structure | default("[More Information Needed]", true)}}

  ## Dataset Creation

  ### Curation Rationale

+ <!-- Motivation for the creation of this dataset. -->
+
  {{ curation_rationale_section | default("[More Information Needed]", true)}}

  ### Source Data

- #### Initial Data Collection and Normalization
+ <!-- This section describes the source data (e.g. news text and headlines, social media posts, translated sentences, ...). -->
+
+ #### Data Collection and Processing
+
+ <!-- This section describes the data collection and processing process such as data selection criteria, filtering and normalization methods, tools and libraries used, etc. -->

- {{ data_collection_section | default("[More Information Needed]", true)}}
+ {{ data_collection_and_processing_section | default("[More Information Needed]", true)}}

- #### Who are the source language producers?
+ #### Who are the source data producers?

- {{ source_language_producers_section | default("[More Information Needed]", true)}}
+ <!-- This section describes the people or systems who originally created the data. It should also include self-reported demographic or identity information for the source data creators if this information is available. -->

- ### Annotations
+ {{ source_data_producers_section | default("[More Information Needed]", true)}}
+
+ ### Annotations [optional]
+
+ <!-- If the dataset contains annotations which are not part of the initial data collection, use this section to describe them. -->

  #### Annotation process

+ <!-- This section describes the annotation process such as annotation tools used in the process, the amount of data annotated, annotation guidelines provided to the annotators, interannotator statistics, annotation validation, etc. -->
+
  {{ annotation_process_section | default("[More Information Needed]", true)}}

  #### Who are the annotators?

+ <!-- This section describes the people or systems who created the annotations. -->
+
  {{ who_are_annotators_section | default("[More Information Needed]", true)}}

- ### Personal and Sensitive Information
+ #### Personal and Sensitive Information
+
+ <!-- State whether the dataset contains data that might be considered personal, sensitive, or private (e.g., data that reveals addresses, uniquely identifiable names or aliases, racial or ethnic origins, sexual orientations, religious beliefs, political opinions, financial or health data, etc.). If efforts were made to anonymize the data, describe the anonymization process. -->
+
+ {{ personal_and_sensitive_information | default("[More Information Needed]", true)}}
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ {{ bias_risks_limitations | default("[More Information Needed]", true)}}
+
+ ### Recommendations

- {{ personal_and_sensitive_information_section | default("[More Information Needed]", true)}}
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

- ## Considerations for Using the Data
+ {{ bias_recommendations | default("Users should be made aware of the risks, biases and limitations of the dataset. More information needed for further recommendations.", true)}}

- ### Social Impact of Dataset
+ ## Citation [optional]

- {{ social_impact_section | default("[More Information Needed]", true)}}
+ <!-- If there is a paper or blog post introducing the dataset, the APA and Bibtex information for that should go in this section. -->

- ### Discussion of Biases
+ **BibTeX:**

- {{ discussion_of_biases_section | default("[More Information Needed]", true)}}
+ {{ citation_bibtex | default("[More Information Needed]", true)}}

- ### Other Known Limitations
+ **APA:**

- {{ known_limitations_section | default("[More Information Needed]", true)}}
+ {{ citation_apa | default("[More Information Needed]", true)}}

- ## Additional Information
+ ## Glossary [optional]

- ### Dataset Curators
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the dataset or dataset card. -->

- {{ dataset_curators_section | default("[More Information Needed]", true)}}
+ {{ glossary | default("[More Information Needed]", true)}}

- ### Licensing Information
+ ## More Information [optional]

- {{ licensing_information_section | default("[More Information Needed]", true)}}
+ {{ more_information | default("[More Information Needed]", true)}}

- ### Citation Information
+ ## Dataset Card Authors [optional]

- {{ citation_information_section | default("[More Information Needed]", true)}}
+ {{ dataset_card_authors | default("[More Information Needed]", true)}}

- ### Contributions
+ ## Dataset Card Contact

- {{ contributions_section | default("[More Information Needed]", true)}}
+ {{ dataset_card_contact | default("[More Information Needed]", true)}}
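The rewritten template brings the dataset card in line with the model card layout, with new variables such as `dataset_summary`, `curators`, and `direct_use`. As a hedged sketch of how these variables could be filled through the library's card API (all field values below are placeholders, not from the diff):

```python
# Minimal sketch: render the updated dataset card template with the new
# variables. The dataset name and field values are invented examples.
from huggingface_hub import DatasetCard, DatasetCardData

card_data = DatasetCardData(language="en", license="mit", pretty_name="Demo Dataset")
card = DatasetCard.from_template(
    card_data,
    dataset_summary="A tiny dataset used to demo the 0.19.0 template.",
    curators="Jane Doe",            # rendered under "Curated by:"
    direct_use="Toy benchmarks.",   # rendered under "### Direct Use"
)
card.save("README.md")  # unfilled variables fall back to their defaults
```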
huggingface_hub/templates/modelcard_template.md
@@ -19,11 +19,12 @@
  {{ model_description | default("", true) }}

  - **Developed by:** {{ developers | default("[More Information Needed]", true)}}
+ - **Funded by [optional]:** {{ funded_by | default("[More Information Needed]", true)}}
  - **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}}
  - **Model type:** {{ model_type | default("[More Information Needed]", true)}}
  - **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}}
  - **License:** {{ license | default("[More Information Needed]", true)}}
- - **Finetuned from model [optional]:** {{ finetuned_from | default("[More Information Needed]", true)}}
+ - **Finetuned from model [optional]:** {{ base_model | default("[More Information Needed]", true)}}

  ### Model Sources [optional]

@@ -77,7 +78,7 @@ Use the code below to get started with the model.

  ### Training Data

- <!-- This should link to a Data Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

  {{ training_data | default("[More Information Needed]", true)}}

@@ -108,7 +109,7 @@ Use the code below to get started with the model.

  #### Testing Data

- <!-- This should link to a Data Card if possible. -->
+ <!-- This should link to a Dataset Card if possible. -->

  {{ testing_data | default("[More Information Needed]", true)}}

huggingface_hub/utils/__init__.py
@@ -44,7 +44,7 @@ from ._fixes import SoftTemporaryDirectory, yaml_dump
  from ._git_credential import list_credential_helpers, set_git_credential, unset_git_credential
  from ._headers import build_hf_headers, get_token_to_send, LocalTokenNotFoundError
  from ._hf_folder import HfFolder
- from ._http import configure_http_backend, get_session, http_backoff
+ from ._http import configure_http_backend, get_session, http_backoff, reset_sessions
  from ._pagination import paginate
  from ._paths import filter_repo_objects, IGNORE_GIT_FOLDER_PATTERNS
  from ._experimental import experimental
huggingface_hub/utils/_cache_assets.py
@@ -15,7 +15,7 @@
  from pathlib import Path
  from typing import Union

- from ..constants import HUGGINGFACE_ASSETS_CACHE
+ from ..constants import HF_ASSETS_CACHE


  def cached_assets_path(
@@ -91,7 +91,7 @@ def cached_assets_path(
      assets_dir (`str`, `Path`, *optional*):
          Path to the folder where assets are cached. This must not be the same folder
          where Hub files are cached. Defaults to `HF_HOME / "assets"` if not provided.
-         Can also be set with `HUGGINGFACE_ASSETS_CACHE` environment variable.
+         Can also be set with `HF_ASSETS_CACHE` environment variable.

  Returns:
      Path to the cache folder (`Path`).
@@ -115,7 +115,7 @@ def cached_assets_path(
  """
  # Resolve assets_dir
  if assets_dir is None:
-     assets_dir = HUGGINGFACE_ASSETS_CACHE
+     assets_dir = HF_ASSETS_CACHE
  assets_dir = Path(assets_dir).expanduser().resolve()

  # Avoid names that could create path issues
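The `HUGGINGFACE_ASSETS_CACHE` environment variable is renamed to `HF_ASSETS_CACHE`. A minimal sketch of the renamed variable in use, assuming it is set before `huggingface_hub` is imported; the path and arguments are arbitrary examples:

```python
# Sketch: HF_ASSETS_CACHE now controls where cached_assets_path() stores
# assets. Set it before importing huggingface_hub, since constants are
# read at import time.
import os
os.environ["HF_ASSETS_CACHE"] = "/tmp/hf-assets"

from huggingface_hub import cached_assets_path

path = cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
print(path)  # e.g. /tmp/hf-assets/datasets/SQuAD/download
```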
huggingface_hub/utils/_cache_manager.py
@@ -21,7 +21,7 @@ from dataclasses import dataclass
  from pathlib import Path
  from typing import Dict, FrozenSet, List, Literal, Optional, Set, Union

- from ..constants import HUGGINGFACE_HUB_CACHE
+ from ..constants import HF_HUB_CACHE
  from . import logging


@@ -580,26 +580,25 @@ def scan_cache_dir(cache_dir: Optional[Union[str, Path]] = None) -> HFCacheInfo:
      Returns: a [`~HFCacheInfo`] object.
      """
      if cache_dir is None:
-         cache_dir = HUGGINGFACE_HUB_CACHE
+         cache_dir = HF_HUB_CACHE

      cache_dir = Path(cache_dir).expanduser().resolve()
      if not cache_dir.exists():
          raise CacheNotFound(
-             f"Cache directory not found: {cache_dir}. Please use `cache_dir`"
-             " argument or set `HUGGINGFACE_HUB_CACHE` environment variable.",
+             f"Cache directory not found: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.",
              cache_dir=cache_dir,
          )

      if cache_dir.is_file():
          raise ValueError(
-             f"Scan cache expects a directory but found a file: {cache_dir}. Please use"
-             " `cache_dir` argument or set `HUGGINGFACE_HUB_CACHE` environment"
-             " variable."
+             f"Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable."
          )

      repos: Set[CachedRepoInfo] = set()
      warnings: List[CorruptedCacheException] = []
      for repo_path in cache_dir.iterdir():
+         if repo_path.name == ".locks":  # skip './.locks/' folder
+             continue
          try:
              repos.add(_scan_cached_repo(repo_path))
          except CorruptedCacheException as e:
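Besides the `HF_HUB_CACHE` rename, `scan_cache_dir` now skips the internal `.locks/` folder instead of surfacing it as a corrupted repo. A short usage sketch against the public `HFCacheInfo` API:

```python
# Sketch: scan the local cache (defaults to HF_HUB_CACHE). The ".locks/"
# folder is now silently ignored rather than reported as corrupted.
from huggingface_hub import scan_cache_dir

info = scan_cache_dir()
print(f"{len(info.repos)} repos, {info.size_on_disk} bytes on disk")
for warning in info.warnings:  # remaining CorruptedCacheException instances
    print("corrupted:", warning)
```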
huggingface_hub/utils/_datetime.py
@@ -55,7 +55,9 @@ def parse_datetime(date_string: str) -> datetime:
      # timezone and then move it to the appropriate UTC timezone.
      # See https://en.wikipedia.org/wiki/ISO_8601#Coordinated_Universal_Time_(UTC)
      # Taken from https://stackoverflow.com/a/3168394.
-
+     if len(date_string) == 30:
+         # Means timezoned-timestamp with nanoseconds precision. We need to truncate the last 3 digits.
+         date_string = date_string[:-4] + "Z"
      dt = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
      dt += UTC_OFFSET  # By default, datetime is not timezoned -> move to UTC time
      return dt.astimezone(timezone.utc)  # Set explicit timezone
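Why `len(date_string) == 30`: a worked example (the timestamp value is invented) shows the truncation arithmetic behind the new branch:

```python
# A nanosecond-precision Hub timestamp is 30 characters long, while the
# microsecond form accepted by "%Y-%m-%dT%H:%M:%S.%fZ" is 27. Dropping the
# last 4 chars ("789Z") and re-appending "Z" removes the extra 3 digits.
nano = "2022-08-19T07:19:38.123456789Z"
assert len(nano) == 30
micro = nano[:-4] + "Z"
assert micro == "2022-08-19T07:19:38.123456Z"  # parseable with %f
```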
huggingface_hub/utils/_errors.py
@@ -2,6 +2,7 @@ from typing import Optional

  from requests import HTTPError, Response

+ from ..constants import INFERENCE_ENDPOINTS_ENDPOINT
  from ._fixes import JSONDecodeError


@@ -293,6 +294,15 @@ def hf_raise_for_status(response: Response, endpoint_name: Optional[str] = None)
          # This prevent from raising a misleading `RepositoryNotFoundError` (see below).
          pass

+     elif (
+         response.status_code == 401
+         and response.request.url is not None
+         and INFERENCE_ENDPOINTS_ENDPOINT in response.request.url
+     ):
+         # Not enough permission to list Inference Endpoints from this org. We don't raise a custom error for this.
+         # This prevent from raising a misleading `RepositoryNotFoundError` (see below).
+         pass
+
      elif error_code == "RepoNotFound" or response.status_code == 401:
          # 401 is misleading as it is returned for:
          # - private and gated repos if user is not authenticated
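A hedged sketch of the intended effect (the URL below is illustrative, not taken from the diff): a 401 from the Inference Endpoints API now falls through to the generic `HfHubHTTPError` instead of being mis-reported as a `RepositoryNotFoundError`.

```python
# Sketch: probing an Inference Endpoints URL without sufficient permission.
import requests
from huggingface_hub.utils import HfHubHTTPError, hf_raise_for_status

response = requests.get("https://api.endpoints.huggingface.cloud/v2/endpoint/some-org")
try:
    hf_raise_for_status(response)
except HfHubHTTPError as err:
    print("Inference Endpoints call failed:", err)  # generic error, not RepoNotFound
```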
huggingface_hub/utils/_hf_folder.py
@@ -46,7 +46,7 @@ class HfFolder:
      """
      Get token or None if not existent.

-     Note that a token can be also provided using the `HUGGING_FACE_HUB_TOKEN` environment variable.
+     Note that a token can be also provided using the `HF_TOKEN` environment variable.

      Token is saved in the huggingface home folder. You can configure it by setting
      the `HF_HOME` environment variable. Previous location was `~/.huggingface/token`.
@@ -63,7 +63,9 @@ class HfFolder:
          pass

      # 1. Is it set by environment variable ?
-     token: Optional[str] = os.environ.get("HUGGING_FACE_HUB_TOKEN")
+     token: Optional[str] = os.environ.get("HF_TOKEN")
+     if token is None:  # Ensure backward compatibility but doesn't have priority
+         token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
      if token is not None:
          token = token.replace("\r", "").replace("\n", "").strip()
          return token
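`HF_TOKEN` now takes priority, with the legacy `HUGGING_FACE_HUB_TOKEN` kept as a fallback. A sketch of the new lookup order using dummy token values (the environment is checked before any token file, so the asserts should hold):

```python
# Sketch: HF_TOKEN wins; the legacy variable is only a fallback.
import os
from huggingface_hub import HfFolder

os.environ["HUGGING_FACE_HUB_TOKEN"] = "hf_legacy"  # dummy values
os.environ["HF_TOKEN"] = "hf_new"
assert HfFolder.get_token() == "hf_new"

del os.environ["HF_TOKEN"]
assert HfFolder.get_token() == "hf_legacy"  # backward-compatible fallback
```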
huggingface_hub/utils/_http.py
@@ -113,7 +113,7 @@ def configure_http_backend(backend_factory: BACKEND_FACTORY_T = _default_backend
      """
      global _GLOBAL_BACKEND_FACTORY
      _GLOBAL_BACKEND_FACTORY = backend_factory
-     _get_session_from_cache.cache_clear()
+     reset_sessions()


  def get_session() -> requests.Session:
@@ -148,6 +148,15 @@ def get_session() -> requests.Session:
      return _get_session_from_cache(process_id=os.getpid(), thread_id=threading.get_ident())


+ def reset_sessions() -> None:
+     """Reset the cache of sessions.
+
+     Mostly used internally when sessions are reconfigured or an SSLError is raised.
+     See [`configure_http_backend`] for more details.
+     """
+     _get_session_from_cache.cache_clear()
+
+
  @lru_cache
  def _get_session_from_cache(process_id: int, thread_id: int) -> requests.Session:
      """
huggingface_hub/utils/_runtime.py
@@ -305,8 +305,8 @@ def dump_environment_info() -> Dict[str, Any]:

      # Environment variables
      info["ENDPOINT"] = constants.ENDPOINT
-     info["HUGGINGFACE_HUB_CACHE"] = constants.HUGGINGFACE_HUB_CACHE
-     info["HUGGINGFACE_ASSETS_CACHE"] = constants.HUGGINGFACE_ASSETS_CACHE
+     info["HF_HUB_CACHE"] = constants.HF_HUB_CACHE
+     info["HF_ASSETS_CACHE"] = constants.HF_ASSETS_CACHE
      info["HF_TOKEN_PATH"] = constants.HF_TOKEN_PATH
      info["HF_HUB_OFFLINE"] = constants.HF_HUB_OFFLINE
      info["HF_HUB_DISABLE_TELEMETRY"] = constants.HF_HUB_DISABLE_TELEMETRY
@@ -315,6 +315,8 @@ def dump_environment_info() -> Dict[str, Any]:
      info["HF_HUB_DISABLE_EXPERIMENTAL_WARNING"] = constants.HF_HUB_DISABLE_EXPERIMENTAL_WARNING
      info["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = constants.HF_HUB_DISABLE_IMPLICIT_TOKEN
      info["HF_HUB_ENABLE_HF_TRANSFER"] = constants.HF_HUB_ENABLE_HF_TRANSFER
+     info["HF_HUB_ETAG_TIMEOUT"] = constants.HF_HUB_ETAG_TIMEOUT
+     info["HF_HUB_DOWNLOAD_TIMEOUT"] = constants.HF_HUB_DOWNLOAD_TIMEOUT

      print("\nCopy-and-paste the text below in your GitHub issue.\n")
      print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]) + "\n")
huggingface_hub/utils/endpoint_helpers.py
@@ -16,27 +16,28 @@ with the aim for a user-friendly interface.
  import math
  import re
  from dataclasses import dataclass
- from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+ from typing import TYPE_CHECKING, List, Optional, Union
+
+ from ..repocard_data import ModelCardData


  if TYPE_CHECKING:
      from ..hf_api import ModelInfo


- def _filter_emissions(
-     models: Iterable["ModelInfo"],
-     minimum_threshold: Optional[float] = None,
-     maximum_threshold: Optional[float] = None,
- ) -> Iterable["ModelInfo"]:
-     """Filters a list of models for those that include an emission tag and limit them to between two thresholds
+ def _is_emission_within_treshold(model_info: "ModelInfo", minimum_threshold: float, maximum_threshold: float) -> bool:
+     """Checks if a model's emission is within a given threshold.

      Args:
-         models (Iterable of `ModelInfo`):
-             A list of models to filter.
-         minimum_threshold (`float`, *optional*):
+         model_info (`ModelInfo`):
+             A model info object containing the model's emission information.
+         minimum_threshold (`float`):
              A minimum carbon threshold to filter by, such as 1.
-         maximum_threshold (`float`, *optional*):
+         maximum_threshold (`float`):
              A maximum carbon threshold to filter by, such as 10.
+
+     Returns:
+         `bool`: Whether the model's emission is within the given threshold.
      """
      if minimum_threshold is None and maximum_threshold is None:
          raise ValueError("Both `minimum_threshold` and `maximum_threshold` cannot both be `None`")
@@ -45,26 +46,24 @@ def _filter_emissions(
      if maximum_threshold is None:
          maximum_threshold = math.inf

-     for model in models:
-         card_data = getattr(model, "cardData", None)
-         if card_data is None or not isinstance(card_data, dict):
-             continue
+     card_data = getattr(model_info, "card_data", None)
+     if card_data is None or not isinstance(card_data, (dict, ModelCardData)):
+         return False

-         # Get CO2 emission metadata
-         emission = card_data.get("co2_eq_emissions", None)
-         if isinstance(emission, dict):
-             emission = emission["emissions"]
-         if not emission:
-             continue
+     # Get CO2 emission metadata
+     emission = card_data.get("co2_eq_emissions", None)
+     if isinstance(emission, dict):
+         emission = emission["emissions"]
+     if not emission:
+         return False

-         # Filter out if value is missing or out of range
-         matched = re.search(r"\d+\.\d+|\d+", str(emission))
-         if matched is None:
-             continue
+     # Filter out if value is missing or out of range
+     matched = re.search(r"\d+\.\d+|\d+", str(emission))
+     if matched is None:
+         return False

-         emission_value = float(matched.group(0))
-         if emission_value >= minimum_threshold and emission_value <= maximum_threshold:
-             yield model
+     emission_value = float(matched.group(0))
+     return minimum_threshold <= emission_value <= maximum_threshold


  @dataclass
@@ -203,16 +202,11 @@ class ModelFilter:
      >>> # For the task
      >>> new_filter = ModelFilter(task="text-classification")

-     >>> # Retrieving tags using the `HfApi.get_model_tags` method
      >>> from huggingface_hub import HfApi

      >>> api = HfApi()
      # To list model tags

-     >>> api.get_model_tags()
-     # To list dataset tags
-
-     >>> api.get_dataset_tags()
      >>> new_filter = ModelFilter(tags="benchmark:raft")

      >>> # Related to the dataset
@@ -227,145 +221,3 @@ class ModelFilter:
      task: Optional[Union[str, List[str]]] = None
      trained_dataset: Optional[Union[str, List[str]]] = None
      tags: Optional[Union[str, List[str]]] = None
-
-
- class AttributeDictionary(dict):
-     """
-     `dict` subclass that also provides access to keys as attributes
-
-     If a key starts with a number, it will exist in the dictionary but not as an
-     attribute
-
-     Example:
-
-     ```python
-     >>> d = AttributeDictionary()
-     >>> d["test"] = "a"
-     >>> print(d.test)  # prints "a"
-     ```
-
-     """
-
-     def __getattr__(self, k):
-         if k in self:
-             return self[k]
-         else:
-             raise AttributeError(k)
-
-     def __setattr__(self, k, v):
-         (self.__setitem__, super().__setattr__)[k[0] == "_"](k, v)
-
-     def __delattr__(self, k):
-         if k in self:
-             del self[k]
-         else:
-             raise AttributeError(k)
-
-     def __dir__(self):
-         keys = sorted(self.keys())
-         keys = [key for key in keys if key.replace("_", "").isalpha()]
-         return super().__dir__() + keys
-
-     def __repr__(self):
-         repr_str = "Available Attributes or Keys:\n"
-         for key in sorted(self.keys()):
-             repr_str += f" * {key}"
-             if not key.replace("_", "").isalpha():
-                 repr_str += " (Key only)"
-             repr_str += "\n"
-         return repr_str
-
-
- class GeneralTags(AttributeDictionary):
-     """
-     A namespace object holding all tags, filtered by `keys` If a tag starts with
-     a number, it will only exist in the dictionary
-
-     Example:
-     ```python
-     >>> a.b["1a"]  # will work
-     >>> a["b"]["1a"]  # will work
-     >>> # a.b.1a  # will not work
-     ```
-
-     Args:
-         tag_dictionary (`dict`):
-             A dictionary of tags returned from the /api/***-tags-by-type api endpoint
-         keys (`list`):
-             A list of keys to unpack the `tag_dictionary` with, such as
-             `["library","language"]`
-     """
-
-     def __init__(self, tag_dictionary: dict, keys: Optional[list] = None):
-         self._tag_dictionary = tag_dictionary
-         if keys is None:
-             keys = list(self._tag_dictionary.keys())
-         for key in keys:
-             self._unpack_and_assign_dictionary(key)
-
-     def _unpack_and_assign_dictionary(self, key: str):
-         "Assign nested attributes to `self.key` containing information as an `AttributeDictionary`"
-         ref = AttributeDictionary()
-         setattr(self, key, ref)
-         for item in self._tag_dictionary.get(key, []):
-             label = item["label"].replace(" ", "").replace("-", "_").replace(".", "_")
-             ref[label] = item["id"]
-         self[key] = ref
-
-
- class ModelTags(GeneralTags):
-     """
-     A namespace object holding all available model tags If a tag starts with a
-     number, it will only exist in the dictionary
-
-     Example:
-
-     ```python
-     >>> a.dataset["1_5BArabicCorpus"]  # will work
-     >>> a["dataset"]["1_5BArabicCorpus"]  # will work
-     >>> # o.dataset.1_5BArabicCorpus  # will not work
-     ```
-
-     Args:
-         model_tag_dictionary (`dict`):
-             A dictionary of valid model tags, returned from the /api/models-tags-by-type api endpoint
-     """
-
-     def __init__(self, model_tag_dictionary: dict):
-         keys = ["library", "language", "license", "dataset", "pipeline_tag"]
-         super().__init__(model_tag_dictionary, keys)
-
-
- class DatasetTags(GeneralTags):
-     """
-     A namespace object holding all available dataset tags If a tag starts with a
-     number, it will only exist in the dictionary
-
-     Example
-
-     ```python
-     >>> a.size_categories["100K<n<1M"]  # will work
-     >>> a["size_categories"]["100K<n<1M"]  # will work
-     >>> # o.size_categories.100K<n<1M  # will not work
-     ```
-
-     Args:
-         dataset_tag_dictionary (`dict`):
-             A dictionary of valid dataset tags, returned from the /api/datasets-tags-by-type api endpoint
-     """
-
-     def __init__(self, dataset_tag_dictionary: dict):
-         keys = [
-             "language",
-             "multilinguality",
-             "language_creators",
-             "task_categories",
-             "size_categories",
-             "benchmark",
-             "task_ids",
-             "license",
-         ]
-         super().__init__(dataset_tag_dictionary, keys)
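The refactor turns the `_filter_emissions` generator into a per-model predicate, `_is_emission_within_treshold` (the typo is in the upstream name). Since that helper is private, here is a hedged standalone re-implementation that mirrors only the logic shown above, not the library's API:

```python
# Sketch: client-side version of the emission-threshold check.
import math
import re

def emission_within(card_data: dict, lo: float = -math.inf, hi: float = math.inf) -> bool:
    """Return True if the card's co2_eq_emissions value lies in [lo, hi]."""
    emission = card_data.get("co2_eq_emissions")
    if isinstance(emission, dict):  # nested form: {"emissions": ...}
        emission = emission.get("emissions")
    matched = re.search(r"\d+\.\d+|\d+", str(emission)) if emission else None
    return matched is not None and lo <= float(matched.group(0)) <= hi

print(emission_within({"co2_eq_emissions": {"emissions": "7.5"}}, 1, 10))  # True
```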