huggingface-hub 0.18.0rc0__py3-none-any.whl → 0.19.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of huggingface-hub might be problematic.
- huggingface_hub/__init__.py +31 -5
- huggingface_hub/_commit_api.py +7 -11
- huggingface_hub/_inference_endpoints.py +348 -0
- huggingface_hub/_login.py +9 -7
- huggingface_hub/_multi_commits.py +1 -1
- huggingface_hub/_snapshot_download.py +6 -7
- huggingface_hub/_space_api.py +7 -4
- huggingface_hub/_tensorboard_logger.py +1 -0
- huggingface_hub/_webhooks_payload.py +7 -7
- huggingface_hub/commands/lfs.py +3 -6
- huggingface_hub/commands/user.py +1 -4
- huggingface_hub/constants.py +27 -0
- huggingface_hub/file_download.py +142 -134
- huggingface_hub/hf_api.py +1058 -503
- huggingface_hub/hf_file_system.py +57 -12
- huggingface_hub/hub_mixin.py +3 -5
- huggingface_hub/inference/_client.py +43 -8
- huggingface_hub/inference/_common.py +8 -16
- huggingface_hub/inference/_generated/_async_client.py +41 -8
- huggingface_hub/inference/_text_generation.py +43 -0
- huggingface_hub/inference_api.py +1 -1
- huggingface_hub/lfs.py +32 -14
- huggingface_hub/repocard_data.py +7 -0
- huggingface_hub/repository.py +19 -3
- huggingface_hub/templates/datasetcard_template.md +83 -43
- huggingface_hub/templates/modelcard_template.md +4 -3
- huggingface_hub/utils/__init__.py +1 -1
- huggingface_hub/utils/_cache_assets.py +3 -3
- huggingface_hub/utils/_cache_manager.py +6 -7
- huggingface_hub/utils/_datetime.py +3 -1
- huggingface_hub/utils/_errors.py +10 -0
- huggingface_hub/utils/_hf_folder.py +4 -2
- huggingface_hub/utils/_http.py +10 -1
- huggingface_hub/utils/_runtime.py +4 -2
- huggingface_hub/utils/endpoint_helpers.py +27 -175
- huggingface_hub/utils/insecure_hashlib.py +34 -0
- huggingface_hub/utils/logging.py +4 -6
- huggingface_hub/utils/sha.py +2 -1
- {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/METADATA +16 -15
- huggingface_hub-0.19.0.dist-info/RECORD +74 -0
- {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/WHEEL +1 -1
- huggingface_hub-0.18.0rc0.dist-info/RECORD +0 -72
- {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/entry_points.txt +0 -0
- {huggingface_hub-0.18.0rc0.dist-info → huggingface_hub-0.19.0.dist-info}/top_level.txt +0 -0
huggingface_hub/templates/datasetcard_template.md
CHANGED
@@ -1,103 +1,143 @@
 ---
-# For reference on
+# For reference on dataset card metadata, see the spec: https://github.com/huggingface/hub-docs/blob/main/datasetcard.md?plain=1
 # Doc / guide: https://huggingface.co/docs/hub/datasets-cards
 {{ card_data }}
 ---

 # Dataset Card for {{ pretty_name | default("Dataset Name", true) }}

-
+<!-- Provide a quick summary of the dataset. -->

-
-- **Repository:** {{ repo_url | default("", true)}}
-- **Paper:** {{ paper_url | default("", true)}}
-- **Leaderboard:** {{ leaderboard_url | default("", true)}}
-- **Point of Contact:** {{ point_of_contact | default("", true)}}
+{{ dataset_summary | default("", true) }}

-
+## Dataset Details

-
+### Dataset Description

-
+<!-- Provide a longer summary of what this dataset is. -->

-{{
+{{ dataset_description | default("", true) }}

-
+- **Curated by:** {{ curators | default("[More Information Needed]", true)}}
+- **Funded by [optional]:** {{ funded_by | default("[More Information Needed]", true)}}
+- **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}}
+- **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}}
+- **License:** {{ license | default("[More Information Needed]", true)}}

-
+### Dataset Sources [optional]

-
+<!-- Provide the basic links for the dataset. -->
+
+- **Repository:** {{ repo | default("[More Information Needed]", true)}}
+- **Paper [optional]:** {{ paper | default("[More Information Needed]", true)}}
+- **Demo [optional]:** {{ demo | default("[More Information Needed]", true)}}
+
+## Uses

-
+<!-- Address questions around how the dataset is intended to be used. -->

-
+### Direct Use

-
+<!-- This section describes suitable use cases for the dataset. -->

-{{
+{{ direct_use | default("[More Information Needed]", true)}}

-###
+### Out-of-Scope Use

-
+<!-- This section addresses misuse, malicious use, and uses that the dataset will not work well for. -->
+
+{{ out_of_scope_use | default("[More Information Needed]", true)}}
+
+## Dataset Structure
+
+<!-- This section provides a description of the dataset fields, and additional information about the dataset structure such as criteria used to create the splits, relationships between data points, etc. -->
+
+{{ dataset_structure | default("[More Information Needed]", true)}}

 ## Dataset Creation

 ### Curation Rationale

+<!-- Motivation for the creation of this dataset. -->
+
 {{ curation_rationale_section | default("[More Information Needed]", true)}}

 ### Source Data

-
+<!-- This section describes the source data (e.g. news text and headlines, social media posts, translated sentences, ...). -->
+
+#### Data Collection and Processing
+
+<!-- This section describes the data collection and processing process such as data selection criteria, filtering and normalization methods, tools and libraries used, etc. -->

-{{
+{{ data_collection_and_processing_section | default("[More Information Needed]", true)}}

-#### Who are the source
+#### Who are the source data producers?

-
+<!-- This section describes the people or systems who originally created the data. It should also include self-reported demographic or identity information for the source data creators if this information is available. -->

-
+{{ source_data_producers_section | default("[More Information Needed]", true)}}
+
+### Annotations [optional]
+
+<!-- If the dataset contains annotations which are not part of the initial data collection, use this section to describe them. -->

 #### Annotation process

+<!-- This section describes the annotation process such as annotation tools used in the process, the amount of data annotated, annotation guidelines provided to the annotators, interannotator statistics, annotation validation, etc. -->
+
 {{ annotation_process_section | default("[More Information Needed]", true)}}

 #### Who are the annotators?

+<!-- This section describes the people or systems who created the annotations. -->
+
 {{ who_are_annotators_section | default("[More Information Needed]", true)}}

-
+#### Personal and Sensitive Information
+
+<!-- State whether the dataset contains data that might be considered personal, sensitive, or private (e.g., data that reveals addresses, uniquely identifiable names or aliases, racial or ethnic origins, sexual orientations, religious beliefs, political opinions, financial or health data, etc.). If efforts were made to anonymize the data, describe the anonymization process. -->
+
+{{ personal_and_sensitive_information | default("[More Information Needed]", true)}}
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+{{ bias_risks_limitations | default("[More Information Needed]", true)}}
+
+### Recommendations

-
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

-
+{{ bias_recommendations | default("Users should be made aware of the risks, biases and limitations of the dataset. More information needed for further recommendations.", true)}}

-
+## Citation [optional]

-
+<!-- If there is a paper or blog post introducing the dataset, the APA and Bibtex information for that should go in this section. -->

-
+**BibTeX:**

-{{
+{{ citation_bibtex | default("[More Information Needed]", true)}}

-
+**APA:**

-{{
+{{ citation_apa | default("[More Information Needed]", true)}}

-##
+## Glossary [optional]

-
+<!-- If relevant, include terms and calculations in this section that can help readers understand the dataset or dataset card. -->

-{{
+{{ glossary | default("[More Information Needed]", true)}}

-
+## More Information [optional]

-{{
+{{ more_information | default("[More Information Needed]", true)}}

-
+## Dataset Card Authors [optional]

-{{
+{{ dataset_card_authors | default("[More Information Needed]", true)}}

-
+## Dataset Card Contact

-{{
+{{ dataset_card_contact | default("[More Information Needed]", true)}}
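The new template exposes additional Jinja placeholders (`dataset_summary`, `curators`, `funded_by`, `shared_by`, `dataset_structure`, ...). A minimal sketch of rendering them with `DatasetCard.from_template`; the field values below are invented for illustration:

```python
from huggingface_hub import DatasetCard, DatasetCardData

# Fill the updated template; extra keyword arguments map to the placeholders above.
card = DatasetCard.from_template(
    card_data=DatasetCardData(license="mit", language="en"),
    pretty_name="Demo Dataset",
    dataset_summary="A tiny dataset used to illustrate the new card template.",
    curators="Jane Doe",  # invented value
)
print(card.content[:200])  # rendered Markdown, starting with the YAML header
```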
huggingface_hub/templates/modelcard_template.md
CHANGED
@@ -19,11 +19,12 @@
 {{ model_description | default("", true) }}

 - **Developed by:** {{ developers | default("[More Information Needed]", true)}}
+- **Funded by [optional]:** {{ funded_by | default("[More Information Needed]", true)}}
 - **Shared by [optional]:** {{ shared_by | default("[More Information Needed]", true)}}
 - **Model type:** {{ model_type | default("[More Information Needed]", true)}}
 - **Language(s) (NLP):** {{ language | default("[More Information Needed]", true)}}
 - **License:** {{ license | default("[More Information Needed]", true)}}
-- **Finetuned from model [optional]:** {{
+- **Finetuned from model [optional]:** {{ base_model | default("[More Information Needed]", true)}}

 ### Model Sources [optional]

@@ -77,7 +78,7 @@ Use the code below to get started with the model.

 ### Training Data

-<!-- This should link to a
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

 {{ training_data | default("[More Information Needed]", true)}}

@@ -108,7 +109,7 @@ Use the code below to get started with the model.

 #### Testing Data

-<!-- This should link to a
+<!-- This should link to a Dataset Card if possible. -->

 {{ testing_data | default("[More Information Needed]", true)}}

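Similarly for model cards, the template gains a `funded_by` placeholder and a `base_model` value for the "Finetuned from model" entry. A minimal sketch with invented values:

```python
from huggingface_hub import ModelCard, ModelCardData

card = ModelCard.from_template(
    card_data=ModelCardData(license="apache-2.0", language="en"),
    model_id="my-org/my-model",      # hypothetical repo id
    funded_by="Example Grant",       # invented value
    base_model="bert-base-uncased",
)
print(card.content[:200])
```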
huggingface_hub/utils/__init__.py
CHANGED
@@ -44,7 +44,7 @@ from ._fixes import SoftTemporaryDirectory, yaml_dump
 from ._git_credential import list_credential_helpers, set_git_credential, unset_git_credential
 from ._headers import build_hf_headers, get_token_to_send, LocalTokenNotFoundError
 from ._hf_folder import HfFolder
-from ._http import configure_http_backend, get_session, http_backoff
+from ._http import configure_http_backend, get_session, http_backoff, reset_sessions
 from ._pagination import paginate
 from ._paths import filter_repo_objects, IGNORE_GIT_FOLDER_PATTERNS
 from ._experimental import experimental
huggingface_hub/utils/_cache_assets.py
CHANGED
@@ -15,7 +15,7 @@
 from pathlib import Path
 from typing import Union

-from ..constants import
+from ..constants import HF_ASSETS_CACHE


 def cached_assets_path(
@@ -91,7 +91,7 @@ def cached_assets_path(
         assets_dir (`str`, `Path`, *optional*):
             Path to the folder where assets are cached. This must not be the same folder
             where Hub files are cached. Defaults to `HF_HOME / "assets"` if not provided.
-            Can also be set with `
+            Can also be set with `HF_ASSETS_CACHE` environment variable.

     Returns:
         Path to the cache folder (`Path`).
@@ -115,7 +115,7 @@ def cached_assets_path(
     """
     # Resolve assets_dir
     if assets_dir is None:
-        assets_dir =
+        assets_dir = HF_ASSETS_CACHE
     assets_dir = Path(assets_dir).expanduser().resolve()

     # Avoid names that could create path issues
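The assets cache now resolves through the `HF_ASSETS_CACHE` constant (overridable with the environment variable of the same name). A minimal usage sketch; the library and namespace names are placeholders:

```python
from huggingface_hub import cached_assets_path

# With no `assets_dir` argument, the folder defaults to HF_ASSETS_CACHE.
path = cached_assets_path(library_name="datasets", namespace="SQuAD", subfolder="download")
print(path)  # e.g. ~/.cache/huggingface/assets/datasets/SQuAD/download
```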
huggingface_hub/utils/_cache_manager.py
CHANGED
@@ -21,7 +21,7 @@ from dataclasses import dataclass
 from pathlib import Path
 from typing import Dict, FrozenSet, List, Literal, Optional, Set, Union

-from ..constants import
+from ..constants import HF_HUB_CACHE
 from . import logging


@@ -580,26 +580,25 @@ def scan_cache_dir(cache_dir: Optional[Union[str, Path]] = None) -> HFCacheInfo:
     Returns: a [`~HFCacheInfo`] object.
     """
     if cache_dir is None:
-        cache_dir =
+        cache_dir = HF_HUB_CACHE

     cache_dir = Path(cache_dir).expanduser().resolve()
     if not cache_dir.exists():
         raise CacheNotFound(
-            f"Cache directory not found: {cache_dir}. Please use `cache_dir`"
-            " argument or set `HUGGINGFACE_HUB_CACHE` environment variable.",
+            f"Cache directory not found: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.",
             cache_dir=cache_dir,
         )

     if cache_dir.is_file():
         raise ValueError(
-            f"Scan cache expects a directory but found a file: {cache_dir}. Please use"
-            " `cache_dir` argument or set `HUGGINGFACE_HUB_CACHE` environment"
-            " variable."
+            f"Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable."
         )

     repos: Set[CachedRepoInfo] = set()
     warnings: List[CorruptedCacheException] = []
     for repo_path in cache_dir.iterdir():
+        if repo_path.name == ".locks":  # skip './.locks/' folder
+            continue
         try:
             repos.add(_scan_cached_repo(repo_path))
         except CorruptedCacheException as e:
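`scan_cache_dir` now defaults to the `HF_HUB_CACHE` constant and skips the internal `./.locks/` folder instead of reporting it as a corrupted repo. A minimal sketch:

```python
from huggingface_hub import scan_cache_dir

info = scan_cache_dir()  # raises CacheNotFound if the cache folder does not exist yet
print(f"{len(info.repos)} cached repos, {info.size_on_disk_str} on disk")
```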
huggingface_hub/utils/_datetime.py
CHANGED
@@ -55,7 +55,9 @@ def parse_datetime(date_string: str) -> datetime:
     # timezone and then move it to the appropriate UTC timezone.
     # See https://en.wikipedia.org/wiki/ISO_8601#Coordinated_Universal_Time_(UTC)
     # Taken from https://stackoverflow.com/a/3168394.
-
+    if len(date_string) == 30:
+        # Means timezoned-timestamp with nanoseconds precision. We need to truncate the last 3 digits.
+        date_string = date_string[:-4] + "Z"
     dt = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
     dt += UTC_OFFSET  # By default, datetime is not timezoned -> move to UTC time
     return dt.astimezone(timezone.utc)  # Set explicit timezone
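The new branch truncates nanosecond-precision timestamps (30 characters) to microseconds, since `%f` only accepts six digits. A standalone sketch of the same logic:

```python
from datetime import datetime

date_string = "2023-11-08T12:34:56.123456789Z"  # 30 chars: nanosecond precision
date_string = date_string[:-4] + "Z"            # -> "2023-11-08T12:34:56.123456Z"
dt = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
print(dt.isoformat())
```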
huggingface_hub/utils/_errors.py
CHANGED
@@ -2,6 +2,7 @@ from typing import Optional

 from requests import HTTPError, Response

+from ..constants import INFERENCE_ENDPOINTS_ENDPOINT
 from ._fixes import JSONDecodeError


@@ -293,6 +294,15 @@ def hf_raise_for_status(response: Response, endpoint_name: Optional[str] = None)
         # This prevent from raising a misleading `RepositoryNotFoundError` (see below).
         pass

+    elif (
+        response.status_code == 401
+        and response.request.url is not None
+        and INFERENCE_ENDPOINTS_ENDPOINT in response.request.url
+    ):
+        # Not enough permission to list Inference Endpoints from this org. We don't raise a custom error for this.
+        # This prevent from raising a misleading `RepositoryNotFoundError` (see below).
+        pass
+
     elif error_code == "RepoNotFound" or response.status_code == 401:
         # 401 is misleading as it is returned for:
         # - private and gated repos if user is not authenticated
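`hf_raise_for_status` now lets 401 responses from the Inference Endpoints API fall through to the generic error handler instead of mapping them to `RepositoryNotFoundError`. For reference, a minimal sketch of how the helper is applied to a raw `requests` response:

```python
import requests
from huggingface_hub.utils import hf_raise_for_status

response = requests.get("https://huggingface.co/api/models/bert-base-uncased")
hf_raise_for_status(response)  # raises an HfHubHTTPError subclass on error status codes
print(response.json()["modelId"])
```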
huggingface_hub/utils/_hf_folder.py
CHANGED
@@ -46,7 +46,7 @@ class HfFolder:
         """
         Get token or None if not existent.

-        Note that a token can be also provided using the `
+        Note that a token can be also provided using the `HF_TOKEN` environment variable.

         Token is saved in the huggingface home folder. You can configure it by setting
         the `HF_HOME` environment variable. Previous location was `~/.huggingface/token`.
@@ -63,7 +63,9 @@ class HfFolder:
            pass

         # 1. Is it set by environment variable ?
-        token: Optional[str] = os.environ.get("
+        token: Optional[str] = os.environ.get("HF_TOKEN")
+        if token is None:  # Ensure backward compatibility but doesn't have priority
+            token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
         if token is not None:
             token = token.replace("\r", "").replace("\n", "").strip()
             return token
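`HfFolder.get_token` now reads `HF_TOKEN` first and only falls back to the legacy `HUGGING_FACE_HUB_TOKEN`. A minimal sketch (the token values are fake):

```python
import os
from huggingface_hub import HfFolder

os.environ["HUGGING_FACE_HUB_TOKEN"] = "hf_legacy_fake_token"
os.environ["HF_TOKEN"] = "hf_new_fake_token"
print(HfFolder.get_token())  # -> "hf_new_fake_token"
```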
huggingface_hub/utils/_http.py
CHANGED
@@ -113,7 +113,7 @@ def configure_http_backend(backend_factory: BACKEND_FACTORY_T = _default_backend
     """
     global _GLOBAL_BACKEND_FACTORY
     _GLOBAL_BACKEND_FACTORY = backend_factory
-
+    reset_sessions()


 def get_session() -> requests.Session:
@@ -148,6 +148,15 @@ def get_session() -> requests.Session:
     return _get_session_from_cache(process_id=os.getpid(), thread_id=threading.get_ident())


+def reset_sessions() -> None:
+    """Reset the cache of sessions.
+
+    Mostly used internally when sessions are reconfigured or an SSLError is raised.
+    See [`configure_http_backend`] for more details.
+    """
+    _get_session_from_cache.cache_clear()
+
+
 @lru_cache
 def _get_session_from_cache(process_id: int, thread_id: int) -> requests.Session:
     """
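`configure_http_backend` now clears the per-thread `requests.Session` cache via the new `reset_sessions` helper, so a reconfigured backend takes effect immediately. A minimal sketch of swapping the backend; the user-agent value is invented:

```python
import requests
from huggingface_hub import configure_http_backend, get_session

def backend_factory() -> requests.Session:
    session = requests.Session()
    session.headers["user-agent"] = "my-app/0.1"  # illustrative customization
    return session

configure_http_backend(backend_factory=backend_factory)  # drops previously cached sessions
session = get_session()
print(session.headers["user-agent"])
```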
huggingface_hub/utils/_runtime.py
CHANGED
@@ -305,8 +305,8 @@ def dump_environment_info() -> Dict[str, Any]:

     # Environment variables
     info["ENDPOINT"] = constants.ENDPOINT
-    info["
-    info["
+    info["HF_HUB_CACHE"] = constants.HF_HUB_CACHE
+    info["HF_ASSETS_CACHE"] = constants.HF_ASSETS_CACHE
     info["HF_TOKEN_PATH"] = constants.HF_TOKEN_PATH
     info["HF_HUB_OFFLINE"] = constants.HF_HUB_OFFLINE
     info["HF_HUB_DISABLE_TELEMETRY"] = constants.HF_HUB_DISABLE_TELEMETRY
@@ -315,6 +315,8 @@ def dump_environment_info() -> Dict[str, Any]:
     info["HF_HUB_DISABLE_EXPERIMENTAL_WARNING"] = constants.HF_HUB_DISABLE_EXPERIMENTAL_WARNING
     info["HF_HUB_DISABLE_IMPLICIT_TOKEN"] = constants.HF_HUB_DISABLE_IMPLICIT_TOKEN
     info["HF_HUB_ENABLE_HF_TRANSFER"] = constants.HF_HUB_ENABLE_HF_TRANSFER
+    info["HF_HUB_ETAG_TIMEOUT"] = constants.HF_HUB_ETAG_TIMEOUT
+    info["HF_HUB_DOWNLOAD_TIMEOUT"] = constants.HF_HUB_DOWNLOAD_TIMEOUT

     print("\nCopy-and-paste the text below in your GitHub issue.\n")
     print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]) + "\n")
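`dump_environment_info` (the helper behind `huggingface-cli env`) now also reports the cache locations and the two download timeout variables. A minimal sketch, assuming the helper is re-exported from `huggingface_hub.utils` as in current releases:

```python
from huggingface_hub.utils import dump_environment_info

info = dump_environment_info()  # prints the report and returns it as a dict
print(info["HF_HUB_ETAG_TIMEOUT"], info["HF_HUB_DOWNLOAD_TIMEOUT"])
```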
huggingface_hub/utils/endpoint_helpers.py
CHANGED
@@ -16,27 +16,28 @@ with the aim for a user-friendly interface.
 import math
 import re
 from dataclasses import dataclass
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, List, Optional, Union
+
+from ..repocard_data import ModelCardData


 if TYPE_CHECKING:
     from ..hf_api import ModelInfo


-def
-
-    minimum_threshold: Optional[float] = None,
-    maximum_threshold: Optional[float] = None,
-) -> Iterable["ModelInfo"]:
-    """Filters a list of models for those that include an emission tag and limit them to between two thresholds
+def _is_emission_within_treshold(model_info: "ModelInfo", minimum_threshold: float, maximum_threshold: float) -> bool:
+    """Checks if a model's emission is within a given threshold.

     Args:
-
-        A
-        minimum_threshold (`float
+        model_info (`ModelInfo`):
+            A model info object containing the model's emission information.
+        minimum_threshold (`float`):
             A minimum carbon threshold to filter by, such as 1.
-        maximum_threshold (`float
+        maximum_threshold (`float`):
             A maximum carbon threshold to filter by, such as 10.
+
+    Returns:
+        `bool`: Whether the model's emission is within the given threshold.
     """
     if minimum_threshold is None and maximum_threshold is None:
         raise ValueError("Both `minimum_threshold` and `maximum_threshold` cannot both be `None`")
@@ -45,26 +46,24 @@ def _filter_emissions(
     if maximum_threshold is None:
         maximum_threshold = math.inf

-
-
-
-            continue
+    card_data = getattr(model_info, "card_data", None)
+    if card_data is None or not isinstance(card_data, (dict, ModelCardData)):
+        return False

-
-
-
-
-
-
+    # Get CO2 emission metadata
+    emission = card_data.get("co2_eq_emissions", None)
+    if isinstance(emission, dict):
+        emission = emission["emissions"]
+    if not emission:
+        return False

-
-
-
-
+    # Filter out if value is missing or out of range
+    matched = re.search(r"\d+\.\d+|\d+", str(emission))
+    if matched is None:
+        return False

-
-
-            yield model
+    emission_value = float(matched.group(0))
+    return minimum_threshold <= emission_value <= maximum_threshold


 @dataclass
@@ -203,16 +202,11 @@ class ModelFilter:
     >>> # For the task
     >>> new_filter = ModelFilter(task="text-classification")

-    >>> # Retrieving tags using the `HfApi.get_model_tags` method
     >>> from huggingface_hub import HfApi

     >>> api = HfApi()
     # To list model tags

-    >>> api.get_model_tags()
-    # To list dataset tags
-
-    >>> api.get_dataset_tags()
     >>> new_filter = ModelFilter(tags="benchmark:raft")

     >>> # Related to the dataset
@@ -227,145 +221,3 @@ class ModelFilter:
     task: Optional[Union[str, List[str]]] = None
     trained_dataset: Optional[Union[str, List[str]]] = None
     tags: Optional[Union[str, List[str]]] = None
-
-
-class AttributeDictionary(dict):
-    """
-    `dict` subclass that also provides access to keys as attributes
-
-    If a key starts with a number, it will exist in the dictionary but not as an
-    attribute
-
-    Example:
-
-    ```python
-    >>> d = AttributeDictionary()
-    >>> d["test"] = "a"
-    >>> print(d.test)  # prints "a"
-    ```
-
-    """
-
-    def __getattr__(self, k):
-        if k in self:
-            return self[k]
-        else:
-            raise AttributeError(k)
-
-    def __setattr__(self, k, v):
-        (self.__setitem__, super().__setattr__)[k[0] == "_"](k, v)
-
-    def __delattr__(self, k):
-        if k in self:
-            del self[k]
-        else:
-            raise AttributeError(k)
-
-    def __dir__(self):
-        keys = sorted(self.keys())
-        keys = [key for key in keys if key.replace("_", "").isalpha()]
-        return super().__dir__() + keys
-
-    def __repr__(self):
-        repr_str = "Available Attributes or Keys:\n"
-        for key in sorted(self.keys()):
-            repr_str += f" * {key}"
-            if not key.replace("_", "").isalpha():
-                repr_str += " (Key only)"
-            repr_str += "\n"
-        return repr_str
-
-
-class GeneralTags(AttributeDictionary):
-    """
-    A namespace object holding all tags, filtered by `keys` If a tag starts with
-    a number, it will only exist in the dictionary
-
-    Example:
-    ```python
-    >>> a.b["1a"]  # will work
-    >>> a["b"]["1a"]  # will work
-    >>> # a.b.1a  # will not work
-    ```
-
-    Args:
-        tag_dictionary (`dict`):
-            A dictionary of tags returned from the /api/***-tags-by-type api
-            endpoint
-        keys (`list`):
-            A list of keys to unpack the `tag_dictionary` with, such as
-            `["library","language"]`
-    """
-
-    def __init__(self, tag_dictionary: dict, keys: Optional[list] = None):
-        self._tag_dictionary = tag_dictionary
-        if keys is None:
-            keys = list(self._tag_dictionary.keys())
-        for key in keys:
-            self._unpack_and_assign_dictionary(key)
-
-    def _unpack_and_assign_dictionary(self, key: str):
-        "Assign nested attributes to `self.key` containing information as an `AttributeDictionary`"
-        ref = AttributeDictionary()
-        setattr(self, key, ref)
-        for item in self._tag_dictionary.get(key, []):
-            label = item["label"].replace(" ", "").replace("-", "_").replace(".", "_")
-            ref[label] = item["id"]
-        self[key] = ref
-
-
-class ModelTags(GeneralTags):
-    """
-    A namespace object holding all available model tags If a tag starts with a
-    number, it will only exist in the dictionary
-
-    Example:
-
-    ```python
-    >>> a.dataset["1_5BArabicCorpus"]  # will work
-    >>> a["dataset"]["1_5BArabicCorpus"]  # will work
-    >>> # o.dataset.1_5BArabicCorpus  # will not work
-    ```
-
-    Args:
-        model_tag_dictionary (`dict`):
-            A dictionary of valid model tags, returned from the
-            /api/models-tags-by-type api endpoint
-    """
-
-    def __init__(self, model_tag_dictionary: dict):
-        keys = ["library", "language", "license", "dataset", "pipeline_tag"]
-        super().__init__(model_tag_dictionary, keys)
-
-
-class DatasetTags(GeneralTags):
-    """
-    A namespace object holding all available dataset tags If a tag starts with a
-    number, it will only exist in the dictionary
-
-    Example
-
-    ```python
-    >>> a.size_categories["100K<n<1M"]  # will work
-    >>> a["size_categories"]["100K<n<1M"]  # will work
-    >>> # o.size_categories.100K<n<1M  # will not work
-    ```
-
-    Args:
-        dataset_tag_dictionary (`dict`):
-            A dictionary of valid dataset tags, returned from the
-            /api/datasets-tags-by-type api endpoint
-    """
-
-    def __init__(self, dataset_tag_dictionary: dict):
-        keys = [
-            "language",
-            "multilinguality",
-            "language_creators",
-            "task_categories",
-            "size_categories",
-            "benchmark",
-            "task_ids",
-            "license",
-        ]
-        super().__init__(dataset_tag_dictionary, keys)