durable_huggingface_hub 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.editorconfig +29 -0
- data/.rubocop.yml +108 -0
- data/CHANGELOG.md +127 -0
- data/README.md +547 -0
- data/Rakefile +106 -0
- data/devenv.lock +171 -0
- data/devenv.nix +15 -0
- data/devenv.yaml +8 -0
- data/huggingface_hub.gemspec +63 -0
- data/lib/durable_huggingface_hub/authentication.rb +245 -0
- data/lib/durable_huggingface_hub/cache.rb +508 -0
- data/lib/durable_huggingface_hub/configuration.rb +191 -0
- data/lib/durable_huggingface_hub/constants.rb +145 -0
- data/lib/durable_huggingface_hub/errors.rb +412 -0
- data/lib/durable_huggingface_hub/file_download.rb +831 -0
- data/lib/durable_huggingface_hub/hf_api.rb +1278 -0
- data/lib/durable_huggingface_hub/repo_card.rb +430 -0
- data/lib/durable_huggingface_hub/types/cache_info.rb +298 -0
- data/lib/durable_huggingface_hub/types/commit_info.rb +149 -0
- data/lib/durable_huggingface_hub/types/dataset_info.rb +158 -0
- data/lib/durable_huggingface_hub/types/model_info.rb +154 -0
- data/lib/durable_huggingface_hub/types/space_info.rb +158 -0
- data/lib/durable_huggingface_hub/types/user.rb +179 -0
- data/lib/durable_huggingface_hub/types.rb +205 -0
- data/lib/durable_huggingface_hub/utils/auth.rb +174 -0
- data/lib/durable_huggingface_hub/utils/headers.rb +220 -0
- data/lib/durable_huggingface_hub/utils/http.rb +329 -0
- data/lib/durable_huggingface_hub/utils/paths.rb +230 -0
- data/lib/durable_huggingface_hub/utils/progress.rb +217 -0
- data/lib/durable_huggingface_hub/utils/retry.rb +165 -0
- data/lib/durable_huggingface_hub/utils/validators.rb +236 -0
- data/lib/durable_huggingface_hub/version.rb +8 -0
- data/lib/huggingface_hub.rb +205 -0
- metadata +334 -0
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "date"
require "pathname"
require "tempfile"
require "yaml"
require_relative "hf_api"
require_relative "utils/validators"
|
|
7
|
+
|
|
8
|
+
module DurableHuggingfaceHub
|
|
9
|
+
# Base class for repository cards (README.md files with YAML frontmatter).
|
|
10
|
+
#
|
|
11
|
+
# Repository cards contain metadata and documentation for models, datasets,
|
|
12
|
+
# and spaces on the HuggingFace Hub. They consist of YAML frontmatter
|
|
13
|
+
# followed by markdown content.
|
|
14
|
+
#
|
|
15
|
+
# @example Load a model card from the Hub
|
|
16
|
+
#   card = DurableHuggingfaceHub::ModelCard.from_hub("bert-base-uncased")
|
|
17
|
+
# puts card.data["license"]
|
|
18
|
+
# puts card.text
|
|
19
|
+
#
|
|
20
|
+
# @example Create and save a new model card
|
|
21
|
+
# card = DurableHuggingfaceHub::ModelCard.new(
|
|
22
|
+
# text: "# My Model\n\nThis is my model.",
|
|
23
|
+
# data: { "license" => "mit", "language" => "en" }
|
|
24
|
+
# )
|
|
25
|
+
# card.save("my-model/README.md")
|
|
26
|
+
class RepoCard
  # Bytes of a UTF-8 byte order mark. Compared byte-wise so the check works
  # regardless of the encoding tag carried by the incoming string.
  UTF8_BOM_BYTES = [0xEF, 0xBB, 0xBF].freeze

  # Line (without its line terminator) that opens and closes the frontmatter.
  FRONTMATTER_DELIMITER = "---".freeze

  # @return [Hash] Metadata from YAML frontmatter
  attr_accessor :data

  # @return [String] Markdown content (without frontmatter)
  attr_accessor :text

  # Initialize a new RepoCard.
  #
  # @param text [String, nil] Markdown content (nil is treated as "")
  # @param data [Hash, nil] Metadata dictionary (nil is treated as {})
  def initialize(text: "", data: {})
    @text = text || ""
    @data = data || {}
  end

  # Load a repository card from a local file.
  #
  # @param file_path [String, Pathname] Path to the README.md file
  # @return [RepoCard] The loaded repository card (an instance of the
  #   class this method was called on)
  # @raise [ArgumentError] If the file does not exist
  #
  # @example Load from local file
  #   card = RepoCard.load("path/to/README.md")
  def self.load(file_path)
    file_path = Pathname(file_path)
    raise ArgumentError, "File not found: #{file_path}" unless file_path.exist?

    parse(file_path.read)
  end

  # Load a repository card from the HuggingFace Hub.
  #
  # @param repo_id [String] Repository ID
  # @param repo_type [String, Symbol, nil] Type of repository ("model",
  #   "dataset", or "space"); defaults to the card class's default_repo_type
  # @param revision [String, nil] Git revision (branch, tag, or commit SHA);
  #   "main" when nil
  # @param token [String, nil] HuggingFace API token
  # @param timeout [Numeric, nil] Request timeout in seconds
  # @return [RepoCard] The loaded repository card
  #
  # @raise [EntryNotFoundError] If the request answers with HTTP 404
  # @raise [HfHubHTTPError] Any other HTTP error, re-raised unchanged
  #
  # @example Load model card from Hub
  #   card = ModelCard.from_hub("bert-base-uncased")
  def self.from_hub(repo_id, repo_type: nil, revision: nil, token: nil, timeout: nil)
    Utils::Validators.validate_repo_id(repo_id)
    repo_type = Utils::Validators.validate_repo_type(repo_type || default_repo_type)

    api = HfApi.new(token: token)

    # NOTE(review): this yields "/models/<id>/resolve/..." for model repos.
    # The public Hub URL scheme appears to use no prefix for models (only
    # "datasets/" and "spaces/") - confirm against the http client's base URL.
    # NOTE(review): revision is interpolated unescaped; a revision containing
    # "/" (e.g. "refs/pr/1") presumably needs percent-encoding - confirm.
    url_path = "/#{repo_type}s/#{repo_id}/resolve/#{revision || 'main'}/README.md"

    begin
      parse(api.http_client.get(url_path, timeout: timeout).body)
    rescue HfHubHTTPError => e
      # Every 404 is reported as a missing README, including (presumably) a
      # missing repository - TODO confirm whether the two should be told apart.
      raise unless e.status_code == 404

      raise EntryNotFoundError, "README.md not found in #{repo_id}"
    end
  end

  # Parse repository card content (YAML frontmatter + markdown).
  #
  # Frontmatter is recognised when the first line is exactly "---" and a
  # later line is exactly "---"; both LF and CRLF line endings are accepted,
  # the closing delimiter may be the last line without a trailing newline,
  # and a leading UTF-8 byte order mark is dropped. Without a complete
  # frontmatter block the whole content becomes the card text, unstripped.
  #
  # Frontmatter that cannot be loaded safely (syntax error, disallowed class,
  # alias) or that is not a mapping produces a warning and empty metadata.
  #
  # @param content [String, nil] Full content of README.md (nil is treated
  #   as an empty document)
  # @return [RepoCard] Parsed repository card
  #
  # @example Parse content string
  #   content = "---\nlicense: mit\n---\n# My Model"
  #   card = RepoCard.parse(content)
  def self.parse(content)
    content = strip_bom(content.to_s)
    yaml_source, markdown = split_frontmatter(content)
    return new(text: content) unless yaml_source

    new(text: markdown.strip, data: load_metadata(yaml_source))
  end

  # Remove a leading UTF-8 byte order mark, if present.
  def self.strip_bom(content)
    return content unless content.byteslice(0, 3).bytes == UTF8_BOM_BYTES

    content.byteslice(3, content.bytesize - 3)
  end

  # Split content into [yaml_source, markdown], or return nil when there is
  # no complete frontmatter block.
  def self.split_frontmatter(content)
    lines = content.lines
    return nil unless lines.first&.chomp == FRONTMATTER_DELIMITER

    # Index is relative to the lines after the opening delimiter.
    closing = lines.drop(1).index { |line| line.chomp == FRONTMATTER_DELIMITER }
    return nil unless closing

    [lines[1, closing].join, lines.drop(closing + 2).join]
  end

  # Safely load frontmatter YAML into a Hash, degrading to {} with a warning.
  def self.load_metadata(yaml_source)
    metadata = YAML.safe_load(yaml_source, permitted_classes: [Date, Time])
    return {} unless metadata
    return metadata if metadata.is_a?(Hash)

    warn "Ignoring YAML frontmatter: expected a mapping, got #{metadata.class}"
    {}
  rescue Psych::Exception => e
    # Psych::Exception covers SyntaxError as well as DisallowedClass and
    # BadAlias, which safe_load raises for content it refuses to load.
    warn "Failed to parse YAML frontmatter: #{e.message}"
    {}
  end

  private_class_method :strip_bom, :split_frontmatter, :load_metadata

  # Convert the repository card to a string (YAML frontmatter + markdown).
  #
  # When the metadata is empty, only the markdown text is returned.
  #
  # @return [String] Full content with frontmatter
  #
  # @example Convert to string
  #   content = card.to_s
  #   File.write("README.md", content)
  def to_s
    return @text if @data.empty?

    # YAML.dump prefixes the document with its own "---" header line; drop it
    # because the delimiters are written explicitly below.
    yaml_str = YAML.dump(@data).delete_prefix("---\n")
    "---\n#{yaml_str}---\n\n#{@text}"
  end

  # Save the repository card to a file, creating parent directories.
  #
  # @param file_path [String, Pathname] Path to save the README.md file
  # @return [Integer] Number of bytes written (result of Pathname#write)
  #
  # @example Save to local file
  #   card.save("my-model/README.md")
  def save(file_path)
    target = Pathname(file_path)
    target.dirname.mkpath
    target.write(to_s)
  end

  # Push the repository card to the HuggingFace Hub as README.md.
  #
  # The card is serialised to a temporary file which is handed to
  # HfApi#upload_file and removed when the upload call returns.
  #
  # @param repo_id [String] Repository ID
  # @param repo_type [String, Symbol] Type of repository
  # @param revision [String, nil] Git revision to push to
  # @param commit_message [String, nil] Commit message ("Update README.md"
  #   when nil)
  # @param commit_description [String, nil] Commit description
  # @param token [String, nil] HuggingFace API token
  # @param timeout [Numeric, nil] Request timeout in seconds
  # @return [String] Whatever HfApi#upload_file returns (documented here as
  #   the URL of the uploaded README.md)
  #
  # @example Push to Hub
  #   card.push_to_hub("my-username/my-model")
  def push_to_hub(
    repo_id,
    repo_type: self.class.default_repo_type,
    revision: nil,
    commit_message: nil,
    commit_description: nil,
    token: nil,
    timeout: nil
  )
    api = HfApi.new(token: token)

    Tempfile.create(["README", ".md"]) do |temp_file|
      temp_file.write(to_s)
      # Flush so the uploader, which reads by path, sees the full content.
      temp_file.flush

      api.upload_file(
        repo_id: repo_id,
        path_or_fileobj: temp_file.path,
        path_in_repo: "README.md",
        repo_type: repo_type,
        revision: revision,
        commit_message: commit_message || "Update README.md",
        commit_description: commit_description,
        timeout: timeout
      )
    end
  end

  # Merge updates into the card metadata in place.
  #
  # @param updates [Hash] Metadata updates to merge
  # @return [Hash] The updated metadata
  #
  # @example Update metadata
  #   card.update_metadata({ "license" => "apache-2.0" })
  def update_metadata(updates)
    @data.merge!(updates)
  end

  # Default repository type for this card class.
  # Subclasses should override this.
  #
  # @return [String] Default repository type
  def self.default_repo_type
    "model"
  end

  # Validate the repository card metadata.
  # Subclasses can override this to add specific validation.
  #
  # @return [Array<String>] List of validation errors (empty if valid)
  def validate
    []
  end
end
|
|
228
|
+
|
|
229
|
+
# Model card for documenting machine learning models.
|
|
230
|
+
#
|
|
231
|
+
# Model cards provide essential information about models including:
|
|
232
|
+
# - Model architecture and training details
|
|
233
|
+
# - Intended use and limitations
|
|
234
|
+
# - Training data and evaluation results
|
|
235
|
+
# - Ethical considerations
|
|
236
|
+
#
|
|
237
|
+
# @example Create a model card
|
|
238
|
+
# card = ModelCard.new(
|
|
239
|
+
# text: "# BERT Base Uncased\n\nBERT model trained on...",
|
|
240
|
+
# data: {
|
|
241
|
+
# "license" => "apache-2.0",
|
|
242
|
+
# "language" => "en",
|
|
243
|
+
# "tags" => ["bert", "nlp"]
|
|
244
|
+
# }
|
|
245
|
+
# )
|
|
246
|
+
class ModelCard < RepoCard
  # @return [String] Default repository type for model cards
  def self.default_repo_type
    "model"
  end

  # Validate model card metadata.
  #
  # Checks that "license" and "language" are present, that "license" is a
  # String, that "language" is a String or an Array of Strings, and that
  # "tags", when present, is an Array. Each problem is reported once.
  #
  # @return [Array<String>] List of validation errors (empty if valid)
  def validate
    license = @data["license"]
    language = @data["language"]
    tags = @data["tags"]

    errors = []
    errors << "license is required" unless license
    errors << "language is required" unless language
    errors << "license must be a string" if license && !license.is_a?(String)

    case language
    when nil, false, String
      # Missing (already reported above) or a single language: nothing to add.
    when Array
      # Report once no matter how many elements are invalid; the message
      # carries no per-element detail, so repeating it adds nothing.
      errors << "language array elements must be strings" unless language.all?(String)
    else
      errors << "language must be a string or array of strings"
    end

    errors << "tags must be an array" if tags && !tags.is_a?(Array)
    errors
  end

  # Add an evaluation result to the model card metadata.
  #
  # The result is appended to the "results" list of the first entry of
  # @data["model-index"]; the list and the entry are created when missing.
  #
  # NOTE(review): the Hub's model-index schema appears to also expect a
  # "name" on the entry and a "type" on dataset and metrics - confirm against
  # the model card metadata specification.
  #
  # @param task_type [String] Type of task (e.g., "text-classification")
  # @param dataset_name [String] Name of the evaluation dataset
  # @param metric_name [String] Name of the metric
  # @param metric_value [Numeric] Value of the metric
  #
  # @example Add evaluation result
  #   card.add_evaluation_result(
  #     task_type: "text-classification",
  #     dataset_name: "glue",
  #     metric_name: "accuracy",
  #     metric_value: 0.95
  #   )
  def add_evaluation_result(task_type:, dataset_name:, metric_name:, metric_value:)
    entries = (@data["model-index"] ||= [])
    entry = entries.first || {}

    (entry["results"] ||= []) << {
      "task" => { "type" => task_type },
      "dataset" => { "name" => dataset_name },
      "metrics" => [{ "name" => metric_name, "value" => metric_value }]
    }

    # An existing first entry was mutated in place; only a freshly built
    # entry still has to be stored.
    @data["model-index"] = [entry] if entries.empty?
  end
end
|
|
319
|
+
|
|
320
|
+
# Dataset card for documenting datasets.
|
|
321
|
+
#
|
|
322
|
+
# Dataset cards provide information about datasets including:
|
|
323
|
+
# - Dataset description and structure
|
|
324
|
+
# - Data collection methodology
|
|
325
|
+
# - Intended use cases
|
|
326
|
+
# - Limitations and biases
|
|
327
|
+
#
|
|
328
|
+
# @example Create a dataset card
|
|
329
|
+
# card = DatasetCard.new(
|
|
330
|
+
# text: "# My Dataset\n\nThis dataset contains...",
|
|
331
|
+
# data: {
|
|
332
|
+
# "license" => "cc-by-4.0",
|
|
333
|
+
# "language" => ["en", "es"],
|
|
334
|
+
# "task_categories" => ["text-classification"]
|
|
335
|
+
# }
|
|
336
|
+
# )
|
|
337
|
+
class DatasetCard < RepoCard
  # @return [String] Default repository type for dataset cards
  def self.default_repo_type
    "dataset"
  end

  # Validate dataset card metadata.
  #
  # Checks that "license" is present and a String, that "language", when
  # present, is a String or an Array of Strings, and that "task_categories",
  # when present, is an Array. Each problem is reported once.
  #
  # @return [Array<String>] List of validation errors (empty if valid)
  def validate
    license = @data["license"]
    language = @data["language"]
    task_categories = @data["task_categories"]

    errors = []
    errors << "license is required" unless license
    errors << "license must be a string" if license && !license.is_a?(String)

    case language
    when nil, false, String
      # Optional for datasets, or a single language: nothing to report.
    when Array
      # Report once no matter how many elements are invalid; the message
      # carries no per-element detail, so repeating it adds nothing.
      errors << "language array elements must be strings" unless language.all?(String)
    else
      errors << "language must be a string or array of strings"
    end

    if task_categories && !task_categories.is_a?(Array)
      errors << "task_categories must be an array"
    end

    errors
  end
end
|
|
379
|
+
|
|
380
|
+
# Space card for documenting Spaces (interactive demos).
|
|
381
|
+
#
|
|
382
|
+
# Space cards provide information about Gradio/Streamlit apps including:
|
|
383
|
+
# - App description and usage
|
|
384
|
+
# - Technical requirements
|
|
385
|
+
# - Model dependencies
|
|
386
|
+
#
|
|
387
|
+
# @example Create a space card
|
|
388
|
+
# card = SpaceCard.new(
|
|
389
|
+
# text: "# My Demo\n\nThis space demonstrates...",
|
|
390
|
+
# data: {
|
|
391
|
+
# "sdk" => "gradio",
|
|
392
|
+
# "sdk_version" => "3.0.0",
|
|
393
|
+
# "app_file" => "app.py"
|
|
394
|
+
# }
|
|
395
|
+
# )
|
|
396
|
+
class SpaceCard < RepoCard
  # @return [String] Default repository type for space cards
  def self.default_repo_type
    "space"
  end

  # Validate space card metadata.
  #
  # Requires "sdk" and "app_file"; "sdk" must be one of gradio, streamlit,
  # docker or static, and "app_file" and "sdk_version" must be Strings when
  # present.
  #
  # @return [Array<String>] List of validation errors (empty if valid)
  def validate
    sdk = @data["sdk"]
    app_file = @data["app_file"]
    sdk_version = @data["sdk_version"]

    problems = []

    # Presence checks
    problems << "sdk is required" unless sdk
    problems << "app_file is required" unless app_file

    # Value and type checks only apply to fields that are present
    if sdk && !%w[gradio streamlit docker static].include?(sdk)
      problems << "sdk must be one of: gradio, streamlit, docker, static"
    end
    problems << "app_file must be a string" if app_file && !app_file.is_a?(String)
    problems << "sdk_version must be a string" if sdk_version && !sdk_version.is_a?(String)

    problems
  end
end
|
|
430
|
+
end
|