durable_huggingface_hub 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. checksums.yaml +7 -0
  2. data/.editorconfig +29 -0
  3. data/.rubocop.yml +108 -0
  4. data/CHANGELOG.md +127 -0
  5. data/README.md +547 -0
  6. data/Rakefile +106 -0
  7. data/devenv.lock +171 -0
  8. data/devenv.nix +15 -0
  9. data/devenv.yaml +8 -0
  10. data/huggingface_hub.gemspec +63 -0
  11. data/lib/durable_huggingface_hub/authentication.rb +245 -0
  12. data/lib/durable_huggingface_hub/cache.rb +508 -0
  13. data/lib/durable_huggingface_hub/configuration.rb +191 -0
  14. data/lib/durable_huggingface_hub/constants.rb +145 -0
  15. data/lib/durable_huggingface_hub/errors.rb +412 -0
  16. data/lib/durable_huggingface_hub/file_download.rb +831 -0
  17. data/lib/durable_huggingface_hub/hf_api.rb +1278 -0
  18. data/lib/durable_huggingface_hub/repo_card.rb +430 -0
  19. data/lib/durable_huggingface_hub/types/cache_info.rb +298 -0
  20. data/lib/durable_huggingface_hub/types/commit_info.rb +149 -0
  21. data/lib/durable_huggingface_hub/types/dataset_info.rb +158 -0
  22. data/lib/durable_huggingface_hub/types/model_info.rb +154 -0
  23. data/lib/durable_huggingface_hub/types/space_info.rb +158 -0
  24. data/lib/durable_huggingface_hub/types/user.rb +179 -0
  25. data/lib/durable_huggingface_hub/types.rb +205 -0
  26. data/lib/durable_huggingface_hub/utils/auth.rb +174 -0
  27. data/lib/durable_huggingface_hub/utils/headers.rb +220 -0
  28. data/lib/durable_huggingface_hub/utils/http.rb +329 -0
  29. data/lib/durable_huggingface_hub/utils/paths.rb +230 -0
  30. data/lib/durable_huggingface_hub/utils/progress.rb +217 -0
  31. data/lib/durable_huggingface_hub/utils/retry.rb +165 -0
  32. data/lib/durable_huggingface_hub/utils/validators.rb +236 -0
  33. data/lib/durable_huggingface_hub/version.rb +8 -0
  34. data/lib/huggingface_hub.rb +205 -0
  35. metadata +334 -0
@@ -0,0 +1,430 @@
1
+ # frozen_string_literal: true
2
+
3
require "date"
require "pathname"
require "uri"
require "yaml"
require_relative "hf_api"
require_relative "utils/validators"
7
+
8
+ module DurableHuggingfaceHub
9
# Base class for repository cards (README.md files with YAML frontmatter).
#
# Repository cards contain metadata and documentation for models, datasets,
# and spaces on the HuggingFace Hub. They consist of YAML frontmatter
# (delimited by `---` lines) followed by markdown content.
#
# @example Load a model card from the Hub
#   card = DurableHuggingfaceHub::ModelCard.from_hub("bert-base-uncased")
#   puts card.data["license"]
#   puts card.text
#
# @example Load a card from a local file
#   card = DurableHuggingfaceHub::ModelCard.load("my-model/README.md")
#
# @example Create and save a new model card
#   card = DurableHuggingfaceHub::ModelCard.new(
#     text: "# My Model\n\nThis is my model.",
#     data: { "license" => "mit", "language" => "en" }
#   )
#   card.save("my-model/README.md")
class RepoCard
  # Matches a frontmatter block at the very start of the content: an opening
  # `---` line, the YAML body (possibly empty), and a closing `---` line that
  # is terminated by a newline or by the end of the content. LF and CRLF line
  # endings are both accepted. `^` is intentional here: the closing marker
  # must sit at the start of a line, not merely somewhere in the text.
  FRONTMATTER_PATTERN = /\A---\r?\n(?<yaml>.*?)^---\r?(?:\n|\z)/m.freeze
  private_constant :FRONTMATTER_PATTERN

  # @return [Hash] Metadata from YAML frontmatter
  attr_accessor :data

  # @return [String] Markdown content (without frontmatter)
  attr_accessor :text

  # Initialize a new RepoCard.
  #
  # @param text [String, nil] Markdown content (nil is treated as "")
  # @param data [Hash, nil] Metadata dictionary (nil is treated as {})
  def initialize(text: "", data: {})
    @text = text || ""
    @data = data || {}
  end

  # Load a repository card from a local file.
  #
  # @param file_path [String, Pathname] Path to the README.md file
  # @return [RepoCard] The loaded repository card
  # @raise [ArgumentError] If nothing exists at +file_path+
  #
  # @example Load from local file
  #   card = RepoCard.load("path/to/README.md")
  def self.load(file_path)
    file_path = Pathname(file_path)
    raise ArgumentError, "File not found: #{file_path}" unless file_path.exist?

    parse(file_path.read)
  end

  # Load a repository card from the HuggingFace Hub.
  #
  # @param repo_id [String] Repository ID
  # @param repo_type [String, Symbol, nil] Type of repository ("model",
  #   "dataset", or "space"); defaults to the card class's default type
  # @param revision [String, nil] Git revision (branch, tag, or commit SHA);
  #   defaults to "main"
  # @param token [String, nil] HuggingFace API token
  # @param timeout [Numeric, nil] Request timeout in seconds
  # @return [RepoCard] The loaded repository card
  #
  # @raise [EntryNotFoundError] If the request for README.md answers 404
  #
  # @example Load model card from Hub
  #   card = ModelCard.from_hub("bert-base-uncased")
  def self.from_hub(repo_id, repo_type: nil, revision: nil, token: nil, timeout: nil)
    Utils::Validators.validate_repo_id(repo_id)
    repo_type = Utils::Validators.validate_repo_type(repo_type || default_repo_type)

    api = HfApi.new(token: token)
    url_path = readme_resolve_path(repo_id, repo_type, revision)

    begin
      response = api.http_client.get(url_path, timeout: timeout)
      parse(response.body)
    rescue HfHubHTTPError => e
      # NOTE(review): if the HTTP client raises a more specific
      # HfHubHTTPError subclass for a missing repository, this branch
      # relabels it as a missing README - confirm against the client.
      raise unless e.status_code == 404

      raise EntryNotFoundError, "README.md not found in #{repo_id}"
    end
  end

  # Parse repository card content (YAML frontmatter + markdown).
  #
  # Content without a complete frontmatter block is returned untouched as
  # the card text. When a block is present, the text after it is stripped of
  # surrounding whitespace. Frontmatter that cannot be loaded safely, or
  # that is not a mapping, produces a warning and empty metadata.
  #
  # @param content [String, nil] Full content of README.md
  # @return [RepoCard] Parsed repository card
  #
  # @example Parse content string
  #   content = "---\nlicense: mit\n---\n# My Model"
  #   card = RepoCard.parse(content)
  def self.parse(content)
    content = content.to_s
    # Cheap prefix test first so plain markdown never reaches the regexp.
    return new(text: content) unless content.start_with?("---")

    match = FRONTMATTER_PATTERN.match(content)
    return new(text: content) unless match

    new(text: match.post_match.strip, data: load_frontmatter(match[:yaml]))
  end

  # Default repository type for this card class.
  # Subclasses should override this.
  #
  # @return [String] Default repository type
  def self.default_repo_type
    "model"
  end

  # Build the Hub path that resolves README.md for a repository.
  #
  # Model repositories live at the root of the Hub and take no type prefix;
  # other types are prefixed with their pluralised name (e.g. "/datasets").
  # The revision is percent-encoded so that refs containing "/" (such as
  # "refs/pr/1") stay a single path segment.
  #
  # @param repo_id [String] Repository ID
  # @param repo_type [String, Symbol, nil] Validated repository type
  # @param revision [String, nil] Git revision, "main" when nil
  # @return [String] Path relative to the Hub endpoint
  def self.readme_resolve_path(repo_id, repo_type, revision)
    type_name = repo_type.to_s
    prefix = type_name.empty? || type_name == "model" ? "" : "/#{type_name}s"
    # encode_www_form_component turns spaces into "+", which is only
    # meaningful in query strings; use "%20" for a path segment.
    encoded_revision = URI.encode_www_form_component(revision || "main").gsub("+", "%20")

    "#{prefix}/#{repo_id}/resolve/#{encoded_revision}/README.md"
  end
  private_class_method :readme_resolve_path

  # Safely load the YAML frontmatter body into a metadata Hash.
  #
  # @param yaml_source [String] Text between the `---` markers
  # @return [Hash] Parsed metadata, or {} when empty, invalid, or not a mapping
  def self.load_frontmatter(yaml_source)
    metadata = YAML.safe_load(yaml_source, permitted_classes: [Date, Time])
    return {} unless metadata
    return metadata if metadata.is_a?(Hash)

    warn "Failed to parse YAML frontmatter: expected a mapping, got #{metadata.class}"
    {}
  rescue Psych::Exception => e
    # Psych::Exception covers syntax errors as well as disallowed classes
    # and aliases rejected by safe_load.
    warn "Failed to parse YAML frontmatter: #{e.message}"
    {}
  end
  private_class_method :load_frontmatter

  # Convert the repository card to a string (YAML frontmatter + markdown).
  #
  # When there is no metadata, only the markdown text is returned.
  #
  # @return [String] Full content with frontmatter
  #
  # @example Convert to string
  #   content = card.to_s
  #   File.write("README.md", content)
  def to_s
    return @text if @data.empty?

    # YAML.dump emits its own leading document marker; drop it because the
    # frontmatter delimiters are written explicitly below.
    yaml_str = YAML.dump(@data).sub(/\A---\n/, "")
    "---\n#{yaml_str}---\n\n#{@text}"
  end

  # Save the repository card to a file, creating parent directories.
  #
  # @param file_path [String, Pathname] Path to save the README.md file
  #
  # @example Save to local file
  #   card.save("my-model/README.md")
  def save(file_path)
    file_path = Pathname(file_path)
    file_path.dirname.mkpath
    file_path.write(to_s)
  end

  # Push the repository card to the HuggingFace Hub as README.md.
  #
  # @param repo_id [String] Repository ID
  # @param repo_type [String, Symbol] Type of repository
  # @param revision [String, nil] Git revision to push to
  # @param commit_message [String, nil] Commit message
  #   (defaults to "Update README.md")
  # @param commit_description [String, nil] Commit description
  # @param token [String, nil] HuggingFace API token
  # @param timeout [Numeric, nil] Request timeout in seconds
  # @return [Object] Whatever HfApi#upload_file returns
  #
  # @example Push to Hub
  #   card.push_to_hub("my-username/my-model")
  def push_to_hub(
    repo_id,
    repo_type: self.class.default_repo_type,
    revision: nil,
    commit_message: nil,
    commit_description: nil,
    token: nil,
    timeout: nil
  )
    api = HfApi.new(token: token)

    # Loaded lazily: only needed when a card is actually pushed.
    require "tempfile"
    Tempfile.create(["README", ".md"]) do |temp_file|
      temp_file.write(to_s)
      # Flush so the uploader, which reads by path, sees the full content.
      temp_file.flush

      api.upload_file(
        repo_id: repo_id,
        path_or_fileobj: temp_file.path,
        path_in_repo: "README.md",
        repo_type: repo_type,
        revision: revision,
        commit_message: commit_message || "Update README.md",
        commit_description: commit_description,
        timeout: timeout
      )
    end
  end

  # Merge updates into the card metadata in place.
  #
  # @param updates [Hash] Metadata updates to merge
  # @return [Hash] The updated metadata
  #
  # @example Update metadata
  #   card.update_metadata({ "license" => "apache-2.0" })
  def update_metadata(updates)
    @data.merge!(updates)
  end

  # Validate the repository card metadata.
  # Subclasses can override this to add specific validation.
  #
  # @return [Array<String>] List of validation errors (empty if valid)
  def validate
    []
  end
end
228
+
229
# Repository card specialised for machine learning models.
#
# A model card typically documents:
# - Model architecture and training details
# - Intended use and limitations
# - Training data and evaluation results
# - Ethical considerations
#
# @example Create a model card
#   card = ModelCard.new(
#     text: "# BERT Base Uncased\n\nBERT model trained on...",
#     data: {
#       "license" => "apache-2.0",
#       "language" => "en",
#       "tags" => ["bert", "nlp"]
#     }
#   )
class ModelCard < RepoCard
  # @return [String] Repository type used when none is given ("model")
  def self.default_repo_type
    "model"
  end

  # Check the metadata against the expectations for a model card.
  #
  # "license" and "language" are required. "license" must be a String,
  # "language" a String or an Array of Strings, and "tags" (when present)
  # an Array.
  #
  # @return [Array<String>] List of validation errors (empty when valid)
  def validate
    license = @data["license"]
    language = @data["language"]
    tags = @data["tags"]
    problems = []

    # Presence checks come first so their messages lead the list.
    problems << "license is required" unless license
    problems << "language is required" unless language

    problems << "license must be a string" if license && !license.is_a?(String)

    case language
    when nil, false, String
      # Absent (already reported above) or a single language: nothing to add.
    when Array
      # One message per offending element.
      language.each do |entry|
        problems << "language array elements must be strings" unless entry.is_a?(String)
      end
    else
      problems << "language must be a string or array of strings"
    end

    problems << "tags must be an array" if tags && !tags.is_a?(Array)

    problems
  end

  # Append an evaluation result to the first "model-index" entry of the
  # metadata, creating the "model-index" list and its first entry on demand.
  #
  # @param task_type [String] Type of task (e.g., "text-classification")
  # @param dataset_name [String] Name of the evaluation dataset
  # @param metric_name [String] Name of the metric
  # @param metric_value [Numeric] Value of the metric
  #
  # @example Add evaluation result
  #   card.add_evaluation_result(
  #     task_type: "text-classification",
  #     dataset_name: "glue",
  #     metric_name: "accuracy",
  #     metric_value: 0.95
  #   )
  def add_evaluation_result(task_type:, dataset_name:, metric_name:, metric_value:)
    entries = (@data["model-index"] ||= [])
    entry = entries.first || {}

    (entry["results"] ||= []) << {
      "task" => { "type" => task_type },
      "dataset" => { "name" => dataset_name },
      "metrics" => [{ "name" => metric_name, "value" => metric_value }]
    }

    # A freshly built entry is not yet part of the metadata; attach it.
    @data["model-index"] = [entry] if entries.empty?
  end
end
319
+
320
# Repository card specialised for datasets.
#
# A dataset card typically documents:
# - Dataset description and structure
# - Data collection methodology
# - Intended use cases
# - Limitations and biases
#
# @example Create a dataset card
#   card = DatasetCard.new(
#     text: "# My Dataset\n\nThis dataset contains...",
#     data: {
#       "license" => "cc-by-4.0",
#       "language" => ["en", "es"],
#       "task_categories" => ["text-classification"]
#     }
#   )
class DatasetCard < RepoCard
  # @return [String] Repository type used when none is given ("dataset")
  def self.default_repo_type
    "dataset"
  end

  # Check the metadata against the expectations for a dataset card.
  #
  # "license" is required and must be a String. "language" (when present)
  # must be a String or an Array of Strings, and "task_categories" (when
  # present) an Array.
  #
  # @return [Array<String>] List of validation errors (empty when valid)
  def validate
    license = @data["license"]
    language = @data["language"]
    task_categories = @data["task_categories"]
    problems = []

    problems << "license is required" unless license
    problems << "license must be a string" if license && !license.is_a?(String)

    case language
    when nil, false, String
      # Absent or a single language: nothing to report.
    when Array
      # One message per offending element.
      language.each do |entry|
        problems << "language array elements must be strings" unless entry.is_a?(String)
      end
    else
      problems << "language must be a string or array of strings"
    end

    if task_categories && !task_categories.is_a?(Array)
      problems << "task_categories must be an array"
    end

    problems
  end
end
379
+
380
# Repository card specialised for Spaces (interactive demos).
#
# A space card typically documents a Gradio/Streamlit app:
# - App description and usage
# - Technical requirements
# - Model dependencies
#
# @example Create a space card
#   card = SpaceCard.new(
#     text: "# My Demo\n\nThis space demonstrates...",
#     data: {
#       "sdk" => "gradio",
#       "sdk_version" => "3.0.0",
#       "app_file" => "app.py"
#     }
#   )
class SpaceCard < RepoCard
  # @return [String] Repository type used when none is given ("space")
  def self.default_repo_type
    "space"
  end

  # Check the metadata against the expectations for a space card.
  #
  # "sdk" and "app_file" are required. "sdk" must be one of the supported
  # SDK names, and "app_file" and "sdk_version" (when present) must be
  # Strings.
  #
  # @return [Array<String>] List of validation errors (empty when valid)
  def validate
    sdk = @data["sdk"]
    app_file = @data["app_file"]
    sdk_version = @data["sdk_version"]
    supported_sdks = ["gradio", "streamlit", "docker", "static"]
    problems = []

    # Presence checks come first so their messages lead the list.
    problems << "sdk is required" unless sdk
    problems << "app_file is required" unless app_file

    if sdk && !supported_sdks.include?(sdk)
      problems << "sdk must be one of: gradio, streamlit, docker, static"
    end

    problems << "app_file must be a string" if app_file && !app_file.is_a?(String)
    problems << "sdk_version must be a string" if sdk_version && !sdk_version.is_a?(String)

    problems
  end
end
430
+ end