dagster-hf-datasets 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. dagster_hf_datasets-0.0.2/PKG-INFO +146 -0
  2. dagster_hf_datasets-0.0.2/README.md +115 -0
  3. dagster_hf_datasets-0.0.2/dagster_hf_datasets/__init__.py +16 -0
  4. dagster_hf_datasets-0.0.2/dagster_hf_datasets/_export/__init__.py +7 -0
  5. dagster_hf_datasets-0.0.2/dagster_hf_datasets/_export/_dataset_card.py +393 -0
  6. dagster_hf_datasets-0.0.2/dagster_hf_datasets/_export/_publisher.py +131 -0
  7. dagster_hf_datasets-0.0.2/dagster_hf_datasets/_metadata/__init__.py +17 -0
  8. dagster_hf_datasets-0.0.2/dagster_hf_datasets/_metadata/_dataset_metadata.py +379 -0
  9. dagster_hf_datasets-0.0.2/dagster_hf_datasets/_partitions/__init__.py +3 -0
  10. dagster_hf_datasets-0.0.2/dagster_hf_datasets/_partitions/_partition_mapping.py +138 -0
  11. dagster_hf_datasets-0.0.2/dagster_hf_datasets/assets/__init__.py +7 -0
  12. dagster_hf_datasets-0.0.2/dagster_hf_datasets/assets/dataset_asset.py +127 -0
  13. dagster_hf_datasets-0.0.2/dagster_hf_datasets/assets/multi_asset.py +202 -0
  14. dagster_hf_datasets-0.0.2/dagster_hf_datasets/io_manager/__init__.py +3 -0
  15. dagster_hf_datasets-0.0.2/dagster_hf_datasets/io_manager/_io_manager.py +140 -0
  16. dagster_hf_datasets-0.0.2/dagster_hf_datasets/resources/__init__.py +3 -0
  17. dagster_hf_datasets-0.0.2/dagster_hf_datasets/resources/huggingface_resource.py +295 -0
  18. dagster_hf_datasets-0.0.2/dagster_hf_datasets.egg-info/PKG-INFO +146 -0
  19. dagster_hf_datasets-0.0.2/dagster_hf_datasets.egg-info/SOURCES.txt +22 -0
  20. dagster_hf_datasets-0.0.2/dagster_hf_datasets.egg-info/dependency_links.txt +1 -0
  21. dagster_hf_datasets-0.0.2/dagster_hf_datasets.egg-info/requires.txt +3 -0
  22. dagster_hf_datasets-0.0.2/dagster_hf_datasets.egg-info/top_level.txt +1 -0
  23. dagster_hf_datasets-0.0.2/pyproject.toml +68 -0
  24. dagster_hf_datasets-0.0.2/setup.cfg +4 -0
@@ -0,0 +1,146 @@
1
+ Metadata-Version: 2.4
2
+ Name: dagster-hf-datasets
3
+ Version: 0.0.2
4
+ Summary: Dagster Integration with HF Datasets
5
+ Author: Parag Ekbote
6
+ License-Expression: Apache-2.0
7
+ Project-URL: Documentation, https://docs.dagster.io/integrations/libraries/hf-datasets
8
+ Project-URL: GitHub, https://github.com/dagster-io/community-integrations/tree/main/libraries/dagster-hf-datasets
9
+ Project-URL: Examples, https://huggingface.co/buckets/the-hf-stack/dagster-hf-datasets-examples
10
+ Project-URL: Release Article, https://huggingface.co/blog/AINovice2005/dagster-hf-datasets
11
+ Keywords: dagster,huggingface,datasets,etl,data-pipelines,mlops
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Intended Audience :: Information Technology
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Operating System :: OS Independent
21
+ Classifier: Topic :: Database
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Software Development :: Libraries
25
+ Classifier: Topic :: Utilities
26
+ Requires-Python: >=3.10
27
+ Description-Content-Type: text/markdown
28
+ Requires-Dist: dagster>=1.8.0
29
+ Requires-Dist: huggingface_hub>=1.0.0
30
+ Requires-Dist: datasets>=4.8.0
31
+
32
+ # Dagster-HF-Datasets
33
+
34
+ <p align="center">
35
+ <img
36
+ src="https://raw.githubusercontent.com/dagster-io/community-integrations/main/libraries/dagster-hf-datasets/docs/assets/dagster_readme_logo.jpg"
37
+ alt="Dagster-HF-Datasets Logo"
38
+ width="500"
39
+ />
40
+ </p>
41
+
42
+ ## Overview
43
+
44
+ Dagster-HF-Datasets integrates Hugging Face datasets with Dagster for building reproducible, observable data pipelines. Load datasets directly as Dagster assets, apply transformations, and publish results back to the Hub.
45
+
46
+ ### Features
47
+
48
+ - **Hugging Face dataset assets** — Load any HF dataset as a Dagster asset with automatic metadata.
49
+ - **Streaming support** — Efficiently handle large datasets with runtime-only streaming mode.
50
+ - **Parquet persistence** — Auto-save datasets to disk for caching and versioning.
51
+ - **Metadata & lineage** — Rich metadata for observability and data lineage tracking.
52
+ - **Multi-asset pipelines** — Create split-aware assets from datasets with multiple splits.
53
+ - **Hub publishing** — Push processed datasets back to the Hugging Face Hub with dataset cards.
54
+
55
+ ---
56
+
57
+ ## Installation
58
+
59
+ ```bash
60
+ pip install dagster-hf-datasets
61
+ ```
62
+
63
+ ## Development Install
64
+
65
+ ```bash
66
+ git clone https://github.com/dagster-io/community-integrations.git
67
+
68
+ cd community-integrations/libraries/dagster-hf-datasets
69
+
70
+ pip install -e .
71
+ ```
72
+
73
+ ---
74
+
75
+ ## Examples
76
+
77
+ ### Basic Asset Pipeline
78
+
79
+ Get started with a simple example of materializing a Hugging Face dataset as a Dagster asset:
80
+
81
+ See [examples/basic_asset_pipeline.py](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/examples/basic_asset_pipeline.py)
82
+
83
+ - Dataset materialization with `hf_dataset_asset`
84
+ - Parquet persistence via `HFParquetIOManager`
85
+ - Automatic metadata enrichment
86
+ - Hugging Face Hub observability
87
+
88
+ ---
89
+
90
+ ### Multi-Asset Streaming Pipeline
91
+
92
+ Process large datasets efficiently with runtime-only streaming ingestion:
93
+
94
+ See [examples/multi_asset_pipeline.py](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/examples/multi_asset_pipeline.py)
95
+
96
+ - Streaming dataset loading with `load_dataset(..., streaming=True)`
97
+ - Deterministic sampling of IterableDatasets
98
+ - Metadata extraction from streaming sources
99
+ - Conversion to persistent materialized artifacts
100
+
101
+ ---
102
+
103
+ ### Complete Dataset Pipeline
104
+
105
+ Build production-grade data pipelines with dataset cleaning, transformation and publishing:
106
+
107
+ See [examples/dataset_pipeline.py](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/examples/dataset_pipeline.py)
108
+
109
+ - Deduplication and filtering of raw data
110
+ - Text normalization and formatting
111
+ - Multi-step lineage-aware transformations
112
+ - Hugging Face Hub dataset publishing
113
+
114
+ ---
115
+
116
+ ## Documentation
117
+
118
+ - [Usage Guide](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/docs/Usage.md) — Quick start, configuration, publishing datasets to Hugging Face Hub, and metadata/lineage tracking
119
+ - [API Reference](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/docs/API.md) — Complete API documentation for `HuggingFaceResource`, asset decorators, and the IO manager
120
+
121
+ ---
122
+
123
+ ## Resources
124
+
125
+ - [Release Article](https://huggingface.co/blog/AINovice2005/dagster-hf-datasets) — Deep dive into the motivation, architecture, runtime lifecycle and patterns behind `dagster-hf-datasets`.
126
+
127
+ - [Official Dagster Documentation](https://docs.dagster.io/integrations/libraries/hf-datasets) — Installation instructions, features and end-to-end usage guide.
128
+
129
+ - [Examples on Hugging Face](https://huggingface.co/buckets/the-hf-stack/dagster-hf-datasets-examples) — Explore 10+ curated example pipelines.
130
+
131
+ ---
132
+
133
+ ## Development
134
+
135
+ ### Test
136
+
137
+ ```bash
138
+ make test
139
+ ```
140
+
141
+ ### Build
142
+
143
+ ```bash
144
+ make build
145
+ ```
146
+ ---
@@ -0,0 +1,115 @@
1
+ # Dagster-HF-Datasets
2
+
3
+ <p align="center">
4
+ <img
5
+ src="https://raw.githubusercontent.com/dagster-io/community-integrations/main/libraries/dagster-hf-datasets/docs/assets/dagster_readme_logo.jpg"
6
+ alt="Dagster-HF-Datasets Logo"
7
+ width="500"
8
+ />
9
+ </p>
10
+
11
+ ## Overview
12
+
13
+ Dagster-HF-Datasets integrates Hugging Face datasets with Dagster for building reproducible, observable data pipelines. Load datasets directly as Dagster assets, apply transformations, and publish results back to the Hub.
14
+
15
+ ### Features
16
+
17
+ - **Hugging Face dataset assets** — Load any HF dataset as a Dagster asset with automatic metadata.
18
+ - **Streaming support** — Efficiently handle large datasets with runtime-only streaming mode.
19
+ - **Parquet persistence** — Auto-save datasets to disk for caching and versioning.
20
+ - **Metadata & lineage** — Rich metadata for observability and data lineage tracking.
21
+ - **Multi-asset pipelines** — Create split-aware assets from datasets with multiple splits.
22
+ - **Hub publishing** — Push processed datasets back to the Hugging Face Hub with dataset cards.
23
+
24
+ ---
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ pip install dagster-hf-datasets
30
+ ```
31
+
32
+ ## Development Install
33
+
34
+ ```bash
35
+ git clone https://github.com/dagster-io/community-integrations.git
36
+
37
+ cd community-integrations/libraries/dagster-hf-datasets
38
+
39
+ pip install -e .
40
+ ```
41
+
42
+ ---
43
+
44
+ ## Examples
45
+
46
+ ### Basic Asset Pipeline
47
+
48
+ Get started with a simple example of materializing a Hugging Face dataset as a Dagster asset:
49
+
50
+ See [examples/basic_asset_pipeline.py](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/examples/basic_asset_pipeline.py)
51
+
52
+ - Dataset materialization with `hf_dataset_asset`
53
+ - Parquet persistence via `HFParquetIOManager`
54
+ - Automatic metadata enrichment
55
+ - Hugging Face Hub observability
56
+
57
+ ---
58
+
59
+ ### Multi-Asset Streaming Pipeline
60
+
61
+ Process large datasets efficiently with runtime-only streaming ingestion:
62
+
63
+ See [examples/multi_asset_pipeline.py](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/examples/multi_asset_pipeline.py)
64
+
65
+ - Streaming dataset loading with `load_dataset(..., streaming=True)`
66
+ - Deterministic sampling of IterableDatasets
67
+ - Metadata extraction from streaming sources
68
+ - Conversion to persistent materialized artifacts
69
+
70
+ ---
71
+
72
+ ### Complete Dataset Pipeline
73
+
74
+ Build production-grade data pipelines with dataset cleaning, transformation and publishing:
75
+
76
+ See [examples/dataset_pipeline.py](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/examples/dataset_pipeline.py)
77
+
78
+ - Deduplication and filtering of raw data
79
+ - Text normalization and formatting
80
+ - Multi-step lineage-aware transformations
81
+ - Hugging Face Hub dataset publishing
82
+
83
+ ---
84
+
85
+ ## Documentation
86
+
87
+ - [Usage Guide](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/docs/Usage.md) — Quick start, configuration, publishing datasets to Hugging Face Hub, and metadata/lineage tracking
88
+ - [API Reference](https://github.com/dagster-io/community-integrations/blob/main/libraries/dagster-hf-datasets/docs/API.md) — Complete API documentation for `HuggingFaceResource`, asset decorators, and the IO manager
89
+
90
+ ---
91
+
92
+ ## Resources
93
+
94
+ - [Release Article](https://huggingface.co/blog/AINovice2005/dagster-hf-datasets) — Deep dive into the motivation, architecture, runtime lifecycle and patterns behind `dagster-hf-datasets`.
95
+
96
+ - [Official Dagster Documentation](https://docs.dagster.io/integrations/libraries/hf-datasets) — Installation instructions, features and end-to-end usage guide.
97
+
98
+ - [Examples on Hugging Face](https://huggingface.co/buckets/the-hf-stack/dagster-hf-datasets-examples) — Explore 10+ curated example pipelines.
99
+
100
+ ---
101
+
102
+ ## Development
103
+
104
+ ### Test
105
+
106
+ ```bash
107
+ make test
108
+ ```
109
+
110
+ ### Build
111
+
112
+ ```bash
113
+ make build
114
+ ```
115
+ ---
@@ -0,0 +1,16 @@
1
+ from dagster._core.libraries import DagsterLibraryRegistry
2
+ from .resources.huggingface_resource import HuggingFaceResource
3
+ from .assets.dataset_asset import hf_dataset_asset
4
+ from .assets.multi_asset import hf_multi_asset
5
+
6
+ __version__ = "0.0.2"
7
+
8
+ DagsterLibraryRegistry.register(
9
+ "dagster-hf-datasets", __version__, is_dagster_package=False
10
+ )
11
+
12
+ __all__ = [
13
+ "HuggingFaceResource",
14
+ "hf_dataset_asset",
15
+ "hf_multi_asset",
16
+ ]
@@ -0,0 +1,7 @@
1
+ from ._dataset_card import DatasetCardBuilder
2
+ from ._publisher import HFDatasetPublisher
3
+
4
+ __all__ = [
5
+ "DatasetCardBuilder",
6
+ "HFDatasetPublisher",
7
+ ]
@@ -0,0 +1,393 @@
1
from __future__ import annotations

from datetime import datetime, timezone
from typing import Any

# ``datetime.UTC`` only exists on Python 3.11+, but this package declares
# ``Requires-Python: >=3.10``. Keep ``UTC`` available from this module as an
# alias of ``timezone.utc`` so the module also imports on Python 3.10.
UTC = timezone.utc
8
+
9
+
10
+ class DatasetCardBuilder:
11
+ """
12
+ Lightweight lineage-aware Hugging Face
13
+ dataset card builder.
14
+
15
+ Focus areas:
16
+ - provenance
17
+ - reproducibility
18
+ - transformation lineage
19
+ - runtime metadata observability
20
+ - lightweight ML dataset documentation
21
+
22
+ Explicit non-goals:
23
+ - advanced templating systems
24
+ - governance workflows
25
+ - synchronization orchestration
26
+ """
27
+
28
+ HUB_METADATA_KEYS = {
29
+ "hub_downloads",
30
+ "hub_likes",
31
+ "hub_tags",
32
+ "hub_private",
33
+ "hub_gated",
34
+ "dataset_size_bytes",
35
+ "download_size_bytes",
36
+ }
37
+
38
+ PIPELINE_METADATA_KEYS = {
39
+ "pipeline",
40
+ "pipeline_mode",
41
+ "processing_mode",
42
+ "processing_type",
43
+ "source_streaming",
44
+ }
45
+
46
+ REPRODUCIBILITY_KEYS = {
47
+ "fingerprint",
48
+ "revision",
49
+ }
50
+
51
+ SUMMARY_KEYS = {
52
+ "dataset_type",
53
+ "num_rows",
54
+ "features",
55
+ "execution_mode",
56
+ "streaming",
57
+ }
58
+
59
+ def __init__(
60
+ self,
61
+ *,
62
+ dataset_name: str,
63
+ source_dataset: str,
64
+ source_revision: str | None = None,
65
+ description: str | None = None,
66
+ processing_steps: list[str] | None = None,
67
+ metadata: dict[str, Any] | None = None,
68
+ ) -> None:
69
+ self.dataset_name = dataset_name
70
+ self.source_dataset = source_dataset
71
+ self.source_revision = source_revision
72
+ self.description = description
73
+ self.processing_steps = processing_steps or []
74
+ self.metadata = metadata or {}
75
+
76
+ def build(self) -> str:
77
+ """
78
+ Build lineage-aware Hugging Face
79
+ dataset card.
80
+ """
81
+
82
+ generated_at = datetime.now(UTC).isoformat()
83
+
84
+ yaml_frontmatter = self._build_yaml_frontmatter()
85
+
86
+ dataset_summary = self._build_dataset_summary()
87
+
88
+ pipeline_semantics = self._build_pipeline_semantics()
89
+
90
+ processing_section = self._build_processing_steps()
91
+
92
+ metadata_section = self._build_metadata_section()
93
+
94
+ reproducibility_section = self._build_reproducibility_section()
95
+
96
+ usage_section = self._build_usage_section()
97
+
98
+ return f"""---
99
+ {yaml_frontmatter}
100
+ ---
101
+
102
+ # {self.dataset_name}
103
+
104
+ {self.description or "Processed dataset generated with dagster-hf-datasets."}
105
+
106
+ ## Dataset Summary
107
+
108
+ {dataset_summary}
109
+
110
+ ## Source Dataset
111
+
112
+ - Dataset: `{self.source_dataset}`
113
+ - Revision: `{self.source_revision or "unknown"}`
114
+
115
+ ## Pipeline Semantics
116
+
117
+ {pipeline_semantics}
118
+
119
+ ## Processing Lineage
120
+
121
+ {processing_section}
122
+
123
+ ## Reproducibility
124
+
125
+ {reproducibility_section}
126
+
127
+ ## Metadata
128
+
129
+ {metadata_section}
130
+
131
+ ## Usage
132
+
133
+ {usage_section}
134
+
135
+ ## Provenance
136
+
137
+ - Generated at: `{generated_at}`
138
+ - Generated with: `dagster-hf-datasets`
139
+ - Lineage tracking enabled
140
+ """
141
+
142
+ def _build_yaml_frontmatter(
143
+ self,
144
+ ) -> str:
145
+ """
146
+ Build Hugging Face dataset card
147
+ YAML metadata block.
148
+ """
149
+
150
+ tags = [
151
+ "dagster",
152
+ "huggingface",
153
+ "datasets",
154
+ "lineage-tracking",
155
+ ]
156
+
157
+ if self.metadata.get(
158
+ "source_streaming",
159
+ False,
160
+ ):
161
+ tags.append("streaming-ingestion")
162
+
163
+ pipeline = self.metadata.get("pipeline")
164
+
165
+ if pipeline:
166
+ tags.append(str(pipeline))
167
+
168
+ tag_lines = "\n".join(f"- {tag}" for tag in tags)
169
+
170
+ return f"""language:
171
+ - en
172
+
173
+ tags:
174
+ {tag_lines}
175
+
176
+ source_datasets:
177
+ - {self.source_dataset}
178
+
179
+ generated_by:
180
+ - dagster-hf-datasets
181
+ """
182
+
183
+ def _build_dataset_summary(
184
+ self,
185
+ ) -> str:
186
+ """
187
+ Build dataset summary section.
188
+ """
189
+
190
+ lines: list[str] = []
191
+
192
+ for key in self.SUMMARY_KEYS:
193
+ value = self.metadata.get(key)
194
+
195
+ if value is None:
196
+ continue
197
+
198
+ if key == "features":
199
+ lines.append("- Features:")
200
+
201
+ if isinstance(value, list):
202
+ lines.extend(f" - `{feature}`" for feature in value)
203
+
204
+ elif isinstance(
205
+ value,
206
+ dict,
207
+ ):
208
+ for split, features in value.items():
209
+ lines.append(f" - {split}:")
210
+
211
+ for feature in features:
212
+ lines.append(f" - `{feature}`")
213
+
214
+ continue
215
+
216
+ lines.append(f"- {self._format_key(key)}: " f"`{value}`")
217
+
218
+ if not lines:
219
+ return "- No dataset summary available."
220
+
221
+ return "\n".join(lines)
222
+
223
+ def _build_pipeline_semantics(
224
+ self,
225
+ ) -> str:
226
+ """
227
+ Build pipeline semantics section.
228
+ """
229
+
230
+ lines: list[str] = []
231
+
232
+ source_streaming = self.metadata.get("source_streaming")
233
+
234
+ if source_streaming is not None:
235
+ lines.append(
236
+ "- Source Ingestion Mode: "
237
+ f"`{'streaming' if source_streaming else 'materialized'}`"
238
+ )
239
+
240
+ execution_mode = self.metadata.get("execution_mode")
241
+
242
+ if execution_mode:
243
+ lines.append("- Execution Mode: " f"`{execution_mode}`")
244
+
245
+ dataset_type = self.metadata.get("dataset_type")
246
+
247
+ if dataset_type:
248
+ lines.append("- Materialized Artifact Type: " f"`{dataset_type}`")
249
+
250
+ pipeline_mode = self.metadata.get("pipeline_mode")
251
+
252
+ if pipeline_mode:
253
+ lines.append("- Pipeline Mode: " f"`{pipeline_mode}`")
254
+
255
+ if not lines:
256
+ return "- No pipeline semantics available."
257
+
258
+ return "\n".join(lines)
259
+
260
+ def _build_processing_steps(
261
+ self,
262
+ ) -> str:
263
+ """
264
+ Build processing lineage section.
265
+ """
266
+
267
+ if not self.processing_steps:
268
+ return "- No processing steps recorded."
269
+
270
+ return "\n".join(f"- {step}" for step in self.processing_steps)
271
+
272
+ def _build_reproducibility_section(
273
+ self,
274
+ ) -> str:
275
+ """
276
+ Build reproducibility metadata.
277
+ """
278
+
279
+ lines: list[str] = []
280
+
281
+ lines.append(
282
+ f"- Source Dataset Revision: " f"`{self.source_revision or 'unknown'}`"
283
+ )
284
+
285
+ for key in self.REPRODUCIBILITY_KEYS:
286
+ value = self.metadata.get(key)
287
+
288
+ if value is None:
289
+ continue
290
+
291
+ lines.append(f"- {self._format_key(key)}: " f"`{value}`")
292
+
293
+ if not lines:
294
+ return "- No reproducibility metadata."
295
+
296
+ return "\n".join(lines)
297
+
298
+ def _build_metadata_section(
299
+ self,
300
+ ) -> str:
301
+ """
302
+ Build structured metadata summary.
303
+ """
304
+
305
+ if not self.metadata:
306
+ return "- No additional metadata."
307
+
308
+ sections = []
309
+
310
+ runtime_lines = []
311
+ pipeline_lines = []
312
+ hub_lines = []
313
+
314
+ for key, value in self.metadata.items():
315
+ if key in self.SUMMARY_KEYS:
316
+ continue
317
+
318
+ if key in (self.REPRODUCIBILITY_KEYS):
319
+ continue
320
+
321
+ rendered = self._render_metadata_line(
322
+ key,
323
+ value,
324
+ )
325
+
326
+ if key in self.HUB_METADATA_KEYS:
327
+ hub_lines.append(rendered)
328
+
329
+ elif key in (self.PIPELINE_METADATA_KEYS):
330
+ pipeline_lines.append(rendered)
331
+
332
+ else:
333
+ runtime_lines.append(rendered)
334
+
335
+ if runtime_lines:
336
+ sections.append("### Runtime Metadata\n\n" + "\n".join(runtime_lines))
337
+
338
+ if pipeline_lines:
339
+ sections.append("### Pipeline Metadata\n\n" + "\n".join(pipeline_lines))
340
+
341
+ if hub_lines:
342
+ sections.append("### Hub Metadata\n\n" + "\n".join(hub_lines))
343
+
344
+ return "\n\n".join(sections)
345
+
346
+ def _build_usage_section(
347
+ self,
348
+ ) -> str:
349
+ """
350
+ Build dataset usage examples.
351
+ """
352
+
353
+ return f"""```python
354
+ from datasets import load_dataset
355
+
356
+ dataset = load_dataset(
357
+ "{self.dataset_name}"
358
+ )
359
+ ```"""
360
+
361
+ def _render_metadata_line(
362
+ self,
363
+ key: str,
364
+ value: Any,
365
+ ) -> str:
366
+ """
367
+ Render metadata values safely.
368
+ """
369
+
370
+ formatted_key = self._format_key(key)
371
+
372
+ if isinstance(value, list):
373
+ rendered = "\n".join(f" - `{item}`" for item in value)
374
+
375
+ return f"- **{formatted_key}**:\n" f"{rendered}"
376
+
377
+ if isinstance(value, dict):
378
+ rendered = "\n".join(f" - **{k}**: `{v}`" for k, v in value.items())
379
+
380
+ return f"- **{formatted_key}**:\n" f"{rendered}"
381
+
382
+ return f"- **{formatted_key}**: " f"`{value}`"
383
+
384
+ @staticmethod
385
+ def _format_key(
386
+ key: str,
387
+ ) -> str:
388
+ """
389
+ Format metadata keys into
390
+ readable display names.
391
+ """
392
+
393
+ return key.replace("_", " ").strip().title()