faceberg 0.1.0.tar.gz → 0.1.1.tar.gz
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in the public registry.
- faceberg-0.1.1/PKG-INFO +147 -0
- faceberg-0.1.1/README.md +104 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/pyproject.toml +1 -1
- faceberg-0.1.0/PKG-INFO +0 -175
- faceberg-0.1.0/README.md +0 -132
- {faceberg-0.1.0 → faceberg-0.1.1}/.gitignore +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/LICENSE +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/__init__.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/bridge.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/catalog.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/cli.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/config.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/convert.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/pretty.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/server.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/shell.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/spaces/Dockerfile +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/spaces/README.md +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/spaces/landing.html +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/__init__.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/conftest.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_bridge.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_catalog.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_catalog_duckdb.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_catalog_pandas.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_cli.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_config.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_convert.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_pretty.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_server.py +0 -0
- {faceberg-0.1.0 → faceberg-0.1.1}/faceberg/tests/test_server_playwright.py +0 -0

faceberg-0.1.1/PKG-INFO ADDED
@@ -0,0 +1,147 @@
+Metadata-Version: 2.4
+Name: faceberg
+Version: 0.1.1
+Summary: Bridge HuggingFace datasets with Apache Iceberg
+Project-URL: Homepage, https://github.com/kszucs/faceberg
+Project-URL: Documentation, https://github.com/kszucs/faceberg
+Project-URL: Repository, https://github.com/kszucs/faceberg
+Author-email: Krisztian Szucs <kszucs@users.noreply.github.com>
+License: Apache-2.0
+License-File: LICENSE
+Keywords: data-lake,datasets,huggingface,iceberg
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: Apache Software License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Python: >=3.9
+Requires-Dist: click>=8.0.0
+Requires-Dist: datasets>=2.0.0
+Requires-Dist: fsspec>=2023.1.0
+Requires-Dist: huggingface-hub>=0.20.0
+Requires-Dist: jinja2>=3.1.6
+Requires-Dist: litestar>=2.0.0
+Requires-Dist: pyarrow>=21.0.0
+Requires-Dist: pyiceberg>=0.6.0
+Requires-Dist: pyyaml>=6.0
+Requires-Dist: rich>=13.0.0
+Requires-Dist: uuid-utils>=0.9.0
+Requires-Dist: uvicorn[standard]>=0.27.0
+Provides-Extra: dev
+Requires-Dist: black>=23.0.0; extra == 'dev'
+Requires-Dist: duckdb>=0.10.0; extra == 'dev'
+Requires-Dist: mypy>=1.0.0; extra == 'dev'
+Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
+Requires-Dist: pytest-playwright>=0.7.0; extra == 'dev'
+Requires-Dist: pytest>=7.0.0; extra == 'dev'
+Requires-Dist: requests>=2.31.0; extra == 'dev'
+Requires-Dist: ruff>=0.1.0; extra == 'dev'
+Description-Content-Type: text/markdown
+
+
+
+# Faceberg
+
+**Bridge HuggingFace datasets with Apache Iceberg tables — no data copying, just metadata.**
+
+Faceberg maps HuggingFace datasets to Apache Iceberg tables. Your catalog metadata lives on HuggingFace Spaces with an auto-deployed REST API, and any Iceberg-compatible query engine can access the data.
+
+## Installation
+
+```bash
+pip install faceberg
+```
+
+## Quick Start
+
+```bash
+export HF_TOKEN=your_huggingface_token
+
+# Create a catalog on HuggingFace Hub
+faceberg user/mycatalog init
+
+# Add datasets
+faceberg user/mycatalog add stanfordnlp/imdb --config plain_text
+faceberg user/mycatalog add openai/gsm8k --config main
+
+# Query with interactive DuckDB shell
+faceberg user/mycatalog quack
+```
+
+```sql
+SELECT label, substr(text, 1, 100) as preview
+FROM iceberg_catalog.stanfordnlp.imdb
+LIMIT 10;
+```
+
+## How It Works
+
+```
+                      HuggingFace Hub
+┌─────────────────────────────────────────────────────────┐
+│                                                         │
+│  ┌─────────────────────┐    ┌─────────────────────────┐ │
+│  │   HF Datasets       │    │  HF Spaces (Catalog)    │ │
+│  │   (Original Parquet)│◄───│  • Iceberg metadata     │ │
+│  │                     │    │  • REST API endpoint    │ │
+│  │ stanfordnlp/imdb/   │    │  • faceberg.yml         │ │
+│  │   └── *.parquet     │    │                         │ │
+│  └─────────────────────┘    └───────────┬─────────────┘ │
+│                                         │               │
+└─────────────────────────────────────────┼───────────────┘
+                                          │  Iceberg REST API
+                                          ▼
+                             ┌─────────────────────────┐
+                             │     Query Engines       │
+                             │  DuckDB, Pandas, Spark  │
+                             └─────────────────────────┘
+```
+
+**No data is copied** — only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.
+
+## Python API
+
+```python
+import os
+from faceberg import catalog
+
+cat = catalog("user/mycatalog", hf_token=os.environ.get("HF_TOKEN"))
+table = cat.load_table("stanfordnlp.imdb")
+df = table.scan(limit=100).to_pandas()
+```
+
+## Share Your Catalog
+
+Your catalog is accessible to anyone via the REST API:
+
+```python
+import duckdb
+
+conn = duckdb.connect()
+conn.execute("INSTALL iceberg; LOAD iceberg")
+conn.execute("ATTACH 'https://user-mycatalog.hf.space' AS cat (TYPE ICEBERG)")
+
+result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
+```
+
+## Documentation
+
+- [Getting Started](docs/index.qmd) — Full quickstart guide
+- [Local Catalogs](docs/local.qmd) — Use local catalogs for development
+- [DuckDB Integration](docs/integrations/duckdb.qmd) — Advanced SQL queries
+- [Pandas Integration](docs/integrations/pandas.qmd) — Load into DataFrames
+
+## Development
+
+```bash
+git clone https://github.com/kszucs/faceberg
+cd faceberg
+pip install -e .
+```
+
+## License
+
+Apache 2.0

faceberg-0.1.1/README.md ADDED
@@ -0,0 +1,104 @@
+
+
+# Faceberg
+
+**Bridge HuggingFace datasets with Apache Iceberg tables — no data copying, just metadata.**
+
+Faceberg maps HuggingFace datasets to Apache Iceberg tables. Your catalog metadata lives on HuggingFace Spaces with an auto-deployed REST API, and any Iceberg-compatible query engine can access the data.
+
+## Installation
+
+```bash
+pip install faceberg
+```
+
+## Quick Start
+
+```bash
+export HF_TOKEN=your_huggingface_token
+
+# Create a catalog on HuggingFace Hub
+faceberg user/mycatalog init
+
+# Add datasets
+faceberg user/mycatalog add stanfordnlp/imdb --config plain_text
+faceberg user/mycatalog add openai/gsm8k --config main
+
+# Query with interactive DuckDB shell
+faceberg user/mycatalog quack
+```
+
+```sql
+SELECT label, substr(text, 1, 100) as preview
+FROM iceberg_catalog.stanfordnlp.imdb
+LIMIT 10;
+```
+
+## How It Works
+
+```
+                      HuggingFace Hub
+┌─────────────────────────────────────────────────────────┐
+│                                                         │
+│  ┌─────────────────────┐    ┌─────────────────────────┐ │
+│  │   HF Datasets       │    │  HF Spaces (Catalog)    │ │
+│  │   (Original Parquet)│◄───│  • Iceberg metadata     │ │
+│  │                     │    │  • REST API endpoint    │ │
+│  │ stanfordnlp/imdb/   │    │  • faceberg.yml         │ │
+│  │   └── *.parquet     │    │                         │ │
+│  └─────────────────────┘    └───────────┬─────────────┘ │
+│                                         │               │
+└─────────────────────────────────────────┼───────────────┘
+                                          │  Iceberg REST API
+                                          ▼
+                             ┌─────────────────────────┐
+                             │     Query Engines       │
+                             │  DuckDB, Pandas, Spark  │
+                             └─────────────────────────┘
+```
+
+**No data is copied** — only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.
+
+## Python API
+
+```python
+import os
+from faceberg import catalog
+
+cat = catalog("user/mycatalog", hf_token=os.environ.get("HF_TOKEN"))
+table = cat.load_table("stanfordnlp.imdb")
+df = table.scan(limit=100).to_pandas()
+```
+
+## Share Your Catalog
+
+Your catalog is accessible to anyone via the REST API:
+
+```python
+import duckdb
+
+conn = duckdb.connect()
+conn.execute("INSTALL iceberg; LOAD iceberg")
+conn.execute("ATTACH 'https://user-mycatalog.hf.space' AS cat (TYPE ICEBERG)")
+
+result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
+```
+
+## Documentation
+
+- [Getting Started](docs/index.qmd) — Full quickstart guide
+- [Local Catalogs](docs/local.qmd) — Use local catalogs for development
+- [DuckDB Integration](docs/integrations/duckdb.qmd) — Advanced SQL queries
+- [Pandas Integration](docs/integrations/pandas.qmd) — Load into DataFrames
+
+## Development
+
+```bash
+git clone https://github.com/kszucs/faceberg
+cd faceberg
+pip install -e .
+```
+
+## License
+
+Apache 2.0

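The 0.1.1 README above attaches DuckDB to the catalog's REST endpoint and uses faceberg's own `catalog()` helper; it names PyIceberg among the compatible engines but does not show it. A minimal sketch of that route, assuming the Space at `https://user-mycatalog.hf.space` exposes a standard Iceberg REST catalog and accepts anonymous reads (both are assumptions, not something this diff confirms):

```python
# Sketch only: connect to a Faceberg catalog through PyIceberg's REST catalog
# support. The Space URL and table identifier are copied from the README
# examples above; anonymous access is an assumption, and a private catalog
# would need credentials configured as well.
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "faceberg",
    **{
        "type": "rest",
        "uri": "https://user-mycatalog.hf.space",
    },
)

table = catalog.load_table("stanfordnlp.imdb")  # namespace.table, as in the README
df = table.scan(limit=5).to_pandas()            # limited scan, like the README's example
print(df.head())
```

Any engine that speaks the Iceberg REST protocol should connect the same way; only metadata lives in the catalog, and scans read the original Parquet files from the Hub.
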
faceberg-0.1.0/PKG-INFO DELETED
@@ -1,175 +0,0 @@
-Metadata-Version: 2.4
-Name: faceberg
-Version: 0.1.0
-Summary: Bridge HuggingFace datasets with Apache Iceberg
-Project-URL: Homepage, https://github.com/kszucs/faceberg
-Project-URL: Documentation, https://github.com/kszucs/faceberg
-Project-URL: Repository, https://github.com/kszucs/faceberg
-Author-email: Krisztian Szucs <kszucs@users.noreply.github.com>
-License: Apache-2.0
-License-File: LICENSE
-Keywords: data-lake,datasets,huggingface,iceberg
-Classifier: Development Status :: 3 - Alpha
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: Apache Software License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Requires-Python: >=3.9
-Requires-Dist: click>=8.0.0
-Requires-Dist: datasets>=2.0.0
-Requires-Dist: fsspec>=2023.1.0
-Requires-Dist: huggingface-hub>=0.20.0
-Requires-Dist: jinja2>=3.1.6
-Requires-Dist: litestar>=2.0.0
-Requires-Dist: pyarrow>=21.0.0
-Requires-Dist: pyiceberg>=0.6.0
-Requires-Dist: pyyaml>=6.0
-Requires-Dist: rich>=13.0.0
-Requires-Dist: uuid-utils>=0.9.0
-Requires-Dist: uvicorn[standard]>=0.27.0
-Provides-Extra: dev
-Requires-Dist: black>=23.0.0; extra == 'dev'
-Requires-Dist: duckdb>=0.10.0; extra == 'dev'
-Requires-Dist: mypy>=1.0.0; extra == 'dev'
-Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
-Requires-Dist: pytest-playwright>=0.7.0; extra == 'dev'
-Requires-Dist: pytest>=7.0.0; extra == 'dev'
-Requires-Dist: requests>=2.31.0; extra == 'dev'
-Requires-Dist: ruff>=0.1.0; extra == 'dev'
-Description-Content-Type: text/markdown
-
-
-
-# Faceberg
-
-Bridge HuggingFace datasets with Apache Iceberg tables.
-
-## Installation
-
-```bash
-pip install faceberg
-```
-
-## Quick Start
-
-```bash
-# Create a catalog and add a dataset
-faceberg mycatalog init
-faceberg mycatalog add stanfordnlp/imdb --config plain_text
-faceberg mycatalog sync
-
-# Query the data
-faceberg mycatalog scan default.imdb --limit 5
-```
-
-**Python API:**
-
-```python
-from faceberg import catalog
-
-cat = catalog("mycatalog")
-table = cat.load_table("default.imdb")
-df = table.scan().to_pandas()
-print(df.head())
-```
-
-**Documentation:**
-- [Getting Started](docs/index.qmd) - Quickstart guide
-- [Local Catalogs](docs/local.qmd) - Use local catalogs for testing
-- [DuckDB Integration](docs/integrations/duckdb.qmd) - Query with SQL
-- [Pandas Integration](docs/integrations/pandas.qmd) - Load into DataFrames
-
-## How It Works
-
-Faceberg creates lightweight Iceberg metadata that points to original HuggingFace dataset files:
-
-```
-HuggingFace Dataset         Your Catalog
-┌─────────────────┐         ┌──────────────────┐
-│ org/dataset     │         │ mycatalog/       │
-│ ├── train.pq ◄──┼─────────┼─ default/        │
-│ └── test.pq  ◄──┼─────────┼─   └── imdb/     │
-└─────────────────┘         │       └── metadata/
-                            └──────────────────┘
-```
-
-No data is copied—only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.
-
-## Usage
-
-### CLI Commands
-
-```bash
-# Initialize catalog
-faceberg mycatalog init
-
-# Add datasets
-faceberg mycatalog add openai/gsm8k --config main
-
-# Sync datasets (creates Iceberg metadata)
-faceberg mycatalog sync
-
-# List tables
-faceberg mycatalog list
-
-# Show table info
-faceberg mycatalog info default.gsm8k
-
-# Scan data
-faceberg mycatalog scan default.gsm8k --limit 10
-
-# Start REST server
-faceberg mycatalog serve --port 8181
-```
-
-### Remote Catalogs on HuggingFace Hub
-
-```bash
-# Initialize remote catalog
-export HF_TOKEN=your_token
-faceberg org/catalog-repo init
-
-# Add and sync datasets
-faceberg org/catalog-repo add deepmind/code_contests --config default
-faceberg org/catalog-repo sync
-
-# Serve remote catalog
-faceberg org/catalog-repo serve
-```
-
-### Query with DuckDB
-
-```python
-import duckdb
-
-conn = duckdb.connect()
-conn.execute("INSTALL httpfs; LOAD httpfs")
-conn.execute("INSTALL iceberg; LOAD iceberg")
-
-# Query local catalog
-result = conn.execute("""
-    SELECT * FROM iceberg_scan('mycatalog/default/imdb/metadata/v1.metadata.json')
-    LIMIT 10
-""").fetchall()
-
-# Query remote catalog
-result = conn.execute("""
-    SELECT * FROM iceberg_scan('hf://datasets/org/catalog/default/table/metadata/v1.metadata.json')
-    LIMIT 10
-""").fetchall()
-```
-
-## Development
-
-```bash
-git clone https://github.com/kszucs/faceberg
-cd faceberg
-pip install -e .
-```
-
-## License
-
-Apache 2.0

faceberg-0.1.0/README.md DELETED
@@ -1,132 +0,0 @@
-
-
-# Faceberg
-
-Bridge HuggingFace datasets with Apache Iceberg tables.
-
-## Installation
-
-```bash
-pip install faceberg
-```
-
-## Quick Start
-
-```bash
-# Create a catalog and add a dataset
-faceberg mycatalog init
-faceberg mycatalog add stanfordnlp/imdb --config plain_text
-faceberg mycatalog sync
-
-# Query the data
-faceberg mycatalog scan default.imdb --limit 5
-```
-
-**Python API:**
-
-```python
-from faceberg import catalog
-
-cat = catalog("mycatalog")
-table = cat.load_table("default.imdb")
-df = table.scan().to_pandas()
-print(df.head())
-```
-
-**Documentation:**
-- [Getting Started](docs/index.qmd) - Quickstart guide
-- [Local Catalogs](docs/local.qmd) - Use local catalogs for testing
-- [DuckDB Integration](docs/integrations/duckdb.qmd) - Query with SQL
-- [Pandas Integration](docs/integrations/pandas.qmd) - Load into DataFrames
-
-## How It Works
-
-Faceberg creates lightweight Iceberg metadata that points to original HuggingFace dataset files:
-
-```
-HuggingFace Dataset         Your Catalog
-┌─────────────────┐         ┌──────────────────┐
-│ org/dataset     │         │ mycatalog/       │
-│ ├── train.pq ◄──┼─────────┼─ default/        │
-│ └── test.pq  ◄──┼─────────┼─   └── imdb/     │
-└─────────────────┘         │       └── metadata/
-                            └──────────────────┘
-```
-
-No data is copied—only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.
-
-## Usage
-
-### CLI Commands
-
-```bash
-# Initialize catalog
-faceberg mycatalog init
-
-# Add datasets
-faceberg mycatalog add openai/gsm8k --config main
-
-# Sync datasets (creates Iceberg metadata)
-faceberg mycatalog sync
-
-# List tables
-faceberg mycatalog list
-
-# Show table info
-faceberg mycatalog info default.gsm8k
-
-# Scan data
-faceberg mycatalog scan default.gsm8k --limit 10
-
-# Start REST server
-faceberg mycatalog serve --port 8181
-```
-
-### Remote Catalogs on HuggingFace Hub
-
-```bash
-# Initialize remote catalog
-export HF_TOKEN=your_token
-faceberg org/catalog-repo init
-
-# Add and sync datasets
-faceberg org/catalog-repo add deepmind/code_contests --config default
-faceberg org/catalog-repo sync
-
-# Serve remote catalog
-faceberg org/catalog-repo serve
-```
-
-### Query with DuckDB
-
-```python
-import duckdb
-
-conn = duckdb.connect()
-conn.execute("INSTALL httpfs; LOAD httpfs")
-conn.execute("INSTALL iceberg; LOAD iceberg")
-
-# Query local catalog
-result = conn.execute("""
-    SELECT * FROM iceberg_scan('mycatalog/default/imdb/metadata/v1.metadata.json')
-    LIMIT 10
-""").fetchall()
-
-# Query remote catalog
-result = conn.execute("""
-    SELECT * FROM iceberg_scan('hf://datasets/org/catalog/default/table/metadata/v1.metadata.json')
-    LIMIT 10
-""").fetchall()
-```
-
-## Development
-
-```bash
-git clone https://github.com/kszucs/faceberg
-cd faceberg
-pip install -e .
-```
-
-## License
-
-Apache 2.0