faceberg 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- faceberg/_version.py +34 -0
- faceberg/catalog.py +92 -76
- faceberg/discover.py +181 -0
- faceberg/iceberg.py +707 -0
- faceberg/tests/test_catalog.py +1 -2
- faceberg/tests/test_discover.py +257 -0
- faceberg/tests/test_iceberg.py +911 -0
- faceberg-0.1.2.dist-info/METADATA +149 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/RECORD +12 -11
- faceberg/bridge.py +0 -586
- faceberg/convert.py +0 -813
- faceberg/tests/test_bridge.py +0 -825
- faceberg/tests/test_convert.py +0 -422
- faceberg-0.1.0.dist-info/METADATA +0 -175
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/WHEEL +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/entry_points.txt +0 -0
- {faceberg-0.1.0.dist-info → faceberg-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: faceberg
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Bridge HuggingFace datasets with Apache Iceberg
|
|
5
|
+
Project-URL: Homepage, https://github.com/kszucs/faceberg
|
|
6
|
+
Project-URL: Documentation, https://github.com/kszucs/faceberg
|
|
7
|
+
Project-URL: Repository, https://github.com/kszucs/faceberg
|
|
8
|
+
Author-email: Krisztian Szucs <kszucs@users.noreply.github.com>
|
|
9
|
+
License: Apache-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: data-lake,datasets,huggingface,iceberg
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Requires-Dist: click>=8.0.0
|
|
22
|
+
Requires-Dist: datasets>=2.0.0
|
|
23
|
+
Requires-Dist: fsspec>=2023.1.0
|
|
24
|
+
Requires-Dist: huggingface-hub>=0.20.0
|
|
25
|
+
Requires-Dist: jinja2>=3.1.6
|
|
26
|
+
Requires-Dist: litestar>=2.0.0
|
|
27
|
+
Requires-Dist: pyarrow>=21.0.0
|
|
28
|
+
Requires-Dist: pyiceberg>=0.10.0
|
|
29
|
+
Requires-Dist: pyyaml>=6.0
|
|
30
|
+
Requires-Dist: rich>=13.0.0
|
|
31
|
+
Requires-Dist: uuid-utils>=0.9.0
|
|
32
|
+
Requires-Dist: uvicorn[standard]>=0.27.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: black>=23.0.0; extra == 'dev'
|
|
35
|
+
Requires-Dist: duckdb>=0.10.0; extra == 'dev'
|
|
36
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest-playwright>=0.7.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: requests>=2.31.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
|
|
44
|
+

|
|
45
|
+
|
|
46
|
+
# Faceberg
|
|
47
|
+
|
|
48
|
+
**Bridge HuggingFace datasets with Apache Iceberg tables — no data copying, just metadata.**
|
|
49
|
+
|
|
50
|
+
Faceberg maps HuggingFace datasets to Apache Iceberg tables. Your catalog metadata lives on HuggingFace Spaces with an auto-deployed REST API, and any Iceberg-compatible query engine can access the data.
|
|
51
|
+
|
|
52
|
+
## Installation
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install faceberg
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Quick Start
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
export HF_TOKEN=your_huggingface_token
|
|
62
|
+
|
|
63
|
+
# Create a catalog on HuggingFace Hub
|
|
64
|
+
faceberg user/mycatalog init
|
|
65
|
+
|
|
66
|
+
# Add datasets
|
|
67
|
+
faceberg user/mycatalog add stanfordnlp/imdb --config plain_text
|
|
68
|
+
faceberg user/mycatalog add openai/gsm8k --config main
|
|
69
|
+
|
|
70
|
+
# Query with interactive DuckDB shell
|
|
71
|
+
faceberg user/mycatalog quack
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
```sql
|
|
75
|
+
SELECT label, substr(text, 1, 100) as preview
|
|
76
|
+
FROM iceberg_catalog.stanfordnlp.imdb
|
|
77
|
+
LIMIT 10;
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## How It Works
|
|
81
|
+
|
|
82
|
+
```
|
|
83
|
+
HuggingFace Hub
|
|
84
|
+
┌─────────────────────────────────────────────────────────┐
|
|
85
|
+
│                                                         │
|
|
86
|
+
│  ┌─────────────────────┐    ┌─────────────────────────┐ │
|
|
87
|
+
│  │  HF Datasets        │    │  HF Spaces (Catalog)    │ │
|
|
88
|
+
│  │  (Original Parquet) │◄───│  • Iceberg metadata     │ │
|
|
89
|
+
│  │                     │    │  • REST API endpoint    │ │
|
|
90
|
+
│  │  stanfordnlp/imdb/  │    │  • faceberg.yml         │ │
|
|
91
|
+
│  │   └── *.parquet     │    │                         │ │
|
|
92
|
+
│  └─────────────────────┘    └───────────┬─────────────┘ │
|
|
93
|
+
│                                         │               │
|
|
94
|
+
└─────────────────────────────────────────┼───────────────┘
|
|
95
|
+
                                          │ Iceberg REST API
|
|
96
|
+
                                          ▼
|
|
97
|
+
                              ┌─────────────────────────┐
|
|
98
|
+
                              │      Query Engines      │
|
|
99
|
+
                              │  DuckDB, Pandas, Spark  │
|
|
100
|
+
                              └─────────────────────────┘
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
**No data is copied** — only metadata is created. Query with DuckDB, PyIceberg, Spark, or any Iceberg-compatible tool.
|
|
104
|
+
|
|
105
|
+
## Python API
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
import os
|
|
109
|
+
from faceberg import catalog
|
|
110
|
+
|
|
111
|
+
cat = catalog("user/mycatalog", hf_token=os.environ.get("HF_TOKEN"))
|
|
112
|
+
table = cat.load_table("stanfordnlp.imdb")
|
|
113
|
+
df = table.scan(limit=100).to_pandas()
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Share Your Catalog
|
|
117
|
+
|
|
118
|
+
Your catalog is accessible to anyone via the REST API:
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
import duckdb
|
|
122
|
+
|
|
123
|
+
conn = duckdb.connect()
|
|
124
|
+
conn.execute("INSTALL iceberg; LOAD iceberg")
|
|
125
|
+
conn.execute("ATTACH 'https://user-mycatalog.hf.space' AS cat (TYPE ICEBERG)")
|
|
126
|
+
|
|
127
|
+
result = conn.execute("SELECT * FROM cat.stanfordnlp.imdb LIMIT 5").fetchdf()
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
## Documentation
|
|
131
|
+
|
|
132
|
+
**[Read the docs →](https://faceberg.kszucs.dev/)**
|
|
133
|
+
|
|
134
|
+
- [Getting Started](https://faceberg.kszucs.dev/) — Full quickstart guide
|
|
135
|
+
- [Local Catalogs](https://faceberg.kszucs.dev/local.html) — Use local catalogs for development
|
|
136
|
+
- [DuckDB Integration](https://faceberg.kszucs.dev/integrations/duckdb.html) — Advanced SQL queries
|
|
137
|
+
- [Pandas Integration](https://faceberg.kszucs.dev/integrations/pandas.html) — Load into DataFrames
|
|
138
|
+
|
|
139
|
+
## Development
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
git clone https://github.com/kszucs/faceberg
|
|
143
|
+
cd faceberg
|
|
144
|
+
pip install -e .
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## License
|
|
148
|
+
|
|
149
|
+
Apache 2.0
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
faceberg/__init__.py,sha256=F3fztSclzDN7_ItHopPIChWb4YwnEs5DKZ_ckslTebY,354
|
|
2
|
-
faceberg/
|
|
3
|
-
faceberg/catalog.py,sha256=
|
|
2
|
+
faceberg/_version.py,sha256=Ok5oAXdWgR9aghaFXTafTeDW6sYO3uVe6d2Nket57R4,704
|
|
3
|
+
faceberg/catalog.py,sha256=Jr_tOrPn7967VJvSRUIHLVjdwVAdOyFre6mldyfhxvk,55283
|
|
4
4
|
faceberg/cli.py,sha256=BNwv8kZ__3c8vLSOPvWLKSsrxvNsMNDfPlkMyDHLsLk,17123
|
|
5
5
|
faceberg/config.py,sha256=SjEfipfT38trjgmTA1abpY--5AOnYlt_QM8KihDUxJY,5912
|
|
6
|
-
faceberg/
|
|
6
|
+
faceberg/discover.py,sha256=HyENObguao6bwF1eFmLd1-eo8FT9_FdaVZ3-8EQ_0XU,5575
|
|
7
|
+
faceberg/iceberg.py,sha256=_ZLQGnlCKmHoVEYVpKOMVnjufja7o-2PqJc3JI116BY,27520
|
|
7
8
|
faceberg/pretty.py,sha256=PUmQbv0HJDU5Q-psR4gTT1ON62NoIYWzS2JJy5_o-pY,6806
|
|
8
9
|
faceberg/server.py,sha256=tc_ULXyQy-5KEtLVyETjecNQajoaFGCz_aw_-rzzyOY,15369
|
|
9
10
|
faceberg/shell.py,sha256=wa5r06VhrAq0I4gc3Qrl8HvSY9we0wBFxtPp6QPL2dk,2320
|
|
@@ -12,18 +13,18 @@ faceberg/spaces/README.md,sha256=mM7M7_MKI_673DCR1HpRVvb6sYK6ohtDYos5ykCNY2s,258
|
|
|
12
13
|
faceberg/spaces/landing.html,sha256=I1Oadpg58VTuWbQBSuqqA1-g1dZOKP5vCXWrxxrEV5k,27333
|
|
13
14
|
faceberg/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
15
|
faceberg/tests/conftest.py,sha256=rwvBIlMDhD1USNKrt65Qn5PYefX0TTyBtIPDTtaetYA,7547
|
|
15
|
-
faceberg/tests/
|
|
16
|
-
faceberg/tests/test_catalog.py,sha256=CBhFnyASQTH1sqFH_Z_RFD74tzmzi7lvXQaSKnXA1eI,49948
|
|
16
|
+
faceberg/tests/test_catalog.py,sha256=YFrSOR4ZNZ3WifFFmLDnD5evJXN2PC-4uUnVzc2b22A,49939
|
|
17
17
|
faceberg/tests/test_catalog_duckdb.py,sha256=9-nz40sbM7xmYPHa78Y5_Ag6hrpu6o--zCyUc9Fqu4w,10392
|
|
18
18
|
faceberg/tests/test_catalog_pandas.py,sha256=jOnRb_pzsTUcgaQJsm49zbw2zaxUFRvjGq-Ub_q-vBs,9854
|
|
19
19
|
faceberg/tests/test_cli.py,sha256=CTsF9SPSpU5PHdBGOQ-GAZjXNbgPy4TdNDJj0Gm6uAM,1828
|
|
20
20
|
faceberg/tests/test_config.py,sha256=gg7xaOIhTCpPryw8SpcYgPJSoEWvy3CKRp3cEg0EF-4,12040
|
|
21
|
-
faceberg/tests/
|
|
21
|
+
faceberg/tests/test_discover.py,sha256=0xDjf_E-SNdd2ysvj8L-xTN2eFm31tRBFPxk04lfDCQ,9076
|
|
22
|
+
faceberg/tests/test_iceberg.py,sha256=t_nQhQ_Jkpj_NGIE3vf4Uhzu7rJ8eqbKRoiUEXzTp4g,31112
|
|
22
23
|
faceberg/tests/test_pretty.py,sha256=o70FhUTdqWtXQI6AKVuOxtPghFvD5OV8UnG_64wSqnY,11520
|
|
23
24
|
faceberg/tests/test_server.py,sha256=MWwRCloR3R-YWOzn95eNAv8cR3gGBg9dk65skz5eLdY,13405
|
|
24
25
|
faceberg/tests/test_server_playwright.py,sha256=-Mlt53f053fvXzFHuYMCoNpC7Dc4Q4XOHWXKaAGcHpE,17555
|
|
25
|
-
faceberg-0.1.
|
|
26
|
-
faceberg-0.1.
|
|
27
|
-
faceberg-0.1.
|
|
28
|
-
faceberg-0.1.
|
|
29
|
-
faceberg-0.1.
|
|
26
|
+
faceberg-0.1.2.dist-info/METADATA,sha256=YFj2mRCf3rbIYpRlsdZc7XSFJOh8zEg-2KX1e--yJfM,5532
|
|
27
|
+
faceberg-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
28
|
+
faceberg-0.1.2.dist-info/entry_points.txt,sha256=FPgDHoQRBWU0Wp_i2s4_T6dOqzKovH17g4O4oYRtKcI,47
|
|
29
|
+
faceberg-0.1.2.dist-info/licenses/LICENSE,sha256=DLb11Qr5b1cU8I9DJ9Sl9vNU3m_yqyMnmKjR15tNbt0,11345
|
|
30
|
+
faceberg-0.1.2.dist-info/RECORD,,
|