earthcode 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- earthcode-0.1.0/.gitattributes +2 -0
- earthcode-0.1.0/.github/workflows/ci.yml +49 -0
- earthcode-0.1.0/.github/workflows/pages.yml +77 -0
- earthcode-0.1.0/.github/workflows/publish.yml +128 -0
- earthcode-0.1.0/.gitignore +11 -0
- earthcode-0.1.0/LICENSE +21 -0
- earthcode-0.1.0/PKG-INFO +70 -0
- earthcode-0.1.0/README.md +23 -0
- earthcode-0.1.0/_config.yml +20 -0
- earthcode-0.1.0/_toc.yml +20 -0
- earthcode-0.1.0/cli/generate_embeddings.py +159 -0
- earthcode-0.1.0/dev.ipynb +2034 -0
- earthcode-0.1.0/earthcode/__init__.py +0 -0
- earthcode-0.1.0/earthcode/fairtool.py +577 -0
- earthcode-0.1.0/earthcode/git_add.py +383 -0
- earthcode-0.1.0/earthcode/gitclerk_add.py +21 -0
- earthcode-0.1.0/earthcode/metadata_input_definitions.py +338 -0
- earthcode-0.1.0/earthcode/search.py +209 -0
- earthcode-0.1.0/earthcode/static.py +569 -0
- earthcode-0.1.0/earthcode/validator.py +605 -0
- earthcode-0.1.0/examples/contribute_via_osc_editor.ipynb +2004 -0
- earthcode-0.1.0/examples/contribute_via_pr_osc.ipynb +2279 -0
- earthcode-0.1.0/examples/earthcode_data_discovery.ipynb +8078 -0
- earthcode-0.1.0/examples/earthcode_publishing_guide.ipynb +852 -0
- earthcode-0.1.0/examples/example_create_osc_entries.ipynb +395 -0
- earthcode-0.1.0/examples/glambie_notebook_osc.ipynb +5033 -0
- earthcode-0.1.0/guide/0.Prerequisites.ipynb +97 -0
- earthcode-0.1.0/guide/1.Project.ipynb +184 -0
- earthcode-0.1.0/guide/2.0.Product.ipynb +231 -0
- earthcode-0.1.0/guide/2.1.Product_files_PRR.ipynb +144 -0
- earthcode-0.1.0/guide/2.1.Product_files_self_hosted.ipynb +163 -0
- earthcode-0.1.0/guide/3.Workflow.ipynb +152 -0
- earthcode-0.1.0/guide/4.Experiment.ipynb +166 -0
- earthcode-0.1.0/pixi.lock +13096 -0
- earthcode-0.1.0/pixi.toml +43 -0
- earthcode-0.1.0/pyproject.toml +70 -0
- earthcode-0.1.0/tests/test_creation.py +535 -0
- earthcode-0.1.0/tests/test_fairtool.py +95 -0
- earthcode-0.1.0/tests/test_notebooks.py +227 -0
- earthcode-0.1.0/tests/test_search.py +52 -0
- earthcode-0.1.0/tests/test_validation.py +275 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
pull_request:
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
concurrency:
|
|
12
|
+
group: ci-${{ github.workflow }}-${{ github.ref }}
|
|
13
|
+
cancel-in-progress: true
|
|
14
|
+
|
|
15
|
+
defaults:
|
|
16
|
+
run:
|
|
17
|
+
shell: bash
|
|
18
|
+
working-directory: earthcode-library
|
|
19
|
+
|
|
20
|
+
jobs:
|
|
21
|
+
tests:
|
|
22
|
+
name: Tests
|
|
23
|
+
runs-on: ubuntu-latest
|
|
24
|
+
timeout-minutes: 45
|
|
25
|
+
|
|
26
|
+
steps:
|
|
27
|
+
- name: Checkout earthcode-library
|
|
28
|
+
uses: actions/checkout@v6
|
|
29
|
+
with:
|
|
30
|
+
path: earthcode-library
|
|
31
|
+
|
|
32
|
+
- name: Checkout open-science-catalog-metadata
|
|
33
|
+
uses: actions/checkout@v6
|
|
34
|
+
with:
|
|
35
|
+
repository: ESA-EarthCODE/open-science-catalog-metadata
|
|
36
|
+
path: open-science-catalog-metadata
|
|
37
|
+
|
|
38
|
+
- name: Set up Pixi
|
|
39
|
+
uses: prefix-dev/setup-pixi@v0.9.4
|
|
40
|
+
with:
|
|
41
|
+
pixi-version: v0.59.0
|
|
42
|
+
manifest-path: earthcode-library/pixi.toml
|
|
43
|
+
environments: default
|
|
44
|
+
locked: true
|
|
45
|
+
cache: true
|
|
46
|
+
cache-write: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
|
|
47
|
+
|
|
48
|
+
- name: Run test suite
|
|
49
|
+
run: pixi run -e default pytest
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
name: Docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
paths:
|
|
8
|
+
- README.md
|
|
9
|
+
- guide/**
|
|
10
|
+
- examples/**
|
|
11
|
+
- _config.yml
|
|
12
|
+
- _toc.yml
|
|
13
|
+
- pixi.toml
|
|
14
|
+
- pixi.lock
|
|
15
|
+
- pyproject.toml
|
|
16
|
+
- .github/workflows/pages.yml
|
|
17
|
+
pull_request:
|
|
18
|
+
paths:
|
|
19
|
+
- README.md
|
|
20
|
+
- guide/**
|
|
21
|
+
- examples/**
|
|
22
|
+
- _config.yml
|
|
23
|
+
- _toc.yml
|
|
24
|
+
- pixi.toml
|
|
25
|
+
- pixi.lock
|
|
26
|
+
- pyproject.toml
|
|
27
|
+
- .github/workflows/pages.yml
|
|
28
|
+
workflow_dispatch:
|
|
29
|
+
|
|
30
|
+
permissions:
|
|
31
|
+
contents: read
|
|
32
|
+
pages: write
|
|
33
|
+
id-token: write
|
|
34
|
+
|
|
35
|
+
concurrency:
|
|
36
|
+
group: pages
|
|
37
|
+
cancel-in-progress: true
|
|
38
|
+
|
|
39
|
+
jobs:
|
|
40
|
+
build:
|
|
41
|
+
name: Build Docs
|
|
42
|
+
runs-on: ubuntu-latest
|
|
43
|
+
|
|
44
|
+
steps:
|
|
45
|
+
- name: Checkout repository
|
|
46
|
+
uses: actions/checkout@v6
|
|
47
|
+
|
|
48
|
+
- name: Set up Python
|
|
49
|
+
uses: actions/setup-python@v5
|
|
50
|
+
with:
|
|
51
|
+
python-version: "3.12"
|
|
52
|
+
cache: pip
|
|
53
|
+
|
|
54
|
+
- name: Install Jupyter Book
|
|
55
|
+
run: python -m pip install --upgrade "jupyter-book>=1.0.4,<2"
|
|
56
|
+
|
|
57
|
+
- name: Build site
|
|
58
|
+
run: jupyter-book build .
|
|
59
|
+
|
|
60
|
+
- name: Upload Pages artifact
|
|
61
|
+
uses: actions/upload-pages-artifact@v3
|
|
62
|
+
with:
|
|
63
|
+
path: _build/html
|
|
64
|
+
|
|
65
|
+
deploy:
|
|
66
|
+
name: Deploy Docs
|
|
67
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
68
|
+
needs: build
|
|
69
|
+
runs-on: ubuntu-latest
|
|
70
|
+
environment:
|
|
71
|
+
name: github-pages
|
|
72
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
73
|
+
|
|
74
|
+
steps:
|
|
75
|
+
- name: Deploy to GitHub Pages
|
|
76
|
+
id: deployment
|
|
77
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
# name: Publish
|
|
2
|
+
|
|
3
|
+
# on:
|
|
4
|
+
# push:
|
|
5
|
+
# tags:
|
|
6
|
+
# - "v*"
|
|
7
|
+
# workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
# permissions:
|
|
10
|
+
# contents: read
|
|
11
|
+
|
|
12
|
+
# concurrency:
|
|
13
|
+
# group: publish-${{ github.ref }}
|
|
14
|
+
# cancel-in-progress: false
|
|
15
|
+
|
|
16
|
+
# jobs:
|
|
17
|
+
# build:
|
|
18
|
+
# name: Build Distribution
|
|
19
|
+
# runs-on: ubuntu-latest
|
|
20
|
+
|
|
21
|
+
# steps:
|
|
22
|
+
# - name: Checkout repository
|
|
23
|
+
# uses: actions/checkout@v6
|
|
24
|
+
# with:
|
|
25
|
+
# persist-credentials: false
|
|
26
|
+
|
|
27
|
+
# - name: Set up Python
|
|
28
|
+
# uses: actions/setup-python@v6
|
|
29
|
+
# with:
|
|
30
|
+
# python-version: "3.12"
|
|
31
|
+
|
|
32
|
+
# - name: Install build tools
|
|
33
|
+
# run: python -m pip install --upgrade build twine
|
|
34
|
+
|
|
35
|
+
# - name: Build distribution artifacts
|
|
36
|
+
# run: python -m build
|
|
37
|
+
|
|
38
|
+
# - name: Check distribution metadata
|
|
39
|
+
# run: python -m twine check dist/*
|
|
40
|
+
|
|
41
|
+
# - name: Upload distribution artifacts
|
|
42
|
+
# uses: actions/upload-artifact@v5
|
|
43
|
+
# with:
|
|
44
|
+
# name: python-package-distributions
|
|
45
|
+
# path: dist/
|
|
46
|
+
|
|
47
|
+
# publish-to-testpypi:
|
|
48
|
+
# name: Publish to TestPyPI
|
|
49
|
+
# if: github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main'
|
|
50
|
+
# needs: build
|
|
51
|
+
# runs-on: ubuntu-latest
|
|
52
|
+
# environment:
|
|
53
|
+
# name: testpypi
|
|
54
|
+
# url: https://test.pypi.org/p/earthcode
|
|
55
|
+
# permissions:
|
|
56
|
+
# id-token: write
|
|
57
|
+
|
|
58
|
+
# steps:
|
|
59
|
+
# - name: Download distribution artifacts
|
|
60
|
+
# uses: actions/download-artifact@v6
|
|
61
|
+
# with:
|
|
62
|
+
# name: python-package-distributions
|
|
63
|
+
# path: dist/
|
|
64
|
+
|
|
65
|
+
# - name: Publish to TestPyPI
|
|
66
|
+
# uses: pypa/gh-action-pypi-publish@release/v1
|
|
67
|
+
# with:
|
|
68
|
+
# repository-url: https://test.pypi.org/legacy/
|
|
69
|
+
|
|
70
|
+
# publish-to-pypi:
|
|
71
|
+
# name: Publish to PyPI
|
|
72
|
+
# if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
|
|
73
|
+
# needs: build
|
|
74
|
+
# runs-on: ubuntu-latest
|
|
75
|
+
# environment:
|
|
76
|
+
# name: pypi
|
|
77
|
+
# url: https://pypi.org/p/earthcode
|
|
78
|
+
# permissions:
|
|
79
|
+
# id-token: write
|
|
80
|
+
|
|
81
|
+
# steps:
|
|
82
|
+
# - name: Download distribution artifacts
|
|
83
|
+
# uses: actions/download-artifact@v6
|
|
84
|
+
# with:
|
|
85
|
+
# name: python-package-distributions
|
|
86
|
+
# path: dist/
|
|
87
|
+
|
|
88
|
+
# - name: Publish to PyPI
|
|
89
|
+
# uses: pypa/gh-action-pypi-publish@release/v1
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# Best Practices
|
|
94
|
+
|
|
95
|
+
# Use Trusted Publishing, not long-lived PyPI API tokens.
|
|
96
|
+
# Publish real releases only from version tags.
|
|
97
|
+
# Build once and publish the exact same dist/ artifacts.
|
|
98
|
+
# Use TestPyPI first.
|
|
99
|
+
# Require manual approval on the GitHub pypi environment.
|
|
100
|
+
# Keep the trusted workflow file path and environment names stable.
|
|
101
|
+
# PyPI matches on exact repo, workflow filename, and optional environment name.
|
|
102
|
+
# Don’t make this a reusable workflow.
|
|
103
|
+
# PyPI currently does not support reusable workflows as trusted publishers.
|
|
104
|
+
# Treat versions as immutable.
|
|
105
|
+
# If you publish 0.1.0 to TestPyPI and need another try, bump to something like 0.1.0rc1, 0.1.0rc2, etc.
|
|
106
|
+
# What You Need To Do
|
|
107
|
+
|
|
108
|
+
# Commit and push publish.yml (line 1).
|
|
109
|
+
# In GitHub, create two environments in the repo settings:
|
|
110
|
+
# testpypi
|
|
111
|
+
# pypi
|
|
112
|
+
# On the pypi environment, require manual approval.
|
|
113
|
+
# On PyPI, add a pending Trusted Publisher for:
|
|
114
|
+
# package name: earthcode
|
|
115
|
+
# owner: ESA-EarthCODE
|
|
116
|
+
# repository: earthcode-library
|
|
117
|
+
# workflow file: .github/workflows/publish.yml
|
|
118
|
+
# environment: pypi
|
|
119
|
+
# On TestPyPI, add the same pending publisher, but with environment testpypi.
|
|
120
|
+
# If you don’t already have one, create a separate TestPyPI account.
|
|
121
|
+
# Run the workflow manually from main once to publish to TestPyPI.
|
|
122
|
+
# Verify install from TestPyPI with:
|
|
123
|
+
# pip install --index-url https://test.pypi.org/simple --extra-index-url https://pypi.org/simple earthcode==0.1.0
|
|
124
|
+
# For a real release:
|
|
125
|
+
# bump version in pyproject.toml (line 3)
|
|
126
|
+
# commit it
|
|
127
|
+
# tag it
|
|
128
|
+
# push the tag
|
earthcode-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 EarthCODE
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
earthcode-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: earthcode
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Tools for creating, validating, and searching EarthCODE Open Science Catalog metadata.
|
|
5
|
+
Project-URL: Homepage, https://github.com/ESA-EarthCODE/earthcode-library
|
|
6
|
+
Project-URL: Repository, https://github.com/ESA-EarthCODE/earthcode-library
|
|
7
|
+
Project-URL: Issues, https://github.com/ESA-EarthCODE/earthcode-library/issues
|
|
8
|
+
Project-URL: Documentation, https://esa-earthcode.github.io/earthcode-library/
|
|
9
|
+
Author-email: Krasen Samardzhiev <krasensam@gmail.com>, Deyan Samardzhiev <dean@lampata.co.uk>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: earth observation,earthcode,geospatial
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Requires-Dist: bottleneck<2,>=1.6.0
|
|
21
|
+
Requires-Dist: dask<2026,>=2025.12.0
|
|
22
|
+
Requires-Dist: fastembed<0.8,>=0.7.4
|
|
23
|
+
Requires-Dist: fsspec<2026,>=2025.10.0
|
|
24
|
+
Requires-Dist: geopandas<2,>=1.1.2
|
|
25
|
+
Requires-Dist: jsonschema<5,>=4.25.1
|
|
26
|
+
Requires-Dist: netcdf4<2,>=1.7.3
|
|
27
|
+
Requires-Dist: numpy<3,>=2.4.1
|
|
28
|
+
Requires-Dist: pandas<4,>=3.0.0
|
|
29
|
+
Requires-Dist: pillow<12,>=10.3
|
|
30
|
+
Requires-Dist: pyarrow<24,>=23.0.0
|
|
31
|
+
Requires-Dist: pydantic<3,>=2.12.5
|
|
32
|
+
Requires-Dist: pylance<0.25,>=0.24.0
|
|
33
|
+
Requires-Dist: pystac<2,>=1.14.1
|
|
34
|
+
Requires-Dist: requests<3,>=2.32.5
|
|
35
|
+
Requires-Dist: rioxarray<0.21,>=0.20.0
|
|
36
|
+
Requires-Dist: shapely<3,>=2.1.2
|
|
37
|
+
Requires-Dist: xarray<2026,>=2025.12.0
|
|
38
|
+
Requires-Dist: xstac<2,>=1.2.0
|
|
39
|
+
Requires-Dist: zarr<4,>=3.1.5
|
|
40
|
+
Provides-Extra: dev
|
|
41
|
+
Requires-Dist: jupyter-book<2,>=1.0.4; extra == 'dev'
|
|
42
|
+
Requires-Dist: jupyterlab<5,>=4.5.0; extra == 'dev'
|
|
43
|
+
Requires-Dist: papermill<3,>=2.7.0; extra == 'dev'
|
|
44
|
+
Requires-Dist: pytest-cov<8,>=7.0.0; extra == 'dev'
|
|
45
|
+
Requires-Dist: pytest<10,>=9.0.2; extra == 'dev'
|
|
46
|
+
Description-Content-Type: text/markdown
|
|
47
|
+
|
|
48
|
+
# earthcode
|
|
49
|
+
|
|
50
|
+
Python tools for creating, validating, and searching EarthCODE Open Science Catalog metadata.
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install earthcode
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Development
|
|
57
|
+
|
|
58
|
+
To run:
|
|
59
|
+
|
|
60
|
+
1. `git clone https://github.com/ESA-EarthCODE/earthcode-library.git`
|
|
61
|
+
2. Install pixi - https://pixi.sh/dev/installation/
|
|
62
|
+
3. `cd earthcode-library`
|
|
63
|
+
4. `pixi install`
|
|
64
|
+
5. `pixi run jupyter lab`
|
|
65
|
+
|
|
66
|
+
We have examples for:
|
|
67
|
+
- `./examples/example_create_osc_entries.ipynb` - shows how to create OSC entries
|
|
68
|
+
- `./examples/contribute_via_pr_osc.ipynb` - shows how to add newly created entries to the OSC, using a GitHub pull request
|
|
69
|
+
- `./examples/contribute_via_osc_editor.ipynb` - shows how to add entries to the OSC, using a combination of this library and the OSC Editor (a GUI tool)
|
|
70
|
+
- `./examples/earthcode_publishing_guide.ipynb` - is a simplified introduction to the OSC and the necessary steps to publish data
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# earthcode
|
|
2
|
+
|
|
3
|
+
Python tools for creating, validating, and searching EarthCODE Open Science Catalog metadata.
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
pip install earthcode
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Development
|
|
10
|
+
|
|
11
|
+
To run:
|
|
12
|
+
|
|
13
|
+
1. `git clone https://github.com/ESA-EarthCODE/earthcode-library.git`
|
|
14
|
+
2. Install pixi - https://pixi.sh/dev/installation/
|
|
15
|
+
3. `cd earthcode-library`
|
|
16
|
+
4. `pixi install`
|
|
17
|
+
5. `pixi run jupyter lab`
|
|
18
|
+
|
|
19
|
+
We have examples for:
|
|
20
|
+
- `./examples/example_create_osc_entries.ipynb` - shows how to create OSC entries
|
|
21
|
+
- `./examples/contribute_via_pr_osc.ipynb` - shows how to add newly created entries to the OSC, using a GitHub pull request
|
|
22
|
+
- `./examples/contribute_via_osc_editor.ipynb` - shows how to add entries to the OSC, using a combination of this library and the OSC Editor (a GUI tool)
|
|
23
|
+
- `./examples/earthcode_publishing_guide.ipynb` - is a simplified introduction to the OSC and the necessary steps to publish data
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
title: earthcode
|
|
2
|
+
author: EarthCODE
|
|
3
|
+
copyright: "2026"
|
|
4
|
+
only_build_toc_files: true
|
|
5
|
+
exclude_patterns:
|
|
6
|
+
- _build
|
|
7
|
+
- .github/*
|
|
8
|
+
- .pixi/*
|
|
9
|
+
- .pytest_cache/*
|
|
10
|
+
- "**.ipynb_checkpoints"
|
|
11
|
+
execute:
|
|
12
|
+
execute_notebooks: "off"
|
|
13
|
+
repository:
|
|
14
|
+
url: https://github.com/ESA-EarthCODE/earthcode-library
|
|
15
|
+
path_to_book: .
|
|
16
|
+
branch: main
|
|
17
|
+
html:
|
|
18
|
+
use_repository_button: true
|
|
19
|
+
use_issues_button: true
|
|
20
|
+
home_page_in_navbar: true
|
earthcode-0.1.0/_toc.yml
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
format: jb-book
|
|
2
|
+
root: README
|
|
3
|
+
parts:
|
|
4
|
+
- caption: Guide
|
|
5
|
+
chapters:
|
|
6
|
+
- file: guide/0.Prerequisites
|
|
7
|
+
- file: guide/1.Project
|
|
8
|
+
- file: guide/2.0.Product
|
|
9
|
+
- file: guide/2.1.Product_files_PRR
|
|
10
|
+
- file: guide/2.1.Product_files_self_hosted
|
|
11
|
+
- file: guide/3.Workflow
|
|
12
|
+
- file: guide/4.Experiment
|
|
13
|
+
- caption: Examples
|
|
14
|
+
chapters:
|
|
15
|
+
- file: examples/earthcode_publishing_guide
|
|
16
|
+
- file: examples/earthcode_data_discovery
|
|
17
|
+
- file: examples/example_create_osc_entries
|
|
18
|
+
- file: examples/contribute_via_pr_osc
|
|
19
|
+
- file: examples/contribute_via_osc_editor
|
|
20
|
+
- file: examples/glambie_notebook_osc
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Load documents from the specified STAC Open Science Catalog and upload them to S3 storage as a Lance table.
|
|
3
|
+
|
|
4
|
+
Currently only handles OSC collections/catalogs for 'products', 'variables', 'eo-missions', and 'projects'.
|
|
5
|
+
It does NOT handle the STAC items within collections. In the future, this can be handled with multiple indexes and tables.
|
|
6
|
+
|
|
7
|
+
- Build (defaults baked in): `pixi run python generate_embeddings.py`
|
|
8
|
+
- Explicit build: `pixi run python cli/generate_embeddings.py`
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
Returns:
|
|
12
|
+
None
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
import lance
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pyarrow as pa
|
|
21
|
+
from fastembed import TextEmbedding
|
|
22
|
+
|
|
23
|
+
# Default base directory of the OSC metadata checkout (used for --root-dir).
DEFAULT_ROOT_DIR = "../open-science-catalog-metadata"
# Group folders ingested when --groups is not supplied.
DEFAULT_GROUPS = ["products", "variables", "eo-missions", "projects"]
# Default destination of the Lance dataset (used for --lance-uri).
DEFAULT_LANCE_URI = "s3://pangeo-test-fires/vector_store_v5/"
# Storage options for the Lance bucket. Not referenced elsewhere in this
# script yet (to be implemented).
LANCE_BASE_STORAGE_OPTIONS = dict(
    region="eu-west-2",
    aws_skip_signature="true",
)
# Default FastEmbed model name (used for --model).
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---------------------------- helpers ---------------------------- #
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def extract_theme_ids(data):
    """Collect the concept ids listed under a document's ``themes``.

    Args:
        data: Parsed STAC collection/catalog JSON (a mapping).

    Returns:
        A list of concept ids converted to ``str``, in document order.
        Concepts whose ``id`` is missing or falsy are skipped.
    """
    theme_ids = []
    # ``or []`` covers both an absent key and a key that is present but null.
    for theme in data.get("themes") or []:
        for concept in theme.get("concepts") or []:
            cid = concept.get("id")
            if cid:
                theme_ids.append(str(cid))
    return theme_ids
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def flatten_metadata(data):
    """Flatten the searchable fields of a document into one text blob.

    The id, title, description, keywords, ``osc:variables``, ``osc:missions``
    and theme concept ids are concatenated, one entry per line.

    Args:
        data: Parsed STAC collection/catalog JSON (a mapping).

    Returns:
        A newline-joined string of all non-empty entries.
    """
    parts = [data.get("id"), data.get("title"), data.get("description")]
    for key in ("keywords", "osc:variables", "osc:missions"):
        # ``or []`` tolerates list fields that are present but null.
        parts.extend(data.get(key) or [])
    parts.extend(extract_theme_ids(data))
    # Coerce to str so non-string entries cannot break the join; this matches
    # the str() coercion applied in create_row_from_stac_file.
    return "\n".join(str(p) for p in parts if p)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _bbox_envelope(bboxes):
    """Return ``(minx, miny, maxx, maxy)`` enclosing every usable bbox, or four ``None``s."""
    corners = []
    for box in bboxes:
        if len(box) >= 6:
            # 3D bbox layout is [minx, miny, minz, maxx, maxy, maxz].
            corners.append((box[0], box[1], box[3], box[4]))
        elif len(box) >= 4:
            # 2D bbox layout is [minx, miny, maxx, maxy].
            corners.append((box[0], box[1], box[2], box[3]))
    if not corners:
        return None, None, None, None
    minxs, minys, maxxs, maxys = zip(*corners)
    return min(minxs), min(minys), max(maxxs), max(maxys)


def _pipe_delimited(values):
    """Join *values* as ``|a|b|``; return ``""`` when there are none."""
    return f"|{'|'.join(values)}|" if values else ""


def create_row_from_stac_file(path, group):
    """Build one table row from a STAC ``collection.json`` / ``catalog.json``.

    Args:
        path: ``pathlib.Path`` of the JSON file to read.
        group: Name of the group the file belongs to; stored verbatim in the
            row's ``group`` field.

    Returns:
        A dict with the document id, group, title, description, source path,
        the overall bounding box (``bbox_minx`` ... ``bbox_maxy``), the raw
        document as a JSON string, pipe-delimited id/keyword lists, and the
        flattened ``text`` used for embedding.
    """
    # Read explicitly as UTF-8 rather than relying on the locale encoding.
    data = json.loads(path.read_text(encoding="utf-8"))

    theme_ids = extract_theme_ids(data)
    variable_ids = [str(v) for v in (data.get("osc:variables") or []) if v]
    mission_ids = [str(m) for m in (data.get("osc:missions") or []) if m]
    keywords = [str(k) for k in (data.get("keywords") or []) if k]

    # Fall back to the whole globe when no spatial extent is declared.
    spatial = (data.get("extent") or {}).get("spatial") or {}
    bboxes = spatial.get("bbox") or [[-180, -90, 180, 90]]
    bminx, bminy, bmaxx, bmaxy = _bbox_envelope(bboxes)

    return {
        # Documents without an id are identified by their containing folder.
        "id": data.get("id", path.parent.name),
        "group": group,
        "title": data.get("title", ""),
        "description": data.get("description", ""),
        "path": str(path),
        "bbox_minx": bminx,
        "bbox_miny": bminy,
        "bbox_maxx": bmaxx,
        "bbox_maxy": bmaxy,
        "item_json": json.dumps(data),
        "theme_ids": _pipe_delimited(theme_ids),
        "variable_ids": _pipe_delimited(variable_ids),
        "mission_ids": _pipe_delimited(mission_ids),
        "keywords": _pipe_delimited(keywords),
        "text": flatten_metadata(data),
    }
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def load_documents(stac_dir, group):
    """Load every STAC collection/catalog found below a group directory.

    Args:
        stac_dir: ``pathlib.Path`` of the group folder to scan recursively.
        group: Group name passed through to each row.

    Returns:
        A list of row dicts, one per ``collection.json`` / ``catalog.json``,
        ordered by sorted file path.

    Raises:
        FileNotFoundError: If ``stac_dir`` does not exist.
        RuntimeError: If no collection or catalog file is found.
    """
    if not stac_dir.exists():
        raise FileNotFoundError(f"Group dir not found: {stac_dir}")

    stac_files = [
        *stac_dir.glob("**/collection.json"),
        *stac_dir.glob("**/catalog.json"),
    ]
    stac_files.sort()

    documents = []
    for stac_file in stac_files:
        documents.append(create_row_from_stac_file(stac_file, group))

    if documents:
        return documents
    raise RuntimeError(f"No STAC collections/catalogs found under {stac_dir}")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_embeddings(texts, model_name):
    """Embed *texts* with the named FastEmbed model.

    Args:
        texts: The strings to embed.
        model_name: Model name passed to ``TextEmbedding``.

    Returns:
        A ``float32`` NumPy array built from the vectors the model yields.
    """
    embedder = TextEmbedding(model_name=model_name)
    vectors = list(embedder.embed(texts))
    return np.asarray(vectors, dtype=np.float32)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# ----------------------------- main ------------------------------ #
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def main(argv=None):
    """Build the Lance dataset from OSC metadata and write it to ``--lance-uri``.

    Loads every requested group below ``--root-dir``, embeds each document's
    flattened text, and overwrites the Lance dataset at the target URI with
    the rows plus an ``embedding`` column (the ``text`` column is dropped).

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.
    """
    parser = argparse.ArgumentParser(description="Build Lance dataset.")
    parser.add_argument(
        "--root-dir",
        default=DEFAULT_ROOT_DIR,
        help="Base OSC metadata dir containing group folders.",
    )
    parser.add_argument(
        "--groups",
        nargs="+",
        default=DEFAULT_GROUPS,
        help="Group folder names under root-dir to ingest (e.g., products variables eo-missions projects).",
    )
    parser.add_argument(
        "--lance-uri",
        default=DEFAULT_LANCE_URI,
        help="Where to write the Lance dataset.",
    )
    parser.add_argument(
        "--model", default=MODEL_NAME, help="FastEmbed model name."
    )
    args = parser.parse_args(argv)

    # Gather one row per STAC document across all requested groups.
    root = Path(args.root_dir)
    rows = [row for grp in args.groups for row in load_documents(root / grp, grp)]

    # Build embeddings and pack them as a fixed-size list column.
    texts = [r["text"] for r in rows]
    embeddings = build_embeddings(texts, args.model)
    dim = embeddings.shape[1]
    embed_array = pa.FixedSizeListArray.from_arrays(
        # copy=False keeps the dtype guard without duplicating float32 data.
        pa.array(embeddings.astype(np.float32, copy=False).ravel(), type=pa.float32()),
        dim,
    )

    # Build and write the Lance dataset; the raw text is only needed for embedding.
    table = pa.Table.from_pylist(rows)
    table = table.append_column("embedding", embed_array)
    table = table.drop(["text"])
    lance.write_dataset(table, args.lance_uri, mode="overwrite")

    print(
        f"Wrote {table.num_rows} rows to {args.lance_uri} with dim={dim}"
    )


if __name__ == "__main__":
    main()
|