earthcode 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. earthcode-0.1.0/.gitattributes +2 -0
  2. earthcode-0.1.0/.github/workflows/ci.yml +49 -0
  3. earthcode-0.1.0/.github/workflows/pages.yml +77 -0
  4. earthcode-0.1.0/.github/workflows/publish.yml +128 -0
  5. earthcode-0.1.0/.gitignore +11 -0
  6. earthcode-0.1.0/LICENSE +21 -0
  7. earthcode-0.1.0/PKG-INFO +70 -0
  8. earthcode-0.1.0/README.md +23 -0
  9. earthcode-0.1.0/_config.yml +20 -0
  10. earthcode-0.1.0/_toc.yml +20 -0
  11. earthcode-0.1.0/cli/generate_embeddings.py +159 -0
  12. earthcode-0.1.0/dev.ipynb +2034 -0
  13. earthcode-0.1.0/earthcode/__init__.py +0 -0
  14. earthcode-0.1.0/earthcode/fairtool.py +577 -0
  15. earthcode-0.1.0/earthcode/git_add.py +383 -0
  16. earthcode-0.1.0/earthcode/gitclerk_add.py +21 -0
  17. earthcode-0.1.0/earthcode/metadata_input_definitions.py +338 -0
  18. earthcode-0.1.0/earthcode/search.py +209 -0
  19. earthcode-0.1.0/earthcode/static.py +569 -0
  20. earthcode-0.1.0/earthcode/validator.py +605 -0
  21. earthcode-0.1.0/examples/contribute_via_osc_editor.ipynb +2004 -0
  22. earthcode-0.1.0/examples/contribute_via_pr_osc.ipynb +2279 -0
  23. earthcode-0.1.0/examples/earthcode_data_discovery.ipynb +8078 -0
  24. earthcode-0.1.0/examples/earthcode_publishing_guide.ipynb +852 -0
  25. earthcode-0.1.0/examples/example_create_osc_entries.ipynb +395 -0
  26. earthcode-0.1.0/examples/glambie_notebook_osc.ipynb +5033 -0
  27. earthcode-0.1.0/guide/0.Prerequisites.ipynb +97 -0
  28. earthcode-0.1.0/guide/1.Project.ipynb +184 -0
  29. earthcode-0.1.0/guide/2.0.Product.ipynb +231 -0
  30. earthcode-0.1.0/guide/2.1.Product_files_PRR.ipynb +144 -0
  31. earthcode-0.1.0/guide/2.1.Product_files_self_hosted.ipynb +163 -0
  32. earthcode-0.1.0/guide/3.Workflow.ipynb +152 -0
  33. earthcode-0.1.0/guide/4.Experiment.ipynb +166 -0
  34. earthcode-0.1.0/pixi.lock +13096 -0
  35. earthcode-0.1.0/pixi.toml +43 -0
  36. earthcode-0.1.0/pyproject.toml +70 -0
  37. earthcode-0.1.0/tests/test_creation.py +535 -0
  38. earthcode-0.1.0/tests/test_fairtool.py +95 -0
  39. earthcode-0.1.0/tests/test_notebooks.py +227 -0
  40. earthcode-0.1.0/tests/test_search.py +52 -0
  41. earthcode-0.1.0/tests/test_validation.py +275 -0
@@ -0,0 +1,2 @@
1
+ # SCM syntax highlighting & preventing 3-way merges
2
+ pixi.lock merge=binary linguist-language=YAML linguist-generated=true
@@ -0,0 +1,49 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ pull_request:
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ contents: read
10
+
11
+ concurrency:
12
+ group: ci-${{ github.workflow }}-${{ github.ref }}
13
+ cancel-in-progress: true
14
+
15
+ defaults:
16
+ run:
17
+ shell: bash
18
+ working-directory: earthcode-library
19
+
20
+ jobs:
21
+ tests:
22
+ name: Tests
23
+ runs-on: ubuntu-latest
24
+ timeout-minutes: 45
25
+
26
+ steps:
27
+ - name: Checkout earthcode-library
28
+ uses: actions/checkout@v6
29
+ with:
30
+ path: earthcode-library
31
+
32
+ - name: Checkout open-science-catalog-metadata
33
+ uses: actions/checkout@v6
34
+ with:
35
+ repository: ESA-EarthCODE/open-science-catalog-metadata
36
+ path: open-science-catalog-metadata
37
+
38
+ - name: Set up Pixi
39
+ uses: prefix-dev/setup-pixi@v0.9.4
40
+ with:
41
+ pixi-version: v0.59.0
42
+ manifest-path: earthcode-library/pixi.toml
43
+ environments: default
44
+ locked: true
45
+ cache: true
46
+ cache-write: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
47
+
48
+ - name: Run test suite
49
+ run: pixi run -e default pytest
@@ -0,0 +1,77 @@
1
+ name: Docs
2
+
3
+ on:
4
+ push:
5
+ branches:
6
+ - main
7
+ paths:
8
+ - README.md
9
+ - guide/**
10
+ - examples/**
11
+ - _config.yml
12
+ - _toc.yml
13
+ - pixi.toml
14
+ - pixi.lock
15
+ - pyproject.toml
16
+ - .github/workflows/pages.yml
17
+ pull_request:
18
+ paths:
19
+ - README.md
20
+ - guide/**
21
+ - examples/**
22
+ - _config.yml
23
+ - _toc.yml
24
+ - pixi.toml
25
+ - pixi.lock
26
+ - pyproject.toml
27
+ - .github/workflows/pages.yml
28
+ workflow_dispatch:
29
+
30
+ permissions:
31
+ contents: read
32
+ pages: write
33
+ id-token: write
34
+
35
+ concurrency:
36
+ group: pages
37
+ cancel-in-progress: true
38
+
39
+ jobs:
40
+ build:
41
+ name: Build Docs
42
+ runs-on: ubuntu-latest
43
+
44
+ steps:
45
+ - name: Checkout repository
46
+ uses: actions/checkout@v6
47
+
48
+ - name: Set up Python
49
+ uses: actions/setup-python@v5
50
+ with:
51
+ python-version: "3.12"
52
+ cache: pip
53
+
54
+ - name: Install Jupyter Book
55
+ run: python -m pip install --upgrade "jupyter-book>=1.0.4,<2"
56
+
57
+ - name: Build site
58
+ run: jupyter-book build .
59
+
60
+ - name: Upload Pages artifact
61
+ uses: actions/upload-pages-artifact@v3
62
+ with:
63
+ path: _build/html
64
+
65
+ deploy:
66
+ name: Deploy Docs
67
+ if: github.event_name == 'push' && github.ref == 'refs/heads/main'
68
+ needs: build
69
+ runs-on: ubuntu-latest
70
+ environment:
71
+ name: github-pages
72
+ url: ${{ steps.deployment.outputs.page_url }}
73
+
74
+ steps:
75
+ - name: Deploy to GitHub Pages
76
+ id: deployment
77
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,128 @@
1
+ # name: Publish
2
+
3
+ # on:
4
+ # push:
5
+ # tags:
6
+ # - "v*"
7
+ # workflow_dispatch:
8
+
9
+ # permissions:
10
+ # contents: read
11
+
12
+ # concurrency:
13
+ # group: publish-${{ github.ref }}
14
+ # cancel-in-progress: false
15
+
16
+ # jobs:
17
+ # build:
18
+ # name: Build Distribution
19
+ # runs-on: ubuntu-latest
20
+
21
+ # steps:
22
+ # - name: Checkout repository
23
+ # uses: actions/checkout@v6
24
+ # with:
25
+ # persist-credentials: false
26
+
27
+ # - name: Set up Python
28
+ # uses: actions/setup-python@v6
29
+ # with:
30
+ # python-version: "3.12"
31
+
32
+ # - name: Install build tools
33
+ # run: python -m pip install --upgrade build twine
34
+
35
+ # - name: Build distribution artifacts
36
+ # run: python -m build
37
+
38
+ # - name: Check distribution metadata
39
+ # run: python -m twine check dist/*
40
+
41
+ # - name: Upload distribution artifacts
42
+ # uses: actions/upload-artifact@v5
43
+ # with:
44
+ # name: python-package-distributions
45
+ # path: dist/
46
+
47
+ # publish-to-testpypi:
48
+ # name: Publish to TestPyPI
49
+ # if: github.event_name == 'workflow_dispatch' && github.ref == 'refs/heads/main'
50
+ # needs: build
51
+ # runs-on: ubuntu-latest
52
+ # environment:
53
+ # name: testpypi
54
+ # url: https://test.pypi.org/p/earthcode
55
+ # permissions:
56
+ # id-token: write
57
+
58
+ # steps:
59
+ # - name: Download distribution artifacts
60
+ # uses: actions/download-artifact@v6
61
+ # with:
62
+ # name: python-package-distributions
63
+ # path: dist/
64
+
65
+ # - name: Publish to TestPyPI
66
+ # uses: pypa/gh-action-pypi-publish@release/v1
67
+ # with:
68
+ # repository-url: https://test.pypi.org/legacy/
69
+
70
+ # publish-to-pypi:
71
+ # name: Publish to PyPI
72
+ # if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v')
73
+ # needs: build
74
+ # runs-on: ubuntu-latest
75
+ # environment:
76
+ # name: pypi
77
+ # url: https://pypi.org/p/earthcode
78
+ # permissions:
79
+ # id-token: write
80
+
81
+ # steps:
82
+ # - name: Download distribution artifacts
83
+ # uses: actions/download-artifact@v6
84
+ # with:
85
+ # name: python-package-distributions
86
+ # path: dist/
87
+
88
+ # - name: Publish to PyPI
89
+ # uses: pypa/gh-action-pypi-publish@release/v1
90
+
91
+
92
+
93
+ # Best Practices
94
+
95
+ # Use Trusted Publishing, not long-lived PyPI API tokens.
96
+ # Publish real releases only from version tags.
97
+ # Build once and publish the exact same dist/ artifacts.
98
+ # Use TestPyPI first.
99
+ # Require manual approval on the GitHub pypi environment.
100
+ # Keep the trusted workflow file path and environment names stable.
101
+ # PyPI matches on exact repo, workflow filename, and optional environment name.
102
+ # Don’t make this a reusable workflow.
103
+ # PyPI currently does not support reusable workflows as trusted publishers.
104
+ # Treat versions as immutable.
105
+ # If you publish 0.1.0 to TestPyPI and need another try, bump to something like 0.1.0rc1, 0.1.0rc2, etc.
106
+ # What You Need To Do
107
+
108
+ # Commit and push publish.yml (line 1).
109
+ # In GitHub, create two environments in the repo settings:
110
+ # testpypi
111
+ # pypi
112
+ # On the pypi environment, require manual approval.
113
+ # On PyPI, add a pending Trusted Publisher for:
114
+ # package name: earthcode
115
+ # owner: ESA-EarthCODE
116
+ # repository: earthcode-library
117
+ # workflow file: .github/workflows/publish.yml
118
+ # environment: pypi
119
+ # On TestPyPI, add the same pending publisher, but with environment testpypi.
120
+ # If you don’t already have one, create a separate TestPyPI account.
121
+ # Run the workflow manually from main once to publish to TestPyPI.
122
+ # Verify install from TestPyPI with:
123
+ # pip install --index-url https://test.pypi.org/simple --extra-index-url https://pypi.org/simple earthcode==0.1.0
124
+ # For a real release:
125
+ # bump version in pyproject.toml (line 3)
126
+ # commit it
127
+ # tag it
128
+ # push the tag
@@ -0,0 +1,11 @@
1
+
2
+ # pixi environments
3
+ .pixi
4
+ *.egg-info
5
+ *__pycache__*
6
+ *.json
7
+ test*.ipynb
8
+ open-science-catalog-metadata
9
+ htmlcov/
10
+ *.coverage
11
+ _build/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 EarthCODE
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,70 @@
1
+ Metadata-Version: 2.4
2
+ Name: earthcode
3
+ Version: 0.1.0
4
+ Summary: Tools for creating, validating, and searching EarthCODE Open Science Catalog metadata.
5
+ Project-URL: Homepage, https://github.com/ESA-EarthCODE/earthcode-library
6
+ Project-URL: Repository, https://github.com/ESA-EarthCODE/earthcode-library
7
+ Project-URL: Issues, https://github.com/ESA-EarthCODE/earthcode-library/issues
8
+ Project-URL: Documentation, https://esa-earthcode.github.io/earthcode-library/
9
+ Author-email: Krasen Samardzhiev <krasensam@gmail.com>, Deyan Samardzhiev <dean@lampata.co.uk>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: earth observation,earthcode,geospatial
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: bottleneck<2,>=1.6.0
21
+ Requires-Dist: dask<2026,>=2025.12.0
22
+ Requires-Dist: fastembed<0.8,>=0.7.4
23
+ Requires-Dist: fsspec<2026,>=2025.10.0
24
+ Requires-Dist: geopandas<2,>=1.1.2
25
+ Requires-Dist: jsonschema<5,>=4.25.1
26
+ Requires-Dist: netcdf4<2,>=1.7.3
27
+ Requires-Dist: numpy<3,>=2.4.1
28
+ Requires-Dist: pandas<4,>=3.0.0
29
+ Requires-Dist: pillow<12,>=10.3
30
+ Requires-Dist: pyarrow<24,>=23.0.0
31
+ Requires-Dist: pydantic<3,>=2.12.5
32
+ Requires-Dist: pylance<0.25,>=0.24.0
33
+ Requires-Dist: pystac<2,>=1.14.1
34
+ Requires-Dist: requests<3,>=2.32.5
35
+ Requires-Dist: rioxarray<0.21,>=0.20.0
36
+ Requires-Dist: shapely<3,>=2.1.2
37
+ Requires-Dist: xarray<2026,>=2025.12.0
38
+ Requires-Dist: xstac<2,>=1.2.0
39
+ Requires-Dist: zarr<4,>=3.1.5
40
+ Provides-Extra: dev
41
+ Requires-Dist: jupyter-book<2,>=1.0.4; extra == 'dev'
42
+ Requires-Dist: jupyterlab<5,>=4.5.0; extra == 'dev'
43
+ Requires-Dist: papermill<3,>=2.7.0; extra == 'dev'
44
+ Requires-Dist: pytest-cov<8,>=7.0.0; extra == 'dev'
45
+ Requires-Dist: pytest<10,>=9.0.2; extra == 'dev'
46
+ Description-Content-Type: text/markdown
47
+
48
+ # earthcode
49
+
50
+ Python tools for creating, validating, and searching EarthCODE Open Science Catalog metadata.
51
+
52
+ ```bash
53
+ pip install earthcode
54
+ ```
55
+
56
+ ## Development
57
+
58
+ To run:
59
+
60
+ 1. `git clone https://github.com/ESA-EarthCODE/earthcode-library.git`
61
+ 2. Install pixi - https://pixi.sh/dev/installation/
62
+ 3. `cd earthcode-library`
63
+ 4. `pixi install`
64
+ 5. `pixi run jupyter lab`
65
+
66
+ We have examples for:
67
+ - `./examples/example_create_osc_entries.ipynb` - shows how to create OSC entries
68
+ - `./examples/contribute_via_pr_osc.ipynb` - shows how to add newly created entries to the OSC, using a GitHub pull request
69
+ - `./examples/contribute_via_osc_editor.ipynb` - shows how to add entries to the OSC, using a combination of this library and the OSC Editor (a GUI tool)
70
+ - `./examples/earthcode_publishing_guide.ipynb` - is a simplified introduction to the OSC and the necessary steps to publish data
@@ -0,0 +1,23 @@
1
+ # earthcode
2
+
3
+ Python tools for creating, validating, and searching EarthCODE Open Science Catalog metadata.
4
+
5
+ ```bash
6
+ pip install earthcode
7
+ ```
8
+
9
+ ## Development
10
+
11
+ To run:
12
+
13
+ 1. `git clone https://github.com/ESA-EarthCODE/earthcode-library.git`
14
+ 2. Install pixi - https://pixi.sh/dev/installation/
15
+ 3. `cd earthcode-library`
16
+ 4. `pixi install`
17
+ 5. `pixi run jupyter lab`
18
+
19
+ We have examples for:
20
+ - `./examples/example_create_osc_entries.ipynb` - shows how to create OSC entries
21
+ - `./examples/contribute_via_pr_osc.ipynb` - shows how to add newly created entries to the OSC, using a GitHub pull request
22
+ - `./examples/contribute_via_osc_editor.ipynb` - shows how to add entries to the OSC, using a combination of this library and the OSC Editor (a GUI tool)
23
+ - `./examples/earthcode_publishing_guide.ipynb` - is a simplified introduction to the OSC and the necessary steps to publish data
@@ -0,0 +1,20 @@
1
+ title: earthcode
2
+ author: EarthCODE
3
+ copyright: "2026"
4
+ only_build_toc_files: true
5
+ exclude_patterns:
6
+ - _build
7
+ - .github/*
8
+ - .pixi/*
9
+ - .pytest_cache/*
10
+ - "**.ipynb_checkpoints"
11
+ execute:
12
+ execute_notebooks: "off"
13
+ repository:
14
+ url: https://github.com/ESA-EarthCODE/earthcode-library
15
+ path_to_book: .
16
+ branch: main
17
+ html:
18
+ use_repository_button: true
19
+ use_issues_button: true
20
+ home_page_in_navbar: true
@@ -0,0 +1,20 @@
1
+ format: jb-book
2
+ root: README
3
+ parts:
4
+ - caption: Guide
5
+ chapters:
6
+ - file: guide/0.Prerequisites
7
+ - file: guide/1.Project
8
+ - file: guide/2.0.Product
9
+ - file: guide/2.1.Product_files_PRR
10
+ - file: guide/2.1.Product_files_self_hosted
11
+ - file: guide/3.Workflow
12
+ - file: guide/4.Experiment
13
+ - caption: Examples
14
+ chapters:
15
+ - file: examples/earthcode_publishing_guide
16
+ - file: examples/earthcode_data_discovery
17
+ - file: examples/example_create_osc_entries
18
+ - file: examples/contribute_via_pr_osc
19
+ - file: examples/contribute_via_osc_editor
20
+ - file: examples/glambie_notebook_osc
@@ -0,0 +1,159 @@
1
+ """
2
+ Load documents from the specified STAC Open Science Catalog and upload them to S3 storage as a Lance table.
3
+
4
+ Currently only handles OSC collections/catalogs for 'products', 'variables', 'eo-missions', and 'projects'.
5
+ It does NOT handle the STAC items within collections. In the future this can be handled with multiple indexes and tables.
6
+
7
+ - Build (defaults baked in): `pixi run python generate_embeddings.py`
8
+ - Explicit build: `pixi run python cli/generate_embeddings.py`
9
+
10
+
11
+ Returns:
12
+ None
13
+ """
14
+
15
+ import argparse
16
+ import json
17
+ from pathlib import Path
18
+ import lance
19
+ import numpy as np
20
+ import pyarrow as pa
21
+ from fastembed import TextEmbedding
22
+
23
# Default OSC metadata directory; a relative path, so it is resolved against
# the current working directory when wrapped in Path() by main().
DEFAULT_ROOT_DIR = "../open-science-catalog-metadata"
# Group folder names under the root dir that are ingested by default.
DEFAULT_GROUPS = ["products", "variables", "eo-missions", "projects"]
# Default destination URI for the Lance dataset (overridable with --lance-uri).
DEFAULT_LANCE_URI = "s3://pangeo-test-fires/vector_store_v5/"
# NOTE(review): not referenced anywhere else in the visible script; the
# write in main() does not pass storage options yet.
LANCE_BASE_STORAGE_OPTIONS = {
    "region": "eu-west-2",
    "aws_skip_signature": "true",
}  # to be implemented
# Default FastEmbed model name (overridable with --model).
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
31
+
32
+
33
+ # ---------------------------- helpers ---------------------------- #
34
+
35
+
36
def extract_theme_ids(data):
    """Collect the concept ids listed under ``data["themes"]``.

    Args:
        data: Parsed STAC collection/catalog JSON as a dict.

    Returns:
        A list with one string per truthy concept ``id``, in document
        order. Duplicates are kept. Empty when there are no themes.
    """
    theme_ids = []
    # ``or []`` also covers keys that are present but explicitly null in the
    # JSON; ``dict.get(key, [])`` alone would return None and fail to iterate.
    for theme in data.get("themes") or []:
        for concept in theme.get("concepts") or []:
            cid = concept.get("id")
            if cid:
                theme_ids.append(str(cid))
    return theme_ids
44
+
45
+
46
def flatten_metadata(data):
    """Flatten the searchable fields of a STAC document into one text blob.

    The id, title and description come first, followed by the keywords,
    ``osc:variables``, ``osc:missions`` and theme concept ids. Falsy
    entries are skipped and the rest are joined with newlines.

    Args:
        data: Parsed STAC collection/catalog JSON as a dict.

    Returns:
        str: The newline-joined text; empty when no field has content.
    """
    parts = [
        data.get("id", ""),
        data.get("title", ""),
        data.get("description", ""),
    ]
    for key in ("keywords", "osc:variables", "osc:missions"):
        # ``or []`` tolerates fields that are present but null in the JSON.
        parts.extend(data.get(key) or [])
    parts.extend(extract_theme_ids(data))
    # Coerce to str (as create_row_from_stac_file does for the same fields)
    # so a non-string entry such as a numeric keyword cannot break the join.
    return "\n".join(str(p) for p in parts if p)
56
+
57
+
58
def _bbox_bounds(bboxes):
    """Return (minx, miny, maxx, maxy) enclosing every usable bbox, else Nones."""
    corners = []
    for box in bboxes:
        if len(box) >= 6:
            # 3D bbox layout is [minx, miny, minz, maxx, maxy, maxz], so the
            # horizontal maxima sit at indices 3 and 4, not 2 and 3.
            corners.append((box[0], box[1], box[3], box[4]))
        elif len(box) >= 4:
            corners.append((box[0], box[1], box[2], box[3]))
    if not corners:
        return None, None, None, None
    return (
        min(c[0] for c in corners),
        min(c[1] for c in corners),
        max(c[2] for c in corners),
        max(c[3] for c in corners),
    )


def _pipe_delimited(values):
    """Join *values* as ``|a|b|``; an empty list becomes an empty string."""
    return f"|{'|'.join(values)}|" if values else ""


def create_row_from_stac_file(path, group):
    """Turn one STAC collection/catalog JSON file into a flat table row.

    Args:
        path: ``pathlib.Path`` of the JSON file to read.
        group: Group name stored verbatim in the row's ``"group"`` field.

    Returns:
        dict: Row with id, group, title, description, path, the enclosing
        2D bounding box (``bbox_*``), the raw document as ``item_json``,
        pipe-delimited id lists, and the flattened ``text``. When the
        document has no spatial bbox, the whole-world box
        ``[-180, -90, 180, 90]`` is used.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    data = json.loads(path.read_text(encoding="utf-8"))
    theme_ids = extract_theme_ids(data)
    variable_ids = [str(v) for v in data.get("osc:variables") or [] if v]
    mission_ids = [str(m) for m in data.get("osc:missions") or [] if m]
    keywords = [str(k) for k in data.get("keywords") or [] if k]

    # Each level may be missing or explicitly null in the document.
    extent = data.get("extent") or {}
    spatial = extent.get("spatial") or {}
    bboxes = spatial.get("bbox") or [[-180, -90, 180, 90]]
    bminx, bminy, bmaxx, bmaxy = _bbox_bounds(bboxes)

    return {
        "id": data.get("id", path.parent.name),
        "group": group,
        "title": data.get("title", ""),
        "description": data.get("description", ""),
        "path": str(path),
        "bbox_minx": bminx,
        "bbox_miny": bminy,
        "bbox_maxx": bmaxx,
        "bbox_maxy": bmaxy,
        "item_json": json.dumps(data),
        "theme_ids": _pipe_delimited(theme_ids),
        "variable_ids": _pipe_delimited(variable_ids),
        "mission_ids": _pipe_delimited(mission_ids),
        "keywords": _pipe_delimited(keywords),
        "text": flatten_metadata(data),
    }
88
+
89
+
90
def load_documents(stac_dir, group):
    """Build one row per STAC collection/catalog file found under *stac_dir*.

    Args:
        stac_dir: ``pathlib.Path`` of the group directory to scan recursively
            for ``collection.json`` and ``catalog.json`` files.
        group: Group name forwarded to ``create_row_from_stac_file``.

    Returns:
        list[dict]: Rows ordered by sorted file path.

    Raises:
        FileNotFoundError: If *stac_dir* does not exist.
        RuntimeError: If no matching file is found under *stac_dir*.
    """
    if not stac_dir.exists():
        raise FileNotFoundError(f"Group dir not found: {stac_dir}")

    stac_files = [
        *stac_dir.glob("**/collection.json"),
        *stac_dir.glob("**/catalog.json"),
    ]
    stac_files.sort()

    documents = []
    for stac_file in stac_files:
        documents.append(create_row_from_stac_file(stac_file, group))

    if documents:
        return documents
    raise RuntimeError(f"No STAC collections/catalogs found under {stac_dir}")
102
+
103
+
104
def build_embeddings(texts, model_name):
    """Embed *texts* with the named FastEmbed model.

    Args:
        texts: The documents to embed.
        model_name: Model name handed to ``TextEmbedding``.

    Returns:
        numpy.ndarray: The embeddings converted to a ``float32`` array.
    """
    embedder = TextEmbedding(model_name=model_name)
    vectors = list(embedder.embed(texts))
    return np.asarray(vectors, dtype=np.float32)
107
+
108
+
109
+ # ----------------------------- main ------------------------------ #
110
+
111
+
112
def main():
    """Parse the CLI options, embed the OSC metadata and write a Lance dataset.

    Rows are loaded for every requested group, their ``text`` field is
    embedded, and the resulting table (with an ``embedding`` column and
    without ``text``) overwrites the dataset at ``--lance-uri``.
    """
    parser = argparse.ArgumentParser(description="Build Lance dataset.")
    parser.add_argument(
        "--root-dir",
        default=DEFAULT_ROOT_DIR,
        help="Base OSC metadata dir containing group folders.",
    )
    parser.add_argument(
        "--groups",
        nargs="+",
        default=DEFAULT_GROUPS,
        help="Group folder names under root-dir to ingest (e.g., products variables eo-missions projects).",
    )
    parser.add_argument(
        "--lance-uri",
        default=DEFAULT_LANCE_URI,
        help="Where to write the Lance dataset.",
    )
    parser.add_argument("--model", default=MODEL_NAME, help="FastEmbed model name.")
    args = parser.parse_args()

    # Collect the rows of every group, in the order the groups were given.
    base_dir = Path(args.root_dir)
    rows = []
    for group_name in args.groups:
        rows.extend(load_documents(base_dir / group_name, group_name))

    # Embed the flattened text of each row.
    embeddings = build_embeddings([row["text"] for row in rows], args.model)
    flat_values = pa.array(
        embeddings.astype(np.float32).ravel(), type=pa.float32()
    )
    embed_array = pa.FixedSizeListArray.from_arrays(
        flat_values, embeddings.shape[1]
    )

    # "text" was only needed as embedding input, so it is dropped before writing.
    table = (
        pa.Table.from_pylist(rows)
        .append_column("embedding", embed_array)
        .drop(["text"])
    )
    lance.write_dataset(table, args.lance_uri, mode="overwrite")

    print(
        f"Wrote {table.num_rows} rows to {args.lance_uri} with dim={embeddings.shape[1]}"
    )
156
+
157
+
158
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()