tacoreader 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tacoreader-0.1.0/LICENSE +22 -0
- tacoreader-0.1.0/PKG-INFO +71 -0
- tacoreader-0.1.0/README.md +49 -0
- tacoreader-0.1.0/pyproject.toml +107 -0
- tacoreader-0.1.0/tacoreader/__init__.py +18 -0
- tacoreader-0.1.0/tacoreader/croissant.py +2 -0
- tacoreader-0.1.0/tacoreader/datacard.py +241 -0
- tacoreader-0.1.0/tacoreader/datacite.py +2 -0
- tacoreader-0.1.0/tacoreader/load.py +151 -0
- tacoreader-0.1.0/tacoreader/load_local.py +202 -0
- tacoreader-0.1.0/tacoreader/load_remote.py +205 -0
- tacoreader-0.1.0/tacoreader/load_utils.py +160 -0
- tacoreader-0.1.0/tacoreader/stac.py +2 -0
tacoreader-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024, Cesar Aybar
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: tacoreader
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A Python package to read Cloud-Optimized Datasets
|
|
5
|
+
Home-page: https://github.com/tacofoundation/tacoreader
|
|
6
|
+
Author: Cesar Aybar
|
|
7
|
+
Author-email: cesar.aybar@uv.es
|
|
8
|
+
Requires-Python: >=3.9,<4.0
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Requires-Dist: geopandas (>=1.0.1)
|
|
15
|
+
Requires-Dist: pyarrow (>=17.0.0)
|
|
16
|
+
Requires-Dist: requests (>=2.32.3)
|
|
17
|
+
Requires-Dist: shapely (>=2.0.6)
|
|
18
|
+
Project-URL: Documentation, https://tacofoundation.github.io/tacoreader/
|
|
19
|
+
Project-URL: Repository, https://github.com/tacofoundation/tacoreader
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# tacoreader
|
|
23
|
+
|
|
24
|
+
[](https://img.shields.io/github/v/release/csaybar/tacoreader)
|
|
25
|
+
[](https://github.com/csaybar/tacoreader/actions/workflows/main.yml?query=branch%3Amain)
|
|
26
|
+
[](https://codecov.io/gh/csaybar/tacoreader)
|
|
27
|
+
[](https://img.shields.io/github/commit-activity/m/csaybar/tacoreader)
|
|
28
|
+
[](https://img.shields.io/github/license/csaybar/tacoreader)
|
|
29
|
+
|
|
30
|
+
Read TACO datasets
|
|
31
|
+
|
|
32
|
+
- **Github repository**: <https://github.com/csaybar/tacoreader/>
|
|
33
|
+
- **Documentation** <https://csaybar.github.io/tacoreader/>
|
|
34
|
+
|
|
35
|
+
## Getting started with your project
|
|
36
|
+
|
|
37
|
+
First, create a repository on GitHub with the same name as this project, and then run the following commands:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
git init -b main
|
|
41
|
+
git add .
|
|
42
|
+
git commit -m "init commit"
|
|
43
|
+
git remote add origin git@github.com:csaybar/tacoreader.git
|
|
44
|
+
git push -u origin main
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Finally, install the environment and the pre-commit hooks with
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
make install
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
You are now ready to start development on your project!
|
|
54
|
+
The CI/CD pipeline will be triggered when you open a pull request, merge to main, or when you create a new release.
|
|
55
|
+
|
|
56
|
+
To finalize the set-up for publishing to PyPI or Artifactory, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/publishing/#set-up-for-pypi).
|
|
57
|
+
For activating the automatic documentation with MkDocs, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/mkdocs/#enabling-the-documentation-on-github).
|
|
58
|
+
To enable the code coverage reports, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/codecov/).
|
|
59
|
+
|
|
60
|
+
## Releasing a new version
|
|
61
|
+
|
|
62
|
+
- Create an API Token on [PyPI](https://pypi.org/).
|
|
63
|
+
- Add the API Token to your projects secrets with the name `PYPI_TOKEN` by visiting [this page](https://github.com/csaybar/tacoreader/settings/secrets/actions/new).
|
|
64
|
+
- Create a [new release](https://github.com/csaybar/tacoreader/releases/new) on Github.
|
|
65
|
+
- Create a new tag in the form `*.*.*`.
|
|
66
|
+
- For more details, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/cicd/#how-to-trigger-a-release).
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
Repository initiated with [fpgmaas/cookiecutter-poetry](https://github.com/fpgmaas/cookiecutter-poetry).
|
|
71
|
+
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# tacoreader
|
|
2
|
+
|
|
3
|
+
[](https://img.shields.io/github/v/release/csaybar/tacoreader)
|
|
4
|
+
[](https://github.com/csaybar/tacoreader/actions/workflows/main.yml?query=branch%3Amain)
|
|
5
|
+
[](https://codecov.io/gh/csaybar/tacoreader)
|
|
6
|
+
[](https://img.shields.io/github/commit-activity/m/csaybar/tacoreader)
|
|
7
|
+
[](https://img.shields.io/github/license/csaybar/tacoreader)
|
|
8
|
+
|
|
9
|
+
Read TACO datasets
|
|
10
|
+
|
|
11
|
+
- **Github repository**: <https://github.com/csaybar/tacoreader/>
|
|
12
|
+
- **Documentation** <https://csaybar.github.io/tacoreader/>
|
|
13
|
+
|
|
14
|
+
## Getting started with your project
|
|
15
|
+
|
|
16
|
+
First, create a repository on GitHub with the same name as this project, and then run the following commands:
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
git init -b main
|
|
20
|
+
git add .
|
|
21
|
+
git commit -m "init commit"
|
|
22
|
+
git remote add origin git@github.com:csaybar/tacoreader.git
|
|
23
|
+
git push -u origin main
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Finally, install the environment and the pre-commit hooks with
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
make install
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
You are now ready to start development on your project!
|
|
33
|
+
The CI/CD pipeline will be triggered when you open a pull request, merge to main, or when you create a new release.
|
|
34
|
+
|
|
35
|
+
To finalize the set-up for publishing to PyPI or Artifactory, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/publishing/#set-up-for-pypi).
|
|
36
|
+
For activating the automatic documentation with MkDocs, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/mkdocs/#enabling-the-documentation-on-github).
|
|
37
|
+
To enable the code coverage reports, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/codecov/).
|
|
38
|
+
|
|
39
|
+
## Releasing a new version
|
|
40
|
+
|
|
41
|
+
- Create an API Token on [PyPI](https://pypi.org/).
|
|
42
|
+
- Add the API Token to your projects secrets with the name `PYPI_TOKEN` by visiting [this page](https://github.com/csaybar/tacoreader/settings/secrets/actions/new).
|
|
43
|
+
- Create a [new release](https://github.com/csaybar/tacoreader/releases/new) on Github.
|
|
44
|
+
- Create a new tag in the form `*.*.*`.
|
|
45
|
+
- For more details, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/cicd/#how-to-trigger-a-release).
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
Repository initiated with [fpgmaas/cookiecutter-poetry](https://github.com/fpgmaas/cookiecutter-poetry).
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "tacoreader"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A Python package to read Cloud-Optimized Datasets"
|
|
5
|
+
authors = ["Cesar Aybar <cesar.aybar@uv.es>"]
|
|
6
|
+
repository = "https://github.com/tacofoundation/tacoreader"
|
|
7
|
+
documentation = "https://tacofoundation.github.io/tacoreader/"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
packages = [
|
|
10
|
+
{include = "tacoreader"}
|
|
11
|
+
]
|
|
12
|
+
|
|
13
|
+
[tool.poetry.dependencies]
|
|
14
|
+
python = ">=3.9,<4.0"
|
|
15
|
+
pyarrow = ">=17.0.0"
|
|
16
|
+
geopandas = ">=1.0.1"
|
|
17
|
+
shapely = ">=2.0.6"
|
|
18
|
+
requests = ">=2.32.3"
|
|
19
|
+
|
|
20
|
+
[tool.poetry.group.dev.dependencies]
|
|
21
|
+
pytest = "^7.2.0"
|
|
22
|
+
pytest-cov = "^4.0.0"
|
|
23
|
+
deptry = "^0.16.2"
|
|
24
|
+
mypy = "^1.5.1"
|
|
25
|
+
pre-commit = "^3.4.0"
|
|
26
|
+
tox = "^4.11.1"
|
|
27
|
+
|
|
28
|
+
[tool.poetry.group.docs.dependencies]
|
|
29
|
+
mkdocs = "^1.4.2"
|
|
30
|
+
mkdocs-material = "^9.2.7"
|
|
31
|
+
mkdocstrings = {extras = ["python"], version = "^0.26.1"}
|
|
32
|
+
|
|
33
|
+
[build-system]
|
|
34
|
+
requires = ["poetry-core>=1.0.0"]
|
|
35
|
+
build-backend = "poetry.core.masonry.api"
|
|
36
|
+
|
|
37
|
+
[tool.mypy]
|
|
38
|
+
files = ["tacoreader"]
|
|
39
|
+
disallow_untyped_defs = "True"
|
|
40
|
+
disallow_any_unimported = "True"
|
|
41
|
+
no_implicit_optional = "True"
|
|
42
|
+
check_untyped_defs = "True"
|
|
43
|
+
warn_return_any = "True"
|
|
44
|
+
warn_unused_ignores = "True"
|
|
45
|
+
show_error_codes = "True"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
[tool.pytest.ini_options]
|
|
50
|
+
testpaths = ["tests"]
|
|
51
|
+
|
|
52
|
+
[tool.ruff]
|
|
53
|
+
target-version = "py39"
|
|
54
|
+
line-length = 120
|
|
55
|
+
fix = true
|
|
56
|
+
select = [
|
|
57
|
+
# flake8-2020
|
|
58
|
+
"YTT",
|
|
59
|
+
# flake8-bandit
|
|
60
|
+
"S",
|
|
61
|
+
# flake8-bugbear
|
|
62
|
+
"B",
|
|
63
|
+
# flake8-builtins
|
|
64
|
+
"A",
|
|
65
|
+
# flake8-comprehensions
|
|
66
|
+
"C4",
|
|
67
|
+
# flake8-debugger
|
|
68
|
+
"T10",
|
|
69
|
+
# flake8-simplify
|
|
70
|
+
"SIM",
|
|
71
|
+
# isort
|
|
72
|
+
"I",
|
|
73
|
+
# mccabe
|
|
74
|
+
"C90",
|
|
75
|
+
# pycodestyle
|
|
76
|
+
"E", "W",
|
|
77
|
+
# pyflakes
|
|
78
|
+
"F",
|
|
79
|
+
# pygrep-hooks
|
|
80
|
+
"PGH",
|
|
81
|
+
# pyupgrade
|
|
82
|
+
"UP",
|
|
83
|
+
# ruff
|
|
84
|
+
"RUF",
|
|
85
|
+
# tryceratops
|
|
86
|
+
"TRY",
|
|
87
|
+
]
|
|
88
|
+
ignore = [
|
|
89
|
+
# LineTooLong
|
|
90
|
+
"E501",
|
|
91
|
+
# DoNotAssignLambda
|
|
92
|
+
"E731",
|
|
93
|
+
]
|
|
94
|
+
|
|
95
|
+
[tool.ruff.format]
|
|
96
|
+
preview = true
|
|
97
|
+
|
|
98
|
+
[tool.coverage.report]
|
|
99
|
+
skip_empty = true
|
|
100
|
+
|
|
101
|
+
[tool.coverage.run]
|
|
102
|
+
branch = true
|
|
103
|
+
source = ["tacoreader"]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
[tool.ruff.per-file-ignores]
|
|
107
|
+
"tests/*" = ["S101"]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from tacoreader.croissant import read_croissant
|
|
2
|
+
from tacoreader.datacard import read_datacard
|
|
3
|
+
from tacoreader.datacite import read_datacite
|
|
4
|
+
from tacoreader.load import load, load_metadata
|
|
5
|
+
from tacoreader.stac import read_stac
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"load",
|
|
9
|
+
"load_metadata",
|
|
10
|
+
"read_datacard",
|
|
11
|
+
"read_stac",
|
|
12
|
+
"read_croissant",
|
|
13
|
+
"read_datacite",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
__version__ = "0.1.0"
|
|
17
|
+
|
|
18
|
+
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
from typing import List, Union
|
|
3
|
+
|
|
4
|
+
from tacoreader.load import load_metadata
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
from mdutils.mdutils import MdUtils
|
|
8
|
+
except ImportError:
|
|
9
|
+
raise ImportError("Please install the mdutils package with: pip install mdutils")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def read_datacard(
|
|
13
|
+
file: Union[str, pathlib.Path, List[pathlib.Path], List[str]],
|
|
14
|
+
outfile: Union[str, pathlib.Path],
|
|
15
|
+
) -> pathlib.Path:
|
|
16
|
+
|
|
17
|
+
# Create a JSON file
|
|
18
|
+
taco_object: dict = load_metadata(file)
|
|
19
|
+
|
|
20
|
+
# Convert the output file to a pathlib.Path object
|
|
21
|
+
if isinstance(outfile, str):
|
|
22
|
+
outfile = pathlib.Path(outfile)
|
|
23
|
+
|
|
24
|
+
# Create the README.md file
|
|
25
|
+
md_file = MdUtils(file_name=outfile)
|
|
26
|
+
|
|
27
|
+
# --- YAML Header ---
|
|
28
|
+
md_file.new_line("---")
|
|
29
|
+
md_file.new_line("license:")
|
|
30
|
+
for item in taco_object["licenses"]:
|
|
31
|
+
md_file.new_line(f" - {item}")
|
|
32
|
+
md_file.new_line("language:")
|
|
33
|
+
md_file.new_line("- en")
|
|
34
|
+
if taco_object["keywords"]:
|
|
35
|
+
md_file.new_line("tags:")
|
|
36
|
+
for tag in taco_object["keywords"]:
|
|
37
|
+
md_file.new_line(f" - {tag}")
|
|
38
|
+
md_file.new_line(f'pretty_name: {taco_object["id"]}')
|
|
39
|
+
md_file.new_line("---")
|
|
40
|
+
|
|
41
|
+
# --- Title and Description ---
|
|
42
|
+
md_file.new_header(level=1, title=taco_object["id"])
|
|
43
|
+
if taco_object["title"]:
|
|
44
|
+
md_file.new_line(f'**{taco_object["title"]}**', bold_italics_code="b")
|
|
45
|
+
md_file.new_paragraph(taco_object["description"])
|
|
46
|
+
|
|
47
|
+
# --- Code Snippet ---
|
|
48
|
+
md_file.new_header(level=2, title="🌮 TACO Snippet")
|
|
49
|
+
md_file.new_paragraph("Load this dataset using the `tacoreader` library.")
|
|
50
|
+
md_file.new_line("```python")
|
|
51
|
+
md_file.new_line("import tacoreader")
|
|
52
|
+
md_file.new_line("dataset = tacoreader.load('...')")
|
|
53
|
+
md_file.new_line("```")
|
|
54
|
+
|
|
55
|
+
# example in R
|
|
56
|
+
md_file.new_line("\n")
|
|
57
|
+
md_file.new_paragraph("Or in R:")
|
|
58
|
+
md_file.new_line("```r")
|
|
59
|
+
md_file.new_line("library(tacoreader)")
|
|
60
|
+
md_file.new_line("dataset <- tacoreader::load('...')")
|
|
61
|
+
md_file.new_line("```")
|
|
62
|
+
|
|
63
|
+
# --- Sensor Information ---
|
|
64
|
+
if taco_object["optical_data"]:
|
|
65
|
+
md_file.new_header(level=2, title="🛰️ Sensor Information")
|
|
66
|
+
md_file.new_paragraph(
|
|
67
|
+
f'The sensor related to the dataset: **{taco_object["optical_data"]["sensor"]}**'
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# --- Task Information ---
|
|
71
|
+
md_file.new_header(level=2, title="🎯 Task")
|
|
72
|
+
md_file.new_paragraph(
|
|
73
|
+
f'The task associated with this dataset: **{taco_object["task"]}**'
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# --- Raw Repository Link ---
|
|
77
|
+
if taco_object["raw_link"]:
|
|
78
|
+
md_file.new_header(level=2, title="📂 Original Data Repository")
|
|
79
|
+
md_file.new_paragraph(
|
|
80
|
+
f'Source location of the raw data:**{md_file.new_inline_link(link=taco_object["raw_link"]["href"])}**'
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
# --- Discussion Link ---
|
|
84
|
+
if taco_object["discuss_link"]:
|
|
85
|
+
md_file.new_header(level=2, title="💬 Discussion")
|
|
86
|
+
md_file.new_paragraph(
|
|
87
|
+
f'Insights or clarifications about the dataset: **{md_file.new_inline_link(link=taco_object["discuss_link"]["href"])}**'
|
|
88
|
+
)
|
|
89
|
+
|
|
90
|
+
# --- Split Strategy ---
|
|
91
|
+
if taco_object["split_strategy"]:
|
|
92
|
+
md_file.new_header(level=2, title="🔀 Split Strategy")
|
|
93
|
+
md_file.new_paragraph(
|
|
94
|
+
f'How the dataset is divided for training, validation, and testing: **{taco_object["split_strategy"]}**'
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# --- Scientific Publications ---
|
|
98
|
+
if taco_object["scientific"]["publications"]:
|
|
99
|
+
md_file.new_header(level=2, title="📚 Scientific Publications")
|
|
100
|
+
md_file.new_paragraph("Publications that reference or describe the dataset.")
|
|
101
|
+
for idx, pub in enumerate(taco_object["scientific"]["publications"], start=1):
|
|
102
|
+
# Add the publication information
|
|
103
|
+
md_file.new_paragraph(f"### Publication {idx:02d}")
|
|
104
|
+
md_file.new_line("- **DOI**: " + md_file.new_inline_link(link=pub["doi"]))
|
|
105
|
+
md_file.new_line("- **Summary**: " + pub["summary"])
|
|
106
|
+
md_file.new_line("- **BibTeX Citation**:")
|
|
107
|
+
md_file.new_line("```bibtex")
|
|
108
|
+
md_file.new_line(pub["citation"].strip("\n"))
|
|
109
|
+
md_file.new_line("```")
|
|
110
|
+
md_file.new_line("\n")
|
|
111
|
+
|
|
112
|
+
# --- Data Providers ---
|
|
113
|
+
if taco_object["providers"]:
|
|
114
|
+
md_file.new_header(level=2, title="🤝 Data Providers")
|
|
115
|
+
md_file.new_paragraph(
|
|
116
|
+
"Organizations or individuals responsible for the dataset."
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
# Define table headers
|
|
120
|
+
table_headers = ["**Name**", "**Role**", "**URL**"]
|
|
121
|
+
table_data = [table_headers]
|
|
122
|
+
|
|
123
|
+
# Populate table with provider data
|
|
124
|
+
for provider in taco_object["providers"]:
|
|
125
|
+
table_data.append(
|
|
126
|
+
[
|
|
127
|
+
provider["name"] or "N/A",
|
|
128
|
+
(
|
|
129
|
+
", ".join(provider["roles"])
|
|
130
|
+
if isinstance(provider["roles"], list)
|
|
131
|
+
else provider["roles"] or "N/A"
|
|
132
|
+
),
|
|
133
|
+
(
|
|
134
|
+
md_file.new_inline_link(link=provider["links"][0]["href"])
|
|
135
|
+
if provider["links"]
|
|
136
|
+
else "N/A"
|
|
137
|
+
),
|
|
138
|
+
]
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# Flatten the list for Markdown formatting
|
|
142
|
+
flat_table_data = [cell for row in table_data for cell in row]
|
|
143
|
+
|
|
144
|
+
# Create the table
|
|
145
|
+
md_file.new_table(
|
|
146
|
+
columns=3, rows=len(table_data), text=flat_table_data, text_align="left"
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
# --- Curators ---
|
|
150
|
+
if taco_object["curators"]:
|
|
151
|
+
md_file.new_header(level=2, title="🧑🔬 Curators")
|
|
152
|
+
md_file.new_paragraph(
|
|
153
|
+
"Responsible for structuring the dataset in the TACO format."
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
# Define table headers
|
|
157
|
+
table_headers = ["**Name**", "**Organization**", "**URL**"]
|
|
158
|
+
table_data = [table_headers]
|
|
159
|
+
|
|
160
|
+
# Populate table with curator data
|
|
161
|
+
for curator in taco_object["curators"]:
|
|
162
|
+
table_data.append(
|
|
163
|
+
[
|
|
164
|
+
curator["name"] or "N/A",
|
|
165
|
+
curator["organization"] or "N/A",
|
|
166
|
+
(
|
|
167
|
+
md_file.new_inline_link(link=curator["links"][0]["href"])
|
|
168
|
+
if curator["links"]
|
|
169
|
+
else "N/A"
|
|
170
|
+
),
|
|
171
|
+
]
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
# Flatten the list for Markdown formatting
|
|
175
|
+
flat_table_data = [cell for row in table_data for cell in row]
|
|
176
|
+
# Create the table
|
|
177
|
+
md_file.new_table(
|
|
178
|
+
columns=3, rows=len(table_data), text=flat_table_data, text_align="left"
|
|
179
|
+
)
|
|
180
|
+
|
|
181
|
+
# --- Labels ---
|
|
182
|
+
if taco_object["labels"]:
|
|
183
|
+
md_file.new_header(level=2, title="🏷️ Labels")
|
|
184
|
+
md_file.new_paragraph(taco_object["labels"]["label_description"])
|
|
185
|
+
table_headers = ["**Name**", "**Category**", "**Description**"]
|
|
186
|
+
table_data = [table_headers]
|
|
187
|
+
for item in taco_object["labels"]["label_classes"]:
|
|
188
|
+
table_data.append(
|
|
189
|
+
[
|
|
190
|
+
item["name"] or "N/A",
|
|
191
|
+
(
|
|
192
|
+
str(item["category"]) if item["category"] is not None else "N/A"
|
|
193
|
+
), # Zero is a valid category
|
|
194
|
+
item["description"] or "N/A",
|
|
195
|
+
]
|
|
196
|
+
)
|
|
197
|
+
|
|
198
|
+
# Flatten the list for Markdown formatting
|
|
199
|
+
flat_table_data = [cell for row in table_data for cell in row]
|
|
200
|
+
md_file.new_table(
|
|
201
|
+
columns=3, rows=len(table_data), text=flat_table_data, text_align="left"
|
|
202
|
+
)
|
|
203
|
+
|
|
204
|
+
# --- Optical Bands ---
|
|
205
|
+
if taco_object["optical_data"]:
|
|
206
|
+
md_file.new_header(level=2, title="🌈 Optical Bands")
|
|
207
|
+
md_file.new_paragraph("Spectral bands related to the sensor.")
|
|
208
|
+
table_headers = [
|
|
209
|
+
"**Name**",
|
|
210
|
+
"**Common Name**",
|
|
211
|
+
"**Description**",
|
|
212
|
+
"**Center Wavelength**",
|
|
213
|
+
"**Full Width Half Max**",
|
|
214
|
+
"**Index**",
|
|
215
|
+
]
|
|
216
|
+
table_data = [table_headers]
|
|
217
|
+
|
|
218
|
+
for item in taco_object["optical_data"]["bands"]:
|
|
219
|
+
table_data.append(
|
|
220
|
+
[
|
|
221
|
+
item["name"] or "N/A",
|
|
222
|
+
item["common_name"] or "N/A",
|
|
223
|
+
item["description"] or "N/A",
|
|
224
|
+
item["center_wavelength"] or "N/A",
|
|
225
|
+
item["full_width_half_max"] or "N/A",
|
|
226
|
+
str(item["index"]) if item["index"] is not None else "N/A",
|
|
227
|
+
]
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
# Flatten the list for Markdown formatting
|
|
231
|
+
flat_table_data = [cell for row in table_data for cell in row]
|
|
232
|
+
md_file.new_table(
|
|
233
|
+
columns=6, rows=len(table_data), text=flat_table_data, text_align="left"
|
|
234
|
+
)
|
|
235
|
+
|
|
236
|
+
# Export the data to the output file
|
|
237
|
+
file = md_file.get_md_text().replace("\n\n\n \n", "").replace(" \n", "\n")
|
|
238
|
+
with open(outfile, "w") as f:
|
|
239
|
+
f.write(file)
|
|
240
|
+
|
|
241
|
+
return outfile
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
import pathlib
|
|
2
|
+
import re
|
|
3
|
+
from typing import List, Union
|
|
4
|
+
|
|
5
|
+
import geopandas as gpd
|
|
6
|
+
import pandas as pd
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from tacoreader import load_local, load_remote, load_utils
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load(file: Union[str, pathlib.Path, List[pathlib.Path], List[str]]) -> pd.DataFrame:
|
|
13
|
+
"""Load the dataframe of a tortilla file.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
file (Union[str, pathlib.Path, List]): The path of
|
|
17
|
+
the tortilla file. If the file is split into multiple
|
|
18
|
+
parts, a list of paths is accepted. Also, multiple
|
|
19
|
+
parts can be read by putting a asterisk (*) at the end
|
|
20
|
+
of the file name. For example, "file*.tortilla". In this
|
|
21
|
+
case, the function will create a list will all the partitions
|
|
22
|
+
before the reading process.
|
|
23
|
+
|
|
24
|
+
Returns:
|
|
25
|
+
pd.DataFrame: The dataframe of the tortilla file.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
# Transform our snippet into a list of files
|
|
29
|
+
# If it is not a snippet, it will return the same file
|
|
30
|
+
file = load_utils.snippet2files(file=file)
|
|
31
|
+
|
|
32
|
+
if isinstance(file, list):
|
|
33
|
+
if load_utils.is_valid_url(file):
|
|
34
|
+
dataframe = load_remote.remote_files2dataframe(file)
|
|
35
|
+
else:
|
|
36
|
+
dataframe = load_local.local_files2dataframe(file)
|
|
37
|
+
elif isinstance(file, (str, pathlib.Path)):
|
|
38
|
+
if load_utils.is_valid_url(file):
|
|
39
|
+
dataframe = load_remote.remote_file2dataframe(file)
|
|
40
|
+
else:
|
|
41
|
+
dataframe = load_local.local_file2dataframe(file)
|
|
42
|
+
else:
|
|
43
|
+
raise ValueError("Invalid file type. Must be a list, string or pathlib.Path.")
|
|
44
|
+
|
|
45
|
+
# Clean up the dataframe
|
|
46
|
+
dataframe = load_utils.sort_columns_add_geometry(dataframe)
|
|
47
|
+
|
|
48
|
+
return TortillaDataFrame(dataframe)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def lazy_load(offset: int, file: Union[str, pathlib.Path]) -> pd.DataFrame:
|
|
52
|
+
"""Lazy load a tortilla file.
|
|
53
|
+
|
|
54
|
+
Useful for datasets that have tortillas as samples (tortillas inside tortillas).
|
|
55
|
+
The offset is used to read a specific part of the main tortilla file.
|
|
56
|
+
|
|
57
|
+
Args:
|
|
58
|
+
offset (int): The byte offset where the reading process will start.
|
|
59
|
+
file (Union[str, pathlib.Path]): The path tot the main tortilla file.
|
|
60
|
+
|
|
61
|
+
Returns:
|
|
62
|
+
pd.DataFrame: The dataframe of the tortilla file.
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
if load_utils.is_valid_url(file):
|
|
66
|
+
dataframe = load_remote.remote_lazyfile2dataframe(offset, file)
|
|
67
|
+
else:
|
|
68
|
+
dataframe = load_local.local_lazyfile2dataframe(offset, file)
|
|
69
|
+
|
|
70
|
+
# Clean up the dataframe
|
|
71
|
+
dataframe = load_utils.sort_columns_add_geometry(dataframe)
|
|
72
|
+
|
|
73
|
+
return TortillaDataFrame(dataframe)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def load_metadata(
|
|
77
|
+
file: Union[str, pathlib.Path, List[pathlib.Path], List[str]]
|
|
78
|
+
) -> dict:
|
|
79
|
+
"""Load the metadata of a tortilla or taco file.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
file (Union[str, pathlib.Path, List]): The path of
|
|
83
|
+
the taco file. If the file is split into multiple
|
|
84
|
+
parts, a list of paths is accepted. Also, multiple
|
|
85
|
+
parts can be read by putting a asterisk (*) at the end
|
|
86
|
+
of the file name. For example, "file*.tortilla". In this
|
|
87
|
+
case, the function will create a list will all the partitions
|
|
88
|
+
before the reading process.
|
|
89
|
+
|
|
90
|
+
Returns:
|
|
91
|
+
dict: The metadata of the taco file.
|
|
92
|
+
"""
|
|
93
|
+
|
|
94
|
+
# Transform our snippet into a list of files
|
|
95
|
+
# If it is not a snippet, it will return the same file
|
|
96
|
+
file = load_utils.snippet2files(file=file)
|
|
97
|
+
|
|
98
|
+
# Load the metadata
|
|
99
|
+
if isinstance(file, list):
|
|
100
|
+
if load_utils.is_valid_url(file):
|
|
101
|
+
metadata = load_remote.remote_files2metadata(file)
|
|
102
|
+
else:
|
|
103
|
+
metadata = load_local.local_files2metadata(file)
|
|
104
|
+
elif isinstance(file, (str, pathlib.Path)):
|
|
105
|
+
if load_utils.is_valid_url(file):
|
|
106
|
+
metadata = load_remote.remote_file2metadata(file)
|
|
107
|
+
else:
|
|
108
|
+
metadata = load_local.local_file2metadata(file)
|
|
109
|
+
else:
|
|
110
|
+
raise ValueError("Invalid file type. Must be a list, string or pathlib.Path.")
|
|
111
|
+
|
|
112
|
+
return metadata
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class TortillaDataFrame(gpd.GeoDataFrame):
|
|
116
|
+
@property
|
|
117
|
+
def _constructor(self):
|
|
118
|
+
return TortillaDataFrame
|
|
119
|
+
|
|
120
|
+
@staticmethod
|
|
121
|
+
def get_internal_path(row):
|
|
122
|
+
pattern: re.Pattern = re.compile(r"/vsisubfile/(\d+)_(\d+),(.+)")
|
|
123
|
+
offset, length, path = pattern.match(row["internal:subfile"]).groups()
|
|
124
|
+
|
|
125
|
+
# If it is a curl file, remove the first 9 characters
|
|
126
|
+
if path.startswith("/vsicurl/"):
|
|
127
|
+
path = path[9:]
|
|
128
|
+
|
|
129
|
+
return int(offset), int(length), path
|
|
130
|
+
|
|
131
|
+
def read(self, idx):
|
|
132
|
+
row = self.iloc[idx]
|
|
133
|
+
if row["internal:file_format"] == "TORTILLA":
|
|
134
|
+
offset, length, path = self.get_internal_path(row)
|
|
135
|
+
return lazy_load(row["tortilla:offset"], path)
|
|
136
|
+
elif row["internal:file_format"] == "BYTES":
|
|
137
|
+
|
|
138
|
+
# Obtain the offset, length and internal path
|
|
139
|
+
offset, length, path = self.get_internal_path(row)
|
|
140
|
+
|
|
141
|
+
# Get the bytes
|
|
142
|
+
if load_utils.is_valid_url(path):
|
|
143
|
+
headers = {"Range": f"bytes={offset}-{offset + length - 1}"}
|
|
144
|
+
response: requests.Response = requests.get(path, headers=headers)
|
|
145
|
+
return response.content
|
|
146
|
+
else:
|
|
147
|
+
with open(path, "rb") as f:
|
|
148
|
+
f.seek(int(offset))
|
|
149
|
+
return f.read(int(length))
|
|
150
|
+
else:
|
|
151
|
+
return row["internal:subfile"]
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pathlib
|
|
3
|
+
from typing import List, Union
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import pyarrow as pa
|
|
7
|
+
import pyarrow.parquet as pq
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def local_file2dataframe(file: Union[str, pathlib.Path]):
|
|
11
|
+
"""Read the dataframe of tortilla file given a local path.
|
|
12
|
+
|
|
13
|
+
Args:
|
|
14
|
+
files (Union[str, pathlib.Path]): A local path pointing to the
|
|
15
|
+
tortilla file.
|
|
16
|
+
|
|
17
|
+
Returns:
|
|
18
|
+
pd.DataFrame: The dataframe of the tortilla file.
|
|
19
|
+
"""
|
|
20
|
+
with open(file, "rb") as f:
|
|
21
|
+
static_bytes = f.read(50)
|
|
22
|
+
|
|
23
|
+
# SPLIT the static bytes
|
|
24
|
+
MB: bytes = static_bytes[:2]
|
|
25
|
+
FO: bytes = static_bytes[2:10]
|
|
26
|
+
FL: bytes = static_bytes[10:18]
|
|
27
|
+
DF: str = static_bytes[18:42].strip().decode()
|
|
28
|
+
# DP: str = static_bytes[42:50]
|
|
29
|
+
|
|
30
|
+
if MB != b"#y":
|
|
31
|
+
raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
|
|
32
|
+
|
|
33
|
+
# Read the NEXT 8 bytes of the file
|
|
34
|
+
footer_offset: int = int.from_bytes(FO, "little")
|
|
35
|
+
|
|
36
|
+
# Seek to the FOOTER offset
|
|
37
|
+
f.seek(footer_offset)
|
|
38
|
+
|
|
39
|
+
# Select the FOOTER length
|
|
40
|
+
# Read the FOOTER
|
|
41
|
+
footer_length: int = int.from_bytes(FL, "little")
|
|
42
|
+
dataframe = pq.read_table(pa.BufferReader(f.read(footer_length))).to_pandas()
|
|
43
|
+
|
|
44
|
+
# Convert dataset to DataFrame
|
|
45
|
+
dataframe["internal:file_format"] = DF
|
|
46
|
+
dataframe["internal:mode"] = "local"
|
|
47
|
+
dataframe["internal:subfile"] = dataframe.apply(
|
|
48
|
+
lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},{file}",
|
|
49
|
+
axis=1,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
return dataframe
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def local_files2dataframe(files: Union[List[str], List[pathlib.Path]]):
|
|
56
|
+
"""Read the dataframe of tortilla files given local paths.
|
|
57
|
+
|
|
58
|
+
Args:
|
|
59
|
+
files (Union[List[str], List[pathlib.Path]]): A list of local
|
|
60
|
+
paths pointing to the tortilla files.
|
|
61
|
+
|
|
62
|
+
Returns:
|
|
63
|
+
pd.DataFrame: The dataframe of the tortilla file.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
# Merge the dataframe of the files
|
|
67
|
+
container = []
|
|
68
|
+
for file in files:
|
|
69
|
+
with open(file, "rb") as f:
|
|
70
|
+
static_bytes = f.read(50)
|
|
71
|
+
|
|
72
|
+
# SPLIT the static bytes
|
|
73
|
+
MB: bytes = static_bytes[:2]
|
|
74
|
+
FO: bytes = static_bytes[2:10]
|
|
75
|
+
FL: bytes = static_bytes[10:18]
|
|
76
|
+
DF: str = static_bytes[18:42].strip().decode()
|
|
77
|
+
# DP: str = static_bytes[42:50]
|
|
78
|
+
|
|
79
|
+
if MB != b"#y":
|
|
80
|
+
raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
|
|
81
|
+
|
|
82
|
+
# Read the NEXT 8 bytes of the file
|
|
83
|
+
footer_offset: int = int.from_bytes(FO, "little")
|
|
84
|
+
|
|
85
|
+
# Seek to the FOOTER offset
|
|
86
|
+
f.seek(footer_offset)
|
|
87
|
+
|
|
88
|
+
# Select the FOOTER length
|
|
89
|
+
# Read the FOOTER
|
|
90
|
+
footer_length: int = int.from_bytes(FL, "little")
|
|
91
|
+
dataframe = pq.read_table(
|
|
92
|
+
pa.BufferReader(f.read(footer_length))
|
|
93
|
+
).to_pandas()
|
|
94
|
+
|
|
95
|
+
# Convert dataset to DataFrame
|
|
96
|
+
dataframe["internal:file_format"] = DF
|
|
97
|
+
dataframe["internal:mode"] = "local"
|
|
98
|
+
dataframe["internal:subfile"] = dataframe.apply(
|
|
99
|
+
lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},{file}",
|
|
100
|
+
axis=1,
|
|
101
|
+
)
|
|
102
|
+
container.append(dataframe)
|
|
103
|
+
|
|
104
|
+
return pd.concat(container, ignore_index=True)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def local_lazyfile2dataframe(
    offset: int, file: Union[str, pathlib.Path]
) -> pd.DataFrame:
    """Read the dataframe of a tortilla file that is a subfile
    of a larger file.

    Args:
        offset (int): The offset of the subfile.
        file (Union[str, pathlib.Path]): A local path pointing to the
            main tortilla file.

    Returns:
        pd.DataFrame: The dataframe of the tortilla file.
    """
    with open(file, "rb") as handle:
        # Position at the start of the embedded tortilla.
        handle.seek(offset)
        header = handle.read(50)

        # Decompose the fixed-size 50-byte header.
        magic = header[:2]
        # The footer offset stored in the header is relative to the
        # subfile, so rebase it onto the parent file.
        footer_offset = int.from_bytes(header[2:10], "little") + offset
        footer_length = int.from_bytes(header[10:18], "little")
        file_format = header[18:42].strip().decode()
        # header[42:50] holds the data-partition field (unused here).

        if magic != b"#y":
            raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")

        # Jump to the footer and parse it as a parquet table.
        handle.seek(footer_offset)
        dataframe = pq.read_table(
            pa.BufferReader(handle.read(footer_length))
        ).to_pandas()

    # Item offsets are relative to the subfile; rebase them to the parent.
    dataframe["tortilla:offset"] = dataframe["tortilla:offset"] + offset

    # Attach bookkeeping columns used by downstream readers.
    dataframe["internal:file_format"] = file_format
    dataframe["internal:mode"] = "local"
    dataframe["internal:subfile"] = dataframe.apply(
        lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},{file}",
        axis=1,
    )

    return dataframe
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def local_file2metadata(file: Union[str, pathlib.Path]) -> dict:
    """Read the metadata of a taco file given a local path.

    Args:
        file (Union[str, pathlib.Path]): A local path pointing to the
            taco file.

    Returns:
        dict: The metadata of the taco file.
    """
    with open(file, "rb") as handle:
        # The collection pointer lives right after the 50-byte header:
        # 8 bytes of offset (CO) followed by 8 bytes of length (CL).
        handle.seek(50)
        collection_offset = int.from_bytes(handle.read(8), "little")
        collection_length = int.from_bytes(handle.read(8), "little")

        # Jump to the collection and decode it (JSON, UTF-8).
        handle.seek(collection_offset)
        return json.loads(handle.read(collection_length).decode())
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def local_files2metadata(files: Union[List[str], List[pathlib.Path]]) -> dict:
    """Read the metadata of taco files given local paths.

    Args:
        files (Union[List[str], List[pathlib.Path]]): A list of local
            paths pointing to the taco files.

    Returns:
        dict: The metadata of the taco file.
    """
    # All parts of a multi-part taco share the same collection metadata,
    # so reading it from the first file is enough.
    # BUG FIX: the original called local_file2dataframe, which returns the
    # footer DataFrame — not the collection metadata dict this function
    # (and its return annotation) promises.
    return local_file2metadata(files[0])
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import pathlib
|
|
3
|
+
from typing import List, Union
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
import pyarrow as pa
|
|
7
|
+
import pyarrow.parquet as pq
|
|
8
|
+
import requests
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def remote_file2dataframe(file: str) -> pd.DataFrame:
    """Read the dataframe of a tortilla file given a URL. The
    server must support HTTP Range requests.

    Args:
        file (str): A URL pointing to the tortilla file.

    Returns:
        pd.DataFrame: The dataframe of the tortilla file.
    """
    # Download the fixed-size header.
    response: requests.Response = requests.get(file, headers={"Range": "bytes=0-50"})
    header: bytes = response.content

    # Decompose the header fields.
    magic: bytes = header[:2]
    footer_offset: int = int.from_bytes(header[2:10], "little")
    footer_length: int = int.from_bytes(header[10:18], "little")
    file_format: str = header[18:42].strip().decode()

    # Abort early when the magic bytes do not match.
    if magic != b"#y":
        raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")

    # Download the footer (HTTP ranges are inclusive, hence the -1)
    # and parse it as a parquet table.
    byte_range = f"bytes={footer_offset}-{footer_offset + footer_length - 1}"
    with requests.get(file, headers={"Range": byte_range}) as footer_response:
        dataframe = pq.read_table(pa.BufferReader(footer_response.content)).to_pandas()

    # Attach bookkeeping columns used by downstream readers.
    dataframe["internal:file_format"] = file_format
    dataframe["internal:mode"] = "online"
    dataframe["internal:subfile"] = dataframe.apply(
        lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},/vsicurl/{file}",
        axis=1,
    )
    return dataframe
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def remote_files2dataframe(files: List[str]) -> pd.DataFrame:
    """Read the dataframe of tortillas files given a set of URLs. The
    server must support HTTP Range requests.

    Args:
        files (List[str]): A list of URLs pointing to the
            tortilla files.

    Returns:
        pd.DataFrame: The dataframe of the tortilla file.
    """
    container = []
    for file in files:

        # Fetch the fixed-size header of the file.
        headers = {"Range": "bytes=0-50"}
        response: requests.Response = requests.get(file, headers=headers)
        static_bytes: bytes = response.content

        # SPLIT the static bytes
        MB: bytes = static_bytes[:2]
        FO: bytes = static_bytes[2:10]
        FL: bytes = static_bytes[10:18]
        DF: str = static_bytes[18:42].strip().decode()

        # Check if the file is a tortilla
        if MB != b"#y":
            raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")

        # Interpret the bytes as a little-endian integer
        footer_offset: int = int.from_bytes(FO, "little")
        footer_length: int = int.from_bytes(FL, "little")

        # Fetch the footer.
        # BUG FIX: HTTP Range headers are inclusive, so the end byte must
        # be offset + length - 1; the original fetched one extra byte,
        # corrupting the parquet buffer (remote_file2dataframe already
        # uses the correct inclusive bound).
        headers = {"Range": f"bytes={footer_offset}-{footer_offset + footer_length - 1}"}
        with requests.get(file, headers=headers) as response:

            # Interpret the response as a parquet table
            dataframe = pq.read_table(pa.BufferReader(response.content)).to_pandas()

        # Add the file format and mode
        dataframe["internal:file_format"] = DF
        dataframe["internal:mode"] = "online"
        dataframe["internal:subfile"] = dataframe.apply(
            lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},/vsicurl/{file}",
            axis=1,
        )
        container.append(dataframe)

    return pd.concat(container, ignore_index=True)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def remote_lazyfile2dataframe(
    offset: int, file: Union[str, pathlib.Path]
) -> pd.DataFrame:
    """Read the dataframe of a tortilla file that is a subfile
    of a larger file.

    Args:
        offset (int): The offset of the subfile.
        file (Union[str, pathlib.Path]): A URL pointing to the
            main tortilla file.

    Returns:
        pd.DataFrame: The dataframe of the tortilla file.
    """
    # Download the header of the embedded tortilla.
    start, stop = offset, offset + 50
    response: requests.Response = requests.get(
        file, headers={"Range": f"bytes={start}-{stop}"}
    )
    header: bytes = response.content

    # Decompose the header fields. The stored footer offset is relative
    # to the subfile, so rebase it onto the parent file.
    magic: bytes = header[:2]
    footer_offset: int = int.from_bytes(header[2:10], "little") + offset
    footer_length: int = int.from_bytes(header[10:18], "little")
    file_format: str = header[18:42].strip().decode()

    # Abort early when the magic bytes do not match.
    if magic != b"#y":
        raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")

    # Download the footer (HTTP ranges are inclusive, hence the -1)
    # and parse it as a parquet table.
    byte_range = f"bytes={footer_offset}-{footer_offset + footer_length - 1}"
    with requests.get(file, headers={"Range": byte_range}) as footer_response:
        dataframe = pq.read_table(pa.BufferReader(footer_response.content)).to_pandas()

    # Rebase item offsets from subfile-relative to parent-relative.
    dataframe["tortilla:offset"] = dataframe["tortilla:offset"] + offset

    # Attach bookkeeping columns used by downstream readers.
    dataframe["internal:file_format"] = file_format
    dataframe["internal:mode"] = "online"
    dataframe["internal:subfile"] = dataframe.apply(
        lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},/vsicurl/{file}",
        axis=1,
    )

    return dataframe
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def remote_file2metadata(file: str) -> dict:
    """Read the metadata of a taco file given a URL. The
    server must support HTTP Range requests.

    Args:
        file (str): A URL pointing to the taco file.

    Returns:
        dict: The metadata of the taco file.
    """
    # Fetch the header plus the 16-byte collection pointer (bytes 50-65).
    headers = {"Range": "bytes=0-66"}
    response: requests.Response = requests.get(file, headers=headers)
    static_bytes: bytes = response.content

    # SPLIT the static bytes
    MB: bytes = static_bytes[:2]
    CO: int = int.from_bytes(static_bytes[50:58], "little")
    CL: int = int.from_bytes(static_bytes[58:66], "little")

    # Check if the file is a tortilla
    if MB != b"#y":
        raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")

    # Read the Collection (JSON UTF-8 encoded).
    # BUG FIX: HTTP Range headers are inclusive, so the end byte must be
    # CO + CL - 1; the original fetched one extra trailing byte, which
    # breaks json.loads (the lazy/dataframe readers already use the
    # correct inclusive bound).
    headers = {"Range": f"bytes={CO}-{CO + CL - 1}"}
    collection: dict = json.loads(requests.get(file, headers=headers).content.decode())

    return collection
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def remote_files2metadata(files: List[str]) -> dict:
    """Read the metadata of taco files given a set of URLs. The server
    must support HTTP Range requests.

    Args:
        files (List[str]): A list of URLs pointing to the
            taco files.

    Returns:
        dict: The metadata of the taco file.
    """
    # Every part of a multi-part taco carries the same collection
    # metadata, so the first URL is representative.
    first_url = files[0]
    return remote_file2metadata(first_url)
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
import pathlib
import re
import urllib
import urllib.parse
from typing import List, Tuple, Union

import geopandas as gpd
import requests
import shapely.wkt
|
|
10
|
+
|
|
11
|
+
def is_valid_url(url: Union[str, List[str]]) -> bool:
    """Check if a URL or list of URLs is valid.

    Args:
        url (Union[str, List[str]]): The URL(s) to check. It can
            be a single URL or a list of URLs.

    Returns:
        bool: True if all URLs are valid, False otherwise.
    """
    # Concrete filesystem paths are never URLs.
    if isinstance(url, pathlib.Path):
        return False

    # Validate element-wise when given a list.
    if isinstance(url, list):
        return all(is_valid_url(item) for item in url)

    try:
        parts = urllib.parse.urlparse(url)
    except ValueError:
        return False
    # Valid means an http(s) scheme plus a non-empty network location.
    return parts.scheme in ("http", "https") and bool(parts.netloc)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def split_name_and_path(file: Union[str, pathlib.Path]) -> Tuple[str, str]:
    """
    Split a file path or URL into its name and path components.

    Args:
        file (Union[str, pathlib.Path]): The input file or URL.

    Raises:
        ValueError: If the input is neither a string nor a pathlib.Path.

    Returns:
        Tuple[str, str]: A tuple containing the name and path.
    """
    if not isinstance(file, (str, pathlib.Path)):
        raise ValueError("Input must be a string or pathlib.Path.")

    # BUG FIX: urlparse requires str/bytes and raises for pathlib.Path,
    # even though the signature accepts Path — coerce to str first.
    parsed = urllib.parse.urlparse(str(file))
    if parsed.scheme in {"http", "https"}:
        # URL: last path segment is the name, the rest is the base URL.
        url_path = pathlib.PurePosixPath(parsed.path)
        name = url_path.name
        path = f"{parsed.scheme}://{parsed.netloc}{url_path.parent.as_posix()}"
    else:
        # Local filesystem path.
        file_path = pathlib.Path(file)
        name = file_path.name
        path = file_path.parent.as_posix()
    return name, path
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def snippet2files(
    file: Union[str, pathlib.Path, List[str], List[pathlib.Path]]
) -> Union[List[pathlib.Path], List[str], str, pathlib.Path]:
    """Convert snippets of a multi-part file to a list of files.

    Args:
        file (Union[str, pathlib.Path, List[str], List[pathlib.Path]]): A file, a
            list of files, or a snippet of a multi-part file.

    Raises:
        FileNotFoundError: If a partial file is missing.

    Returns:
        List[pathlib.Path]: A list of files or a single path. The path can be a
            local path or a URL.
    """
    # A list is already an explicit enumeration of the parts.
    if isinstance(file, list):
        return file

    # Only "*.tortilla" snippets need expansion; anything else passes through.
    if not re.match(r".*\*\.tortilla$", str(file)):
        return file

    # Split into name and containing path.
    name, path = split_name_and_path(file)

    # Drop the snippet suffix (i.e., "*.tortilla") to recover the base name.
    filename: str = re.sub(r"\*\.tortilla$", "", name)

    if is_valid_url(path):
        # Remote: the first part's header (bytes 42-50) stores the number
        # of partitions; every part is expected under the same URL path.
        dumbfile: str = f"{path}/(unknown).0000.part.tortilla"
        headers = {"Range": "bytes=42-50"}
        response: requests.Response = requests.get(dumbfile, headers=headers)
        npartitions: int = int.from_bytes(response.content, "little")

        # Build the URL of every partition.
        return [
            f"{path}/(unknown).{str(d).zfill(4)}.part.tortilla"
            for d in range(npartitions)
        ]

    # Local: the multi-part files live next to the snippet path.
    file = pathlib.Path(file)
    filename = pathlib.Path(filename)
    parent = file.resolve().parent

    # Read the number of partitions from the first part's header.
    dumbfile = parent / (filename.stem + ".0000.part.tortilla")
    with open(dumbfile, "rb") as f:
        f.seek(42)
        npartitions = int.from_bytes(f.read(8), "little")

    # Collect every partition, failing fast when one is missing.
    files = []
    for d in range(npartitions):
        # BUG FIX: resolve each part against the snippet's directory (as is
        # done for `dumbfile` above); the original built a bare filename and
        # checked existence relative to the current working directory.
        partial_file = parent / (filename.stem + f".{str(d).zfill(4)}.part.tortilla")
        if not partial_file.exists():
            raise FileNotFoundError(f"Missing partial file: {partial_file}")
        files.append(partial_file)

    return files
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def sort_columns_add_geometry(metadata):
    """Sort the columns of a metadata DataFrame.
    Also, convert the "stac:centroid" column to a geometry column.

    Args:
        metadata (pd.DataFrame): The metadata DataFrame.

    Returns:
        pd.DataFrame: The metadata DataFrame with sorted columns.
    """
    # Promote the WKT centroid column to a proper geometry column.
    if "stac:centroid" in metadata.columns:
        metadata = gpd.GeoDataFrame(
            data=metadata,
            geometry=metadata["stac:centroid"].apply(shapely.wkt.loads),
            crs="EPSG:4326",
        )

    all_columns = metadata.columns
    # Known namespaces come first, in this fixed order.
    ordered = []
    for prefix in ("internal:", "tortilla:", "stac:", "rai:"):
        ordered.extend(col for col in all_columns if col.startswith(prefix))
    # Then everything else, with "geometry" pinned to the end when present.
    remaining = [col for col in all_columns if col not in ordered and col != "geometry"]
    final = ordered + remaining
    if "geometry" in all_columns:
        final.append("geometry")
    return metadata[final]
|