tacoreader-0.1.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@ LICENSE
+ MIT License
+
+ Copyright (c) 2024, Cesar Aybar
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,71 @@ PKG-INFO
+ Metadata-Version: 2.1
+ Name: tacoreader
+ Version: 0.1.0
+ Summary: A Python package to read Cloud-Optimized Datasets
+ Home-page: https://github.com/tacofoundation/tacoreader
+ Author: Cesar Aybar
+ Author-email: cesar.aybar@uv.es
+ Requires-Python: >=3.9,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Requires-Dist: geopandas (>=1.0.1)
+ Requires-Dist: pyarrow (>=17.0.0)
+ Requires-Dist: requests (>=2.32.3)
+ Requires-Dist: shapely (>=2.0.6)
+ Project-URL: Documentation, https://tacofoundation.github.io/tacoreader/
+ Project-URL: Repository, https://github.com/tacofoundation/tacoreader
+ Description-Content-Type: text/markdown
+
+ # tacoreader
+
+ [![Release](https://img.shields.io/github/v/release/csaybar/tacoreader)](https://img.shields.io/github/v/release/csaybar/tacoreader)
+ [![Build status](https://img.shields.io/github/actions/workflow/status/csaybar/tacoreader/main.yml?branch=main)](https://github.com/csaybar/tacoreader/actions/workflows/main.yml?query=branch%3Amain)
+ [![codecov](https://codecov.io/gh/csaybar/tacoreader/branch/main/graph/badge.svg)](https://codecov.io/gh/csaybar/tacoreader)
+ [![Commit activity](https://img.shields.io/github/commit-activity/m/csaybar/tacoreader)](https://img.shields.io/github/commit-activity/m/csaybar/tacoreader)
+ [![License](https://img.shields.io/github/license/csaybar/tacoreader)](https://img.shields.io/github/license/csaybar/tacoreader)
+
+ Read TACO datasets.
+
+ - **GitHub repository**: <https://github.com/csaybar/tacoreader/>
+ - **Documentation**: <https://csaybar.github.io/tacoreader/>
+
+ ## Getting started with your project
+
+ First, create a repository on GitHub with the same name as this project, and then run the following commands:
+
+ ```bash
+ git init -b main
+ git add .
+ git commit -m "init commit"
+ git remote add origin git@github.com:csaybar/tacoreader.git
+ git push -u origin main
+ ```
+
+ Finally, install the environment and the pre-commit hooks with
+
+ ```bash
+ make install
+ ```
+
+ You are now ready to start development on your project!
+ The CI/CD pipeline will be triggered when you open a pull request, merge to main, or when you create a new release.
+
+ To finalize the set-up for publishing to PyPI or Artifactory, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/publishing/#set-up-for-pypi).
+ For activating the automatic documentation with MkDocs, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/mkdocs/#enabling-the-documentation-on-github).
+ To enable the code coverage reports, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/codecov/).
+
+ ## Releasing a new version
+
+ - Create an API token on [PyPI](https://pypi.org/).
+ - Add the API token to your project's secrets with the name `PYPI_TOKEN` by visiting [this page](https://github.com/csaybar/tacoreader/settings/secrets/actions/new).
+ - Create a [new release](https://github.com/csaybar/tacoreader/releases/new) on GitHub.
+ - Create a new tag in the form `*.*.*`.
+ - For more details, see [here](https://fpgmaas.github.io/cookiecutter-poetry/features/cicd/#how-to-trigger-a-release).
+
+ ---
+
+ Repository initiated with [fpgmaas/cookiecutter-poetry](https://github.com/fpgmaas/cookiecutter-poetry).
@@ -0,0 +1,49 @@ README.md
*(Identical, verbatim, to the Markdown long description embedded in PKG-INFO above.)*
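The README never shows the reader itself in action, so a minimal usage sketch may help. It relies only on the public API exported in `tacoreader/__init__.py` below; the dataset path is a hypothetical placeholder:

```python
# Sketch only: "demo.taco" is a hypothetical local dataset path.
import tacoreader

# The sample table of a tortilla/taco file, as a GeoDataFrame subclass.
dataset = tacoreader.load("demo.taco")

# The collection-level metadata of the same file, as a plain dict.
metadata = tacoreader.load_metadata("demo.taco")

print(dataset.head())
print(metadata.get("id"))
```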
@@ -0,0 +1,107 @@ pyproject.toml
+ [tool.poetry]
+ name = "tacoreader"
+ version = "0.1.0"
+ description = "A Python package to read Cloud-Optimized Datasets"
+ authors = ["Cesar Aybar <cesar.aybar@uv.es>"]
+ repository = "https://github.com/tacofoundation/tacoreader"
+ documentation = "https://tacofoundation.github.io/tacoreader/"
+ readme = "README.md"
+ packages = [
+     {include = "tacoreader"}
+ ]
+
+ [tool.poetry.dependencies]
+ python = ">=3.9,<4.0"
+ pyarrow = ">=17.0.0"
+ geopandas = ">=1.0.1"
+ shapely = ">=2.0.6"
+ requests = ">=2.32.3"
+
+ [tool.poetry.group.dev.dependencies]
+ pytest = "^7.2.0"
+ pytest-cov = "^4.0.0"
+ deptry = "^0.16.2"
+ mypy = "^1.5.1"
+ pre-commit = "^3.4.0"
+ tox = "^4.11.1"
+
+ [tool.poetry.group.docs.dependencies]
+ mkdocs = "^1.4.2"
+ mkdocs-material = "^9.2.7"
+ mkdocstrings = {extras = ["python"], version = "^0.26.1"}
+
+ [build-system]
+ requires = ["poetry-core>=1.0.0"]
+ build-backend = "poetry.core.masonry.api"
+
+ [tool.mypy]
+ files = ["tacoreader"]
+ disallow_untyped_defs = "True"
+ disallow_any_unimported = "True"
+ no_implicit_optional = "True"
+ check_untyped_defs = "True"
+ warn_return_any = "True"
+ warn_unused_ignores = "True"
+ show_error_codes = "True"
+
+ [tool.pytest.ini_options]
+ testpaths = ["tests"]
+
+ [tool.ruff]
+ target-version = "py39"
+ line-length = 120
+ fix = true
+ select = [
+     # flake8-2020
+     "YTT",
+     # flake8-bandit
+     "S",
+     # flake8-bugbear
+     "B",
+     # flake8-builtins
+     "A",
+     # flake8-comprehensions
+     "C4",
+     # flake8-debugger
+     "T10",
+     # flake8-simplify
+     "SIM",
+     # isort
+     "I",
+     # mccabe
+     "C90",
+     # pycodestyle
+     "E", "W",
+     # pyflakes
+     "F",
+     # pygrep-hooks
+     "PGH",
+     # pyupgrade
+     "UP",
+     # ruff
+     "RUF",
+     # tryceratops
+     "TRY",
+ ]
+ ignore = [
+     # LineTooLong
+     "E501",
+     # DoNotAssignLambda
+     "E731",
+ ]
+
+ [tool.ruff.format]
+ preview = true
+
+ [tool.ruff.per-file-ignores]
+ "tests/*" = ["S101"]
+
+ [tool.coverage.report]
+ skip_empty = true
+
+ [tool.coverage.run]
+ branch = true
+ source = ["tacoreader"]
@@ -0,0 +1,18 @@ tacoreader/__init__.py
+ from tacoreader.croissant import read_croissant
+ from tacoreader.datacard import read_datacard
+ from tacoreader.datacite import read_datacite
+ from tacoreader.load import load, load_metadata
+ from tacoreader.stac import read_stac
+
+ __all__ = [
+     "load",
+     "load_metadata",
+     "read_datacard",
+     "read_stac",
+     "read_croissant",
+     "read_datacite",
+ ]
+
+ __version__ = "0.1.0"
1
+ def read_croissant():
2
+ print("Do not implement it yet")
@@ -0,0 +1,241 @@ tacoreader/datacard.py
+ import pathlib
+ from typing import List, Union
+
+ from tacoreader.load import load_metadata
+
+ try:
+     from mdutils.mdutils import MdUtils
+ except ImportError as e:
+     raise ImportError("Please install the mdutils package with: pip install mdutils") from e
+
+
+ def read_datacard(
+     file: Union[str, pathlib.Path, List[pathlib.Path], List[str]],
+     outfile: Union[str, pathlib.Path],
+ ) -> pathlib.Path:
+     """Render the metadata of a taco file as a Markdown datacard."""
+     # Load the collection metadata as a dictionary
+     taco_object: dict = load_metadata(file)
+
+     # Convert the output file to a pathlib.Path object
+     if isinstance(outfile, str):
+         outfile = pathlib.Path(outfile)
+
+     # Create the README.md file
+     md_file = MdUtils(file_name=outfile)
+
+     # --- YAML Header ---
+     md_file.new_line("---")
+     md_file.new_line("license:")
+     for item in taco_object["licenses"]:
+         md_file.new_line(f"  - {item}")
+     md_file.new_line("language:")
+     md_file.new_line("- en")
+     if taco_object["keywords"]:
+         md_file.new_line("tags:")
+         for tag in taco_object["keywords"]:
+             md_file.new_line(f"  - {tag}")
+     md_file.new_line(f'pretty_name: {taco_object["id"]}')
+     md_file.new_line("---")
+
+     # --- Title and Description ---
+     md_file.new_header(level=1, title=taco_object["id"])
+     if taco_object["title"]:
+         md_file.new_line(f'**{taco_object["title"]}**', bold_italics_code="b")
+     md_file.new_paragraph(taco_object["description"])
+
+     # --- Code Snippet ---
+     md_file.new_header(level=2, title="🌮 TACO Snippet")
+     md_file.new_paragraph("Load this dataset using the `tacoreader` library.")
+     md_file.new_line("```python")
+     md_file.new_line("import tacoreader")
+     md_file.new_line("dataset = tacoreader.load('...')")
+     md_file.new_line("```")
+
+     # Example in R
+     md_file.new_line("\n")
+     md_file.new_paragraph("Or in R:")
+     md_file.new_line("```r")
+     md_file.new_line("library(tacoreader)")
+     md_file.new_line("dataset <- tacoreader::load('...')")
+     md_file.new_line("```")
+
+     # --- Sensor Information ---
+     if taco_object["optical_data"]:
+         md_file.new_header(level=2, title="🛰️ Sensor Information")
+         md_file.new_paragraph(
+             f'The sensor related to the dataset: **{taco_object["optical_data"]["sensor"]}**'
+         )
+
+     # --- Task Information ---
+     md_file.new_header(level=2, title="🎯 Task")
+     md_file.new_paragraph(
+         f'The task associated with this dataset: **{taco_object["task"]}**'
+     )
+
+     # --- Raw Repository Link ---
+     if taco_object["raw_link"]:
+         md_file.new_header(level=2, title="📂 Original Data Repository")
+         md_file.new_paragraph(
+             f'Source location of the raw data: **{md_file.new_inline_link(link=taco_object["raw_link"]["href"])}**'
+         )
+
+     # --- Discussion Link ---
+     if taco_object["discuss_link"]:
+         md_file.new_header(level=2, title="💬 Discussion")
+         md_file.new_paragraph(
+             f'Insights or clarifications about the dataset: **{md_file.new_inline_link(link=taco_object["discuss_link"]["href"])}**'
+         )
+
+     # --- Split Strategy ---
+     if taco_object["split_strategy"]:
+         md_file.new_header(level=2, title="🔀 Split Strategy")
+         md_file.new_paragraph(
+             f'How the dataset is divided for training, validation, and testing: **{taco_object["split_strategy"]}**'
+         )
+
+     # --- Scientific Publications ---
+     if taco_object["scientific"]["publications"]:
+         md_file.new_header(level=2, title="📚 Scientific Publications")
+         md_file.new_paragraph("Publications that reference or describe the dataset.")
+         for idx, pub in enumerate(taco_object["scientific"]["publications"], start=1):
+             # Add the publication information
+             md_file.new_paragraph(f"### Publication {idx:02d}")
+             md_file.new_line("- **DOI**: " + md_file.new_inline_link(link=pub["doi"]))
+             md_file.new_line("- **Summary**: " + pub["summary"])
+             md_file.new_line("- **BibTeX Citation**:")
+             md_file.new_line("```bibtex")
+             md_file.new_line(pub["citation"].strip("\n"))
+             md_file.new_line("```")
+             md_file.new_line("\n")
+
+     # --- Data Providers ---
+     if taco_object["providers"]:
+         md_file.new_header(level=2, title="🤝 Data Providers")
+         md_file.new_paragraph(
+             "Organizations or individuals responsible for the dataset."
+         )
+
+         # Define table headers
+         table_headers = ["**Name**", "**Role**", "**URL**"]
+         table_data = [table_headers]
+
+         # Populate table with provider data
+         for provider in taco_object["providers"]:
+             table_data.append(
+                 [
+                     provider["name"] or "N/A",
+                     (
+                         ", ".join(provider["roles"])
+                         if isinstance(provider["roles"], list)
+                         else provider["roles"] or "N/A"
+                     ),
+                     (
+                         md_file.new_inline_link(link=provider["links"][0]["href"])
+                         if provider["links"]
+                         else "N/A"
+                     ),
+                 ]
+             )
+
+         # Flatten the list for Markdown formatting
+         flat_table_data = [cell for row in table_data for cell in row]
+
+         # Create the table
+         md_file.new_table(
+             columns=3, rows=len(table_data), text=flat_table_data, text_align="left"
+         )
+
+     # --- Curators ---
+     if taco_object["curators"]:
+         md_file.new_header(level=2, title="🧑‍🔬 Curators")
+         md_file.new_paragraph(
+             "Responsible for structuring the dataset in the TACO format."
+         )
+
+         # Define table headers
+         table_headers = ["**Name**", "**Organization**", "**URL**"]
+         table_data = [table_headers]
+
+         # Populate table with curator data
+         for curator in taco_object["curators"]:
+             table_data.append(
+                 [
+                     curator["name"] or "N/A",
+                     curator["organization"] or "N/A",
+                     (
+                         md_file.new_inline_link(link=curator["links"][0]["href"])
+                         if curator["links"]
+                         else "N/A"
+                     ),
+                 ]
+             )
+
+         # Flatten the list for Markdown formatting
+         flat_table_data = [cell for row in table_data for cell in row]
+         # Create the table
+         md_file.new_table(
+             columns=3, rows=len(table_data), text=flat_table_data, text_align="left"
+         )
+
+     # --- Labels ---
+     if taco_object["labels"]:
+         md_file.new_header(level=2, title="🏷️ Labels")
+         md_file.new_paragraph(taco_object["labels"]["label_description"])
+         table_headers = ["**Name**", "**Category**", "**Description**"]
+         table_data = [table_headers]
+         for item in taco_object["labels"]["label_classes"]:
+             table_data.append(
+                 [
+                     item["name"] or "N/A",
+                     (
+                         str(item["category"]) if item["category"] is not None else "N/A"
+                     ),  # Zero is a valid category
+                     item["description"] or "N/A",
+                 ]
+             )
+
+         # Flatten the list for Markdown formatting
+         flat_table_data = [cell for row in table_data for cell in row]
+         md_file.new_table(
+             columns=3, rows=len(table_data), text=flat_table_data, text_align="left"
+         )
+
+     # --- Optical Bands ---
+     if taco_object["optical_data"]:
+         md_file.new_header(level=2, title="🌈 Optical Bands")
+         md_file.new_paragraph("Spectral bands related to the sensor.")
+         table_headers = [
+             "**Name**",
+             "**Common Name**",
+             "**Description**",
+             "**Center Wavelength**",
+             "**Full Width Half Max**",
+             "**Index**",
+         ]
+         table_data = [table_headers]
+
+         for item in taco_object["optical_data"]["bands"]:
+             table_data.append(
+                 [
+                     item["name"] or "N/A",
+                     item["common_name"] or "N/A",
+                     item["description"] or "N/A",
+                     item["center_wavelength"] or "N/A",
+                     item["full_width_half_max"] or "N/A",
+                     str(item["index"]) if item["index"] is not None else "N/A",
+                 ]
+             )
+
+         # Flatten the list for Markdown formatting
+         flat_table_data = [cell for row in table_data for cell in row]
+         md_file.new_table(
+             columns=6, rows=len(table_data), text=flat_table_data, text_align="left"
+         )
+
+     # Export the Markdown text to the output file
+     # (use a new name instead of shadowing the `file` argument)
+     md_text = md_file.get_md_text().replace("\n\n\n \n", "").replace(" \n", "\n")
+     with open(outfile, "w") as f:
+         f.write(md_text)
+
+     return outfile
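A usage sketch for `read_datacard` (both file names are hypothetical placeholders); remember that `mdutils` is required but not declared in `pyproject.toml`:

```python
# Sketch only: "demo.taco" and "CARD.md" are hypothetical paths.
from tacoreader import read_datacard

card_path = read_datacard(file="demo.taco", outfile="CARD.md")
print(f"Datacard written to {card_path}")
```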
@@ -0,0 +1,2 @@ tacoreader/datacite.py
+ def read_datacite():
+     raise NotImplementedError("read_datacite is not implemented yet")
@@ -0,0 +1,151 @@ tacoreader/load.py
+ import pathlib
+ import re
+ from typing import List, Union
+
+ import geopandas as gpd
+ import pandas as pd
+ import requests
+
+ from tacoreader import load_local, load_remote, load_utils
+
+
+ def load(file: Union[str, pathlib.Path, List[pathlib.Path], List[str]]) -> pd.DataFrame:
+     """Load the dataframe of a tortilla file.
+
+     Args:
+         file (Union[str, pathlib.Path, List]): The path of
+             the tortilla file. If the file is split into multiple
+             parts, a list of paths is accepted. Multiple parts can
+             also be read by putting an asterisk (*) at the end of
+             the file name, e.g. "file*.tortilla". In this case, the
+             function will build a list with all the partitions
+             before reading.
+
+     Returns:
+         pd.DataFrame: The dataframe of the tortilla file.
+     """
+
+     # Transform a snippet into a list of files.
+     # If it is not a snippet, the same file is returned.
+     file = load_utils.snippet2files(file=file)
+
+     if isinstance(file, list):
+         if load_utils.is_valid_url(file):
+             dataframe = load_remote.remote_files2dataframe(file)
+         else:
+             dataframe = load_local.local_files2dataframe(file)
+     elif isinstance(file, (str, pathlib.Path)):
+         if load_utils.is_valid_url(file):
+             dataframe = load_remote.remote_file2dataframe(file)
+         else:
+             dataframe = load_local.local_file2dataframe(file)
+     else:
+         raise ValueError("Invalid file type. Must be a list, string or pathlib.Path.")
+
+     # Clean up the dataframe
+     dataframe = load_utils.sort_columns_add_geometry(dataframe)
+
+     return TortillaDataFrame(dataframe)
+
+
+ def lazy_load(offset: int, file: Union[str, pathlib.Path]) -> pd.DataFrame:
+     """Lazily load a tortilla file.
+
+     Useful for datasets that have tortillas as samples (tortillas inside
+     tortillas). The offset is used to read a specific part of the main
+     tortilla file.
+
+     Args:
+         offset (int): The byte offset where the reading process will start.
+         file (Union[str, pathlib.Path]): The path to the main tortilla file.
+
+     Returns:
+         pd.DataFrame: The dataframe of the tortilla file.
+     """
+
+     if load_utils.is_valid_url(file):
+         dataframe = load_remote.remote_lazyfile2dataframe(offset, file)
+     else:
+         dataframe = load_local.local_lazyfile2dataframe(offset, file)
+
+     # Clean up the dataframe
+     dataframe = load_utils.sort_columns_add_geometry(dataframe)
+
+     return TortillaDataFrame(dataframe)
+
+
+ def load_metadata(
+     file: Union[str, pathlib.Path, List[pathlib.Path], List[str]]
+ ) -> dict:
+     """Load the metadata of a tortilla or taco file.
+
+     Args:
+         file (Union[str, pathlib.Path, List]): The path of
+             the taco file. If the file is split into multiple
+             parts, a list of paths is accepted. Multiple parts can
+             also be read by putting an asterisk (*) at the end of
+             the file name, e.g. "file*.tortilla". In this case, the
+             function will build a list with all the partitions
+             before reading.
+
+     Returns:
+         dict: The metadata of the taco file.
+     """
+
+     # Transform a snippet into a list of files.
+     # If it is not a snippet, the same file is returned.
+     file = load_utils.snippet2files(file=file)
+
+     # Load the metadata
+     if isinstance(file, list):
+         if load_utils.is_valid_url(file):
+             metadata = load_remote.remote_files2metadata(file)
+         else:
+             metadata = load_local.local_files2metadata(file)
+     elif isinstance(file, (str, pathlib.Path)):
+         if load_utils.is_valid_url(file):
+             metadata = load_remote.remote_file2metadata(file)
+         else:
+             metadata = load_local.local_file2metadata(file)
+     else:
+         raise ValueError("Invalid file type. Must be a list, string or pathlib.Path.")
+
+     return metadata
+
+
+ class TortillaDataFrame(gpd.GeoDataFrame):
+     @property
+     def _constructor(self):
+         return TortillaDataFrame
+
+     @staticmethod
+     def get_internal_path(row):
+         """Parse a /vsisubfile/<offset>_<length>,<path> descriptor."""
+         pattern: re.Pattern = re.compile(r"/vsisubfile/(\d+)_(\d+),(.+)")
+         offset, length, path = pattern.match(row["internal:subfile"]).groups()
+
+         # If it is a curl path, remove the "/vsicurl/" prefix (9 characters)
+         if path.startswith("/vsicurl/"):
+             path = path[9:]
+
+         return int(offset), int(length), path
+
+     def read(self, idx):
+         """Read the sample at position idx."""
+         row = self.iloc[idx]
+         if row["internal:file_format"] == "TORTILLA":
+             # Nested tortilla: load it lazily at its byte offset
+             _, _, path = self.get_internal_path(row)
+             return lazy_load(row["tortilla:offset"], path)
+         elif row["internal:file_format"] == "BYTES":
+             # Obtain the offset, length and internal path
+             offset, length, path = self.get_internal_path(row)
+
+             # Get the bytes
+             if load_utils.is_valid_url(path):
+                 headers = {"Range": f"bytes={offset}-{offset + length - 1}"}
+                 response: requests.Response = requests.get(path, headers=headers)
+                 return response.content
+             else:
+                 with open(path, "rb") as f:
+                     f.seek(offset)
+                     return f.read(length)
+         else:
+             return row["internal:subfile"]
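A hedged sketch of the access pattern `TortillaDataFrame.read` enables (the dataset path is hypothetical): `BYTES` samples come back as raw bytes, nested `TORTILLA` samples come back as another `TortillaDataFrame`, and anything else falls through to the `internal:subfile` string:

```python
# Sketch only: "demo.tortilla" is a hypothetical dataset path.
import tacoreader

dataset = tacoreader.load("demo.tortilla")
sample = dataset.read(0)

if isinstance(sample, bytes):
    # BYTES sample: the raw payload sliced out of the container file.
    print(f"sample 0 is {len(sample)} raw bytes")
elif hasattr(sample, "read"):
    # TORTILLA sample: a nested TortillaDataFrame, readable recursively.
    print(sample.read(0))
```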
@@ -0,0 +1,202 @@ tacoreader/load_local.py
+ import json
+ import pathlib
+ from typing import List, Union
+
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+
+ def local_file2dataframe(file: Union[str, pathlib.Path]) -> pd.DataFrame:
+     """Read the dataframe of a tortilla file given a local path.
+
+     Args:
+         file (Union[str, pathlib.Path]): A local path pointing to the
+             tortilla file.
+
+     Returns:
+         pd.DataFrame: The dataframe of the tortilla file.
+     """
+     with open(file, "rb") as f:
+         static_bytes = f.read(50)
+
+         # Split the 50-byte static header
+         MB: bytes = static_bytes[:2]  # magic bytes
+         FO: bytes = static_bytes[2:10]  # footer offset
+         FL: bytes = static_bytes[10:18]  # footer length
+         DF: str = static_bytes[18:42].strip().decode()  # data format
+         # DP: bytes = static_bytes[42:50]  # number of partitions
+
+         if MB != b"#y":
+             raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
+
+         # Interpret the footer offset as a little-endian integer
+         footer_offset: int = int.from_bytes(FO, "little")
+
+         # Seek to the footer offset
+         f.seek(footer_offset)
+
+         # Read the footer (a Parquet table) and convert it to a DataFrame
+         footer_length: int = int.from_bytes(FL, "little")
+         dataframe = pq.read_table(pa.BufferReader(f.read(footer_length))).to_pandas()
+
+     # Add the internal columns
+     dataframe["internal:file_format"] = DF
+     dataframe["internal:mode"] = "local"
+     dataframe["internal:subfile"] = dataframe.apply(
+         lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},{file}",
+         axis=1,
+     )
+
+     return dataframe
+
+
+ def local_files2dataframe(files: Union[List[str], List[pathlib.Path]]) -> pd.DataFrame:
+     """Read the dataframe of tortilla files given local paths.
+
+     Args:
+         files (Union[List[str], List[pathlib.Path]]): A list of local
+             paths pointing to the tortilla files.
+
+     Returns:
+         pd.DataFrame: The concatenated dataframe of the tortilla files.
+     """
+
+     # Merge the dataframes of the files
+     container = []
+     for file in files:
+         with open(file, "rb") as f:
+             static_bytes = f.read(50)
+
+             # Split the 50-byte static header
+             MB: bytes = static_bytes[:2]  # magic bytes
+             FO: bytes = static_bytes[2:10]  # footer offset
+             FL: bytes = static_bytes[10:18]  # footer length
+             DF: str = static_bytes[18:42].strip().decode()  # data format
+             # DP: bytes = static_bytes[42:50]  # number of partitions
+
+             if MB != b"#y":
+                 raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
+
+             # Interpret the footer offset as a little-endian integer
+             footer_offset: int = int.from_bytes(FO, "little")
+
+             # Seek to the footer offset
+             f.seek(footer_offset)
+
+             # Read the footer (a Parquet table) and convert it to a DataFrame
+             footer_length: int = int.from_bytes(FL, "little")
+             dataframe = pq.read_table(
+                 pa.BufferReader(f.read(footer_length))
+             ).to_pandas()
+
+         # Add the internal columns
+         dataframe["internal:file_format"] = DF
+         dataframe["internal:mode"] = "local"
+         dataframe["internal:subfile"] = dataframe.apply(
+             lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},{file}",
+             axis=1,
+         )
+         container.append(dataframe)
+
+     return pd.concat(container, ignore_index=True)
+
+
+ def local_lazyfile2dataframe(
+     offset: int, file: Union[str, pathlib.Path]
+ ) -> pd.DataFrame:
+     """Read the dataframe of a tortilla file that is a subfile
+     of a larger file.
+
+     Args:
+         offset (int): The offset of the subfile.
+         file (Union[str, pathlib.Path]): A local path pointing to the
+             main tortilla file.
+
+     Returns:
+         pd.DataFrame: The dataframe of the tortilla file.
+     """
+
+     with open(file, "rb") as f:
+         # Seek to the subfile offset
+         f.seek(offset)
+
+         static_bytes = f.read(50)
+
+         # Split the 50-byte static header
+         MB: bytes = static_bytes[:2]  # magic bytes
+         FO: bytes = static_bytes[2:10]  # footer offset
+         FL: bytes = static_bytes[10:18]  # footer length
+         DF: str = static_bytes[18:42].strip().decode()  # data format
+         # DP: bytes = static_bytes[42:50]  # number of partitions
+
+         if MB != b"#y":
+             raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
+
+         # The footer offset is relative to the subfile, so add the outer offset
+         footer_offset: int = int.from_bytes(FO, "little") + offset
+
+         # Seek to the footer offset
+         f.seek(footer_offset)
+
+         # Read the footer (a Parquet table) and convert it to a DataFrame
+         footer_length: int = int.from_bytes(FL, "little")
+         dataframe = pq.read_table(pa.BufferReader(f.read(footer_length))).to_pandas()
+
+     # Fix the offset
+     dataframe["tortilla:offset"] = dataframe["tortilla:offset"] + offset
+
+     # Add the internal columns
+     dataframe["internal:file_format"] = DF
+     dataframe["internal:mode"] = "local"
+     dataframe["internal:subfile"] = dataframe.apply(
+         lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},{file}",
+         axis=1,
+     )
+
+     return dataframe
+
+
+ def local_file2metadata(file: Union[str, pathlib.Path]) -> dict:
+     """Read the metadata of a taco file given a local path.
+
+     Args:
+         file (Union[str, pathlib.Path]): A local path pointing to the
+             taco file.
+
+     Returns:
+         dict: The metadata of the taco file.
+     """
+     with open(file, "rb") as f:
+         # Skip the 50-byte static header
+         f.seek(50)
+
+         # Read the Collection offset (CO)
+         CO: int = int.from_bytes(f.read(8), "little")
+
+         # Read the Collection length (CL)
+         CL: int = int.from_bytes(f.read(8), "little")
+
+         # Seek to the Collection offset
+         f.seek(CO)
+
+         # Read the Collection (JSON, UTF-8 encoded)
+         collection: dict = json.loads(f.read(CL).decode())
+
+     return collection
+
+
+ def local_files2metadata(files: Union[List[str], List[pathlib.Path]]) -> dict:
+     """Read the metadata of taco files given local paths.
+
+     The collection metadata lives in the first partition, so only
+     files[0] is read.
+
+     Args:
+         files (Union[List[str], List[pathlib.Path]]): A list of local
+             paths pointing to the taco files.
+
+     Returns:
+         dict: The metadata of the taco file.
+     """
+     return local_file2metadata(files[0])
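The 50-byte static header parsed throughout this module has a fixed layout: 2 magic bytes (`#y`), an 8-byte little-endian footer offset, an 8-byte footer length, a 24-byte padded data-format string, and 8 bytes that `snippet2files` (in `load_utils.py` below) reads as the partition count. A minimal sketch that inspects a header without touching the footer (the file name is hypothetical):

```python
# Sketch only: inspect the 50-byte tortilla/taco header.
# "demo.tortilla" is a hypothetical local file.
with open("demo.tortilla", "rb") as f:
    header = f.read(50)

magic = header[:2]                                      # must be b"#y"
footer_offset = int.from_bytes(header[2:10], "little")  # where the Parquet footer starts
footer_length = int.from_bytes(header[10:18], "little") # how many bytes it spans
data_format = header[18:42].strip().decode()            # e.g. "TORTILLA" or "BYTES"
n_partitions = int.from_bytes(header[42:50], "little")  # parts of a multi-part file

assert magic == b"#y", "not a tortilla/taco file"
print(footer_offset, footer_length, data_format, n_partitions)
```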
@@ -0,0 +1,205 @@ tacoreader/load_remote.py
+ import json
+ import pathlib
+ from typing import List, Union
+
+ import pandas as pd
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+ import requests
+
+
+ def remote_file2dataframe(file: str) -> pd.DataFrame:
+     """Read the dataframe of a tortilla file given a URL. The
+     server must support HTTP Range requests.
+
+     Args:
+         file (str): A URL pointing to the tortilla file.
+
+     Returns:
+         pd.DataFrame: The dataframe of the tortilla file.
+     """
+     # Fetch the 50-byte static header (Range ends are inclusive)
+     headers = {"Range": "bytes=0-49"}
+     response: requests.Response = requests.get(file, headers=headers)
+     static_bytes: bytes = response.content
+
+     # Split the static header
+     MB: bytes = static_bytes[:2]  # magic bytes
+     FO: bytes = static_bytes[2:10]  # footer offset
+     FL: bytes = static_bytes[10:18]  # footer length
+     DF: str = static_bytes[18:42].strip().decode()  # data format
+
+     # Check if the file is a tortilla
+     if MB != b"#y":
+         raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
+
+     # Interpret the bytes as little-endian integers
+     footer_offset: int = int.from_bytes(FO, "little")
+     footer_length: int = int.from_bytes(FL, "little")
+
+     # Fetch the footer
+     headers = {"Range": f"bytes={footer_offset}-{footer_offset + footer_length - 1}"}
+     with requests.get(file, headers=headers) as response:
+         # Interpret the response as a parquet table
+         dataframe = pq.read_table(pa.BufferReader(response.content)).to_pandas()
+
+     # Add the file format and mode
+     dataframe["internal:file_format"] = DF
+     dataframe["internal:mode"] = "online"
+     dataframe["internal:subfile"] = dataframe.apply(
+         lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},/vsicurl/{file}",
+         axis=1,
+     )
+     return dataframe
+
+
+ def remote_files2dataframe(files: List[str]) -> pd.DataFrame:
+     """Read the dataframe of tortilla files given a set of URLs. The
+     server must support HTTP Range requests.
+
+     Args:
+         files (List[str]): A list of URLs pointing to the
+             tortilla files.
+
+     Returns:
+         pd.DataFrame: The concatenated dataframe of the tortilla files.
+     """
+
+     container = []
+     for file in files:
+         # Fetch the 50-byte static header
+         headers = {"Range": "bytes=0-49"}
+         response: requests.Response = requests.get(file, headers=headers)
+         static_bytes: bytes = response.content
+
+         # Split the static header
+         MB: bytes = static_bytes[:2]
+         FO: bytes = static_bytes[2:10]
+         FL: bytes = static_bytes[10:18]
+         DF: str = static_bytes[18:42].strip().decode()
+
+         # Check if the file is a tortilla
+         if MB != b"#y":
+             raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
+
+         # Interpret the bytes as little-endian integers
+         footer_offset: int = int.from_bytes(FO, "little")
+         footer_length: int = int.from_bytes(FL, "little")
+
+         # Fetch the footer (the range end is inclusive, hence the -1)
+         headers = {"Range": f"bytes={footer_offset}-{footer_offset + footer_length - 1}"}
+         with requests.get(file, headers=headers) as response:
+             # Interpret the response as a parquet table
+             dataframe = pq.read_table(pa.BufferReader(response.content)).to_pandas()
+
+         # Add the file format and mode
+         dataframe["internal:file_format"] = DF
+         dataframe["internal:mode"] = "online"
+         dataframe["internal:subfile"] = dataframe.apply(
+             lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},/vsicurl/{file}",
+             axis=1,
+         )
+         container.append(dataframe)
+
+     return pd.concat(container, ignore_index=True)
+
+
+ def remote_lazyfile2dataframe(
+     offset: int, file: Union[str, pathlib.Path]
+ ) -> pd.DataFrame:
+     """Read the dataframe of a tortilla file that is a subfile
+     of a larger file.
+
+     Args:
+         offset (int): The offset of the subfile.
+         file (Union[str, pathlib.Path]): A URL pointing to the
+             main tortilla file.
+
+     Returns:
+         pd.DataFrame: The dataframe of the tortilla file.
+     """
+
+     # Fetch the 50-byte static header of the subfile
+     initb, endb = offset, offset + 49
+     headers = {"Range": f"bytes={initb}-{endb}"}
+     response: requests.Response = requests.get(file, headers=headers)
+     static_bytes: bytes = response.content
+
+     # Split the static header
+     MB: bytes = static_bytes[:2]
+     FO: bytes = static_bytes[2:10]
+     FL: bytes = static_bytes[10:18]
+     DF: str = static_bytes[18:42].strip().decode()
+
+     # Check if the file is a tortilla
+     if MB != b"#y":
+         raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
+
+     # The footer offset is relative to the subfile, so add the outer offset
+     footer_offset: int = int.from_bytes(FO, "little") + offset
+     footer_length: int = int.from_bytes(FL, "little")
+
+     # Fetch the footer
+     headers = {"Range": f"bytes={footer_offset}-{footer_offset + footer_length - 1}"}
+     with requests.get(file, headers=headers) as response:
+         # Interpret the response as a parquet table
+         dataframe = pq.read_table(pa.BufferReader(response.content)).to_pandas()
+
+     # Fix the offset
+     dataframe["tortilla:offset"] = dataframe["tortilla:offset"] + offset
+
+     # Add the file format and mode
+     dataframe["internal:file_format"] = DF
+     dataframe["internal:mode"] = "online"
+     dataframe["internal:subfile"] = dataframe.apply(
+         lambda row: f"/vsisubfile/{row['tortilla:offset']}_{row['tortilla:length']},/vsicurl/{file}",
+         axis=1,
+     )
+
+     return dataframe
+
+
+ def remote_file2metadata(file: str) -> dict:
+     """Read the metadata of a taco file given a URL. The
+     server must support HTTP Range requests.
+
+     Args:
+         file (str): A URL pointing to the taco file.
+
+     Returns:
+         dict: The metadata of the taco file.
+     """
+     # Fetch the first 66 bytes: the 50-byte static header plus the
+     # Collection offset (CO) and Collection length (CL)
+     headers = {"Range": "bytes=0-65"}
+     response: requests.Response = requests.get(file, headers=headers)
+     static_bytes: bytes = response.content
+
+     # Split the static bytes
+     MB: bytes = static_bytes[:2]
+     CO: int = int.from_bytes(static_bytes[50:58], "little")
+     CL: int = int.from_bytes(static_bytes[58:66], "little")
+
+     # Check if the file is a tortilla
+     if MB != b"#y":
+         raise ValueError("You are not a tortilla 🫓 or a TACO 🌮")
+
+     # Read the Collection (JSON, UTF-8 encoded)
+     headers = {"Range": f"bytes={CO}-{CO + CL - 1}"}
+     collection: dict = json.loads(requests.get(file, headers=headers).content.decode())
+
+     return collection
+
+
+ def remote_files2metadata(files: List[str]) -> dict:
+     """Read the metadata of taco files given a set of URLs. The server
+     must support HTTP Range requests.
+
+     The collection metadata lives in the first partition, so only
+     files[0] is read.
+
+     Args:
+         files (List[str]): A list of URLs pointing to the
+             taco files.
+
+     Returns:
+         dict: The metadata of the taco file.
+     """
+     return remote_file2metadata(files[0])
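All of the readers above build the same inclusive HTTP Range header, and the end byte must be `offset + length - 1`; a one-byte overshoot is enough to corrupt the Parquet footer or the JSON collection payload. A small sketch of that rule (`range_header` is our name, not part of the library):

```python
import requests


def range_header(offset: int, length: int) -> dict:
    """Inclusive HTTP Range header covering `length` bytes from `offset`."""
    return {"Range": f"bytes={offset}-{offset + length - 1}"}


# Usage sketch: fetch the 50-byte static header of a hypothetical remote file.
url = "https://example.com/demo.tortilla"  # hypothetical URL
head = requests.get(url, headers=range_header(0, 50)).content
assert len(head) == 50
```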
@@ -0,0 +1,160 @@ tacoreader/load_utils.py
+ import pathlib
+ import re
+ import urllib.parse
+ from typing import List, Tuple, Union
+
+ import geopandas as gpd
+ import requests
+ import shapely.wkt
+
+
+ def is_valid_url(url: Union[str, List[str]]) -> bool:
+     """Check if a URL or list of URLs is valid.
+
+     Args:
+         url (Union[str, List[str]]): The URL(s) to check. It can
+             be a single URL or a list of URLs.
+
+     Returns:
+         bool: True if all URLs are valid, False otherwise.
+     """
+     if isinstance(url, pathlib.Path):
+         return False
+
+     if isinstance(url, list):
+         return all(is_valid_url(single_url) for single_url in url)
+
+     try:
+         result = urllib.parse.urlparse(url)
+         return all([result.scheme in ["http", "https"], result.netloc])
+     except ValueError:
+         return False
+
+
+ def split_name_and_path(file: Union[str, pathlib.Path]) -> Tuple[str, str]:
+     """Split a file path or URL into its name and path components.
+
+     Args:
+         file (Union[str, pathlib.Path]): The input file or URL.
+
+     Returns:
+         Tuple[str, str]: A tuple containing the name and path.
+     """
+     if isinstance(file, (str, pathlib.Path)):
+         # Handle URLs (urlparse expects a string)
+         parsed = urllib.parse.urlparse(str(file))
+         if parsed.scheme in {"http", "https"}:
+             # Extract the name and path from a URL
+             name = pathlib.PurePosixPath(parsed.path).name
+             path = f"{parsed.scheme}://{parsed.netloc}{pathlib.PurePosixPath(parsed.path).parent.as_posix()}"
+         else:
+             # Handle local file paths
+             file_path = pathlib.Path(file)
+             name = file_path.name
+             path = file_path.parent.as_posix()
+         return name, path
+     else:
+         raise ValueError("Input must be a string or pathlib.Path.")
+
+
+ def snippet2files(
+     file: Union[str, pathlib.Path, List[str], List[pathlib.Path]]
+ ) -> Union[List[pathlib.Path], List[str], str, pathlib.Path]:
+     """Convert a snippet of a multi-part file to a list of files.
+
+     Args:
+         file (Union[str, pathlib.Path, List[str], List[pathlib.Path]]): A file, a
+             list of files, or a snippet of a multi-part file.
+
+     Raises:
+         FileNotFoundError: If a partial file is missing.
+
+     Returns:
+         List[pathlib.Path]: A list of files or a single path. The path can be a
+             local path or a URL.
+     """
+
+     # Check if the file is a list
+     if isinstance(file, list):
+         files = file
+     else:
+         # Does the file end with *.tortilla?
+         if re.match(r".*\*\.tortilla$", str(file)):
+             # Split into name and path
+             name, path = split_name_and_path(file)
+
+             # Get the filename without the snippet (i.e., *.tortilla)
+             filename: str = re.sub(r"\*\.tortilla$", "", name)
+
+             # Check if the file is a URL
+             if is_valid_url(path):
+                 # The parts are expected to live under the same URL path
+                 dumbfile: str = f"{path}/{filename}.0000.part.tortilla"
+                 # The partition count is the 8-byte integer at offset 42
+                 headers = {"Range": "bytes=42-49"}
+                 response: requests.Response = requests.get(dumbfile, headers=headers)
+                 npartitions: int = int.from_bytes(response.content, "little")
+
+                 # Build the list of parts
+                 files = []
+                 for d in range(npartitions):
+                     partial_file = f"{path}/{filename}.{str(d).zfill(4)}.part.tortilla"
+                     files.append(partial_file)
+             else:
+                 # The parts are expected to live in the same directory
+                 file = pathlib.Path(file)
+                 filename = pathlib.Path(filename)
+                 parent = file.resolve().parent
+
+                 # The partition count lives in the first part
+                 dumbfile: pathlib.Path = parent / (
+                     filename.stem + ".0000.part.tortilla"
+                 )
+
+                 # Check how many parts there are
+                 with open(dumbfile, "rb") as f:
+                     f.seek(42)
+                     npartitions: int = int.from_bytes(f.read(8), "little")
+
+                 # Check that all parts are there
+                 files = []
+                 for d in range(npartitions):
+                     partial_file = parent / (
+                         f"{filename.stem}.{str(d).zfill(4)}.part.tortilla"
+                     )
+                     if partial_file.exists():
+                         files.append(partial_file)  # Add the file to the list
+                     else:
+                         raise FileNotFoundError(f"Missing partial file: {partial_file}")
+
+         else:
+             files = file
+
+     return files
+
+
+ def sort_columns_add_geometry(metadata):
+     """Sort the columns of a metadata DataFrame.
+
+     Also converts the "stac:centroid" column (WKT) into a geometry column.
+
+     Args:
+         metadata (pd.DataFrame): The metadata DataFrame.
+
+     Returns:
+         pd.DataFrame: The metadata DataFrame with sorted columns.
+     """
+     if "stac:centroid" in metadata.columns:
+         metadata = gpd.GeoDataFrame(
+             data=metadata,
+             geometry=metadata["stac:centroid"].apply(shapely.wkt.loads),
+             crs="EPSG:4326",
+         )
+     columns = metadata.columns
+     prefixes = ["internal:", "tortilla:", "stac:", "rai:"]
+     sorted_columns = [
+         col for prefix in prefixes for col in columns if col.startswith(prefix)
+     ]
+     rest = [col for col in columns if col not in sorted_columns and col != "geometry"]
+     columns = sorted_columns + rest + (["geometry"] if "geometry" in columns else [])
+     return metadata[columns]
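A hedged sketch of the multi-part convention `snippet2files` implements: a snippet `name*.tortilla` expands to `name.0000.part.tortilla`, `name.0001.part.tortilla`, and so on, with the part count read from header bytes 42-50 of part `0000` (the snippet path is hypothetical):

```python
# Sketch only: "data*.tortilla" is a hypothetical snippet that expands to
# ["data.0000.part.tortilla", "data.0001.part.tortilla", ...], provided the
# parts sit next to each other on disk or under the same URL path.
import tacoreader

dataset = tacoreader.load("data*.tortilla")
print(len(dataset))
```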
@@ -0,0 +1,2 @@ tacoreader/stac.py
+ def read_stac():
+     raise NotImplementedError("read_stac is not implemented yet")