earthcode 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
earthcode/__init__.py ADDED
File without changes
earthcode/fairtool.py ADDED
@@ -0,0 +1,577 @@
1
+ #!/usr/bin/env python3
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import json
7
+ import logging
8
+ import random
9
+ import sys
10
+ import fnmatch
11
+ from urllib.parse import urlparse
12
+ import requests
13
+ import pystac
14
+
15
+ from dataclasses import dataclass, field
16
+ from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union, Any
17
+
18
+ from fsspec.implementations.http import HTTPFileSystem
19
+ from zarr.storage import ZipStore
20
+ from xarray import open_datatree
21
+ import rioxarray
22
+ import xarray
23
+ import geopandas as gpd
24
+ import pandas as pd
25
+ import zipfile
26
+ import csv
27
+
28
+
29
# Map of MIME type -> callable used to open an asset of that type.
# check_asset_readable() invokes each reader with a single URL/path argument;
# opening without an exception counts as "readable".
READERS = {
    # xarray
    "application/x-netcdf": xarray.open_dataset,
    "application/vnd+zarr": xarray.open_zarr,

    # rioxarray
    "image/tiff": rioxarray.open_rasterio,
    "image/cog": rioxarray.open_rasterio,  # Cloud Optimized GeoTIFF (COG)

    # Python standard libs
    "application/zip": zipfile.ZipFile,
    # NOTE(review): the builtin open() only accepts local filesystem paths, so
    # remote PDF/plain-text assets will always fail the readability probe —
    # confirm whether that is intended.
    "application/pdf": open,
    "text/plain": open,

    # pandas
    "text/csv": pd.read_csv,
    "application/vnd.apache.parquet": pd.read_parquet,

    # geopandas
    "application/x-shapefile": gpd.read_file,
    "application/vnd.apache.geoparquet": gpd.read_parquet,
    "application/geo+json": gpd.read_file,
}
52
+
53
# Hostname patterns (fnmatch wildcards) considered acceptable for hosting the
# data itself ("via" links). Matched against the URL hostname in check_domain().
APPROVED_DATA_HOSTING_DOMAINS = [
    "*.esa.int",
    "s3.waw4-1.cloudferro.com",
    "zenodo.org",
    "doi.org",
    "*.pangaea.de",
    "*.copernicus.eu",
    "*.ac.uk",
]
62
+
63
# Hostname patterns considered acceptable for hosting STAC metadata
# ("child" links).
# NOTE(review): "*.github.org" looks like a typo — GitHub serves pages from
# github.io / github.com, and github.org is not GitHub. Confirm the intended
# domain before relying on this check.
APPROVED_METADATA_HOSTING_DOMAINS = [
    "*.esa.int",
    "s3.waw4-1.cloudferro.com",
    "*.github.org",
]
68
+
69
# Browser-like User-Agent sent on the retry path in try_response(); some hosts
# reject requests carrying the default python-requests User-Agent.
DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0"
70
+
71
# MIME types treated as "cloud native" (accessible by range/chunk without
# downloading the whole file); used to compute the per-product cloud score.
# Idiom fix: use a set literal instead of set([...]) (ruff C405).
CLOUD_NATIVE_FORMATS = {
    "application/vnd.apache.geoparquet",
    "image/cog",
    "application/vnd+zarr",
}
76
+
77
# Human-readable description for each FAIR metric key emitted by
# product_audit_to_fair_dict(). Typos fixed ("the the", stray space before the
# period). The misspelled key "fair:file_acessible_files_rate" is kept as-is
# because product_audit_to_fair_dict() emits the same key.
fair_descriptions = {
    "fair:product_url_resolves": "Test whether the dataset URL resolves successfully.",
    "fair:product_has_doi": "Test whether the dataset has an associated DOI.",
    "fair:product_has_documentation": "Test whether the dataset has documentation.",
    "fair:product_approved_metadata_domain": "Test whether the metadata is hosted on an approved domain.",
    "fair:product_approved_data_domain": "Test whether the data is hosted on an approved domain.",
    "fair:file_access": "Test whether the metadata has per-file metadata, or if the data is a raw dump.",
    "fair:file_acessible_files_rate": "Percent of assets that could be opened in tests.",
    "fair:file_cloud_assets_rate": "Percent of assets that are in cloud-optimised format.",
    "fair:workflow_exists": "Dataset has associated workflow."
}
88
+
89
+
90
@dataclass
class ProductAuditResult:
    """
    Holds the complete analysis result for a single Product.

    Populated by analyse_product() and consumed by product_audit_to_fair_dict()
    and run_audit().
    """
    # STAC id of the audited product.
    product_id: str

    # Metadata Links
    via_href: Optional[str]    # "via" link href — where the data itself lives
    child_href: Optional[str]  # "child" link href — sub-catalog/collection metadata

    # Flags
    has_doc: bool       # a link titled "Documentation" exists
    has_workflow: bool  # a "related" link whose title contains "Experiment: " exists
    has_doi: bool       # product declares a sci:doi that resolves via doi.org

    # Validation Results (Access)
    via_response_ok: bool    # via_href answered HTTP 200
    child_response_ok: bool  # child_href answered HTTP 200

    # Validation Results (Domains)
    via_domain_ok: bool    # via_href host matches APPROVED_DATA_HOSTING_DOMAINS
    child_domain_ok: bool  # child_href host matches APPROVED_METADATA_HOSTING_DOMAINS

    # Asset Audit Data
    asset_audit: Optional[Dict[str, Any]] = None  # per-asset readability details (or error info)
    cloud_score: float = 0.0  # fraction of sampled assets in CLOUD_NATIVE_FORMATS
117
+
118
+
119
+
120
+ # ----------------------------- Helpers ----------------------------------------
121
+
122
+ def _is_prr(link: str) -> bool:
123
+ return "https://eoresults.esa.int" in link
124
+
125
+ def _is_creodias(link: str) -> bool:
126
+ return "https://s3.waw4-1.cloudferro.com/" in link
127
+
128
def try_response(url: str, allow_redirects: bool = True, timeout: int = 5) -> requests.Response:
    """
    Probe *url* and return the resulting Response.

    A plain HEAD is attempted first. If it errors or does not return 200,
    a single retry is made: PRR-hosted URLs are retried with a full GET,
    everything else with a HEAD carrying a browser User-Agent (some hosts
    reject the default python-requests UA).
    """
    # First attempt: bare HEAD with no extra headers.
    try:
        first = requests.head(url, allow_redirects=allow_redirects, timeout=timeout)
        if first.status_code == 200:
            return first
    except requests.RequestException:
        pass  # fall through to the retry below

    # Retry: strategy depends on the host.
    if _is_prr(url):
        return requests.get(url, headers={}, allow_redirects=allow_redirects, timeout=timeout)
    return requests.head(
        url,
        headers={"User-Agent": DEFAULT_USER_AGENT},
        allow_redirects=allow_redirects,
        timeout=timeout,
    )
149
+
150
def check_domain(url: str, allowed_patterns: Sequence[str]) -> bool:
    """Check if a URL's hostname matches any of the allowed wildcard patterns."""
    if not url:
        return False
    host = urlparse(url).hostname or ""
    return any(fnmatch.fnmatch(host, pattern) for pattern in allowed_patterns)
159
+
160
def check_product_doi(product, timeout: int = 5) -> bool:
    """
    Check whether a STAC product item has a DOI and whether it resolves.

    Best-effort: any failure (missing DOI, network error, malformed product)
    simply yields False.
    """
    try:
        doi = product.to_dict().get("sci:doi")
        if not doi:
            return False
        resolver_url = f"https://doi.org/{doi}"
        return try_response(resolver_url, timeout=timeout).status_code == 200
    except Exception:
        return False
174
+
175
def _load_zip_zarr(url: str, **kwargs):
    """
    Open a zipped Zarr store served over HTTP as an xarray datatree.

    zarr's ZipStore normally requires a local file path; this shim initialises
    the store with a dummy empty path and then replaces ``self.path`` with an
    fsspec HTTP file object so reads go over the network.
    NOTE(review): this presumably relies on ZipStore opening ``self.path``
    lazily after __init__ — confirm against the installed zarr version.
    """
    class HttpZipStore(ZipStore):  # type: ignore
        def __init__(self, path) -> None:
            # Dummy local init, then point the store at the remote file object.
            super().__init__(path="", mode="r")
            self.path = path

    # block_size bounds how much data each HTTP range request fetches.
    fs = HTTPFileSystem(asynchronous=False, block_size=10000)  # type: ignore
    zf = fs.open(url)
    store = HttpZipStore(zf)
    return open_datatree(store, engine="zarr", **kwargs)
185
+
186
def get_resolve_href(feat, asset):
    """
    Resolve an asset's href to an absolute URL.

    - ``s3://`` hrefs are mapped to the public CloudFerro HTTPS endpoint.
      (Bug fix: the original concatenated the full ``s3://...`` URL after the
      HTTPS prefix, producing an invalid
      ``https://s3.waw4-1.cloudferro.com/s3://...`` address; the scheme is now
      stripped first.)
    - Hrefs that do not start with ``/`` are assumed to already be absolute
      and returned unchanged.
    - Root-relative hrefs are prefixed with the scheme+authority taken from
      the feature's first link (assumed to be the root/self link).
    """
    href = asset['href']

    # check for cloudferro assets
    if href.startswith('s3://'):
        return 'https://s3.waw4-1.cloudferro.com/' + href[len('s3://'):]
    elif href[0] != '/':
        return href
    else:
        # Derive "scheme://host" from the feature's first link.
        root_href = feat['links'][0]['href']
        scheme_end = root_href.index('//') + 2
        root_url = root_href[:root_href.index('/', scheme_end)]
        return root_url + href
198
+
199
def load_items_from_child_link(link: str) -> Tuple[bool, List[Tuple[str, Optional[str]]]]:
    """
    Fetch every item reachable from a product's child link and collect its
    data assets.

    Returns ``(is_prr, assets)`` where *assets* is a list of
    ``(resolved_href, mime_type)`` tuples for each asset carrying the "data"
    role.

    Fix: the original only matched assets whose roles were exactly
    ``["data"]``, silently skipping assets tagged with additional roles
    (e.g. ``["data", "visual"]``); membership in the roles list is now
    tested instead.
    """
    prr = _is_prr(link)

    if prr:
        # PRR exposes an OGC API items endpoint; pull everything in one page.
        items = pystac.ItemCollection.from_file(link + "/items?limit=10000")
    else:
        items = pystac.ItemCollection(pystac.STACObject.from_file(link).get_all_items())

    items_dict = items.to_dict()
    out: List[Tuple[str, Optional[str]]] = []

    for feat in items_dict.get("features", []):
        assets = feat.get("assets", {})
        for _name, a in assets.items():
            # An asset may carry several roles; include it if "data" is one.
            if "data" in (a.get("roles") or []):
                out.append((get_resolve_href(feat, a), a.get("type")))

    return prr, out
217
+
218
def sample_assets(
    assets: Sequence[Tuple[str, Optional[str]]],
    max_checks: int,
    seed: Optional[int] = None,
) -> List[Tuple[str, Optional[str]]]:
    """
    Return up to *max_checks* assets, sampled without replacement.

    If the input already fits within *max_checks*, a shallow copy is returned
    unchanged. When *seed* is given, sampling is deterministic.

    Fix: use a private ``random.Random(seed)`` instance instead of
    ``random.seed()``, which mutated the process-wide RNG state as a side
    effect. The same seed still yields the same sample.
    """
    if len(assets) <= max_checks:
        return list(assets)
    rng = random.Random(seed) if seed is not None else random
    return rng.sample(list(assets), k=max_checks)
228
+
229
def check_asset_readable(href: str, mime_type: Optional[str], is_prr: bool) -> bool:
    """
    Best-effort check that an asset can actually be opened.

    Picks a reader from READERS by MIME type and attempts to open the asset;
    any exception counts as "not readable" (logged at DEBUG level).
    PRR-hosted assets are first rewritten to absolute
    https://eoresults.esa.int/ URLs.

    NOTE(review): successful opens are never closed here, so dataset/file
    handles may accumulate for the duration of the audit — confirm this is
    acceptable for the expected catalog sizes.
    """
    mtype = mime_type or ""
    reader = READERS.get(mtype)

    try:
        test_href = href
        if is_prr:
            # PRR items carry root-relative hrefs; make them absolute.
            if not href.startswith("https://eoresults.esa.int/"):
                test_href = "https://eoresults.esa.int/" + href.lstrip("/")
            if mtype == "application/vnd+zarr":
                # PRR serves zarr as a zipped store; use the HTTP zip shim.
                _load_zip_zarr(test_href)
                return True
            if mtype == "application/x-netcdf":
                # '#mode=bytes' requests byte-range access for remote netCDF.
                xarray.open_dataset(test_href + "#mode=bytes")  # type: ignore
                return True
            if reader:
                reader(test_href)  # type: ignore
                return True
            # Unknown MIME type on PRR: nothing we can try.
            return False

        # non-PRR
        if mtype == "application/x-netcdf":
            test_href = href + "#mode=bytes"
        if reader is None:
            return False
        reader(test_href)  # type: ignore
        return True

    except Exception as e:
        logging.debug("Asset read failed for %s (%s): %s", href, mtype, e)
        return False
260
+
261
+
262
+
263
+ # ----------------------------- Core logic -------------------------------------
264
+
265
def analyse_product(
    productCollection: Union[pystac.Item, pystac.Collection],
    timeout: int = 5,
    max_asset_checks: int = 10,
    seed: Optional[int] = None
) -> ProductAuditResult:
    """
    Analyzes a single product (Item or Collection) entirely.

    Performs:
    1. Metadata extraction (links, docs, workflow)
    2. Responsiveness checks (HTTP HEAD on via/child)
    3. Domain validation
    4. Asset sampling and reading tests
    5. Cloud native scoring

    Parameters
    ----------
    productCollection : the STAC object to audit
    timeout : per-request HTTP timeout in seconds
    max_asset_checks : max number of assets to sample for readability tests
    seed : optional seed for deterministic asset sampling

    Returns
    -------
    ProductAuditResult with all flags and the optional asset audit filled in.
    """

    product_id = productCollection.id

    # 1. Extract Links
    # "via" points at the data itself, "child" at the sub-catalog metadata.
    via_link = productCollection.get_single_link("via")
    via_href = via_link.href if via_link else None

    child_link = productCollection.get_single_link("child")
    child_href = child_link.href if child_link else None

    # 2. Check Documentation / Workflow
    has_doc = False
    has_workflow = False
    for link in productCollection.get_links():
        title = getattr(link, "title", None)
        if title == "Documentation":
            has_doc = True
        # Workflows are linked as rel="related" with a title "Experiment: ...".
        if link.rel == "related" and isinstance(title, str) and "Experiment: " in title:
            has_workflow = True

    # 3. Check DOI (resolves via doi.org; best-effort, False on any failure)
    has_doi = check_product_doi(productCollection, timeout=timeout)

    # 4. Check Response Status (Access) — network errors count as failure.
    via_ok = False
    if via_href:
        try:
            via_ok = try_response(via_href, timeout=timeout).status_code == 200
        except requests.RequestException:
            via_ok = False

    child_ok = False
    if child_href:
        try:
            child_ok = try_response(child_href, timeout=timeout).status_code == 200
        except requests.RequestException:
            child_ok = False

    # 5. Check Domains against the approved hosting allow-lists.
    via_domain_ok = False
    if via_href:
        via_domain_ok = check_domain(via_href, APPROVED_DATA_HOSTING_DOMAINS)

    child_domain_ok = False
    if child_href:
        child_domain_ok = check_domain(child_href, APPROVED_METADATA_HOSTING_DOMAINS)

    # 6. Asset Audit (Child link traversal)
    asset_audit = None
    cloud_score = 0.0

    if child_href:
        try:
            is_prr, assets = load_items_from_child_link(child_href)

            # Default assumption: assume NetCDF when type is missing
            assets_norm: List[Tuple[str, Optional[str]]] = [
                (href, mtype if mtype is not None else "application/x-netcdf")
                for (href, mtype) in assets
            ]

            # Probe only a bounded random sample to keep runtime reasonable.
            subset = sample_assets(assets_norm, max_checks=max_asset_checks, seed=seed)
            successes = [check_asset_readable(h, t, is_prr) for (h, t) in subset]

            # Calculate Asset Stats
            asset_audit = {
                "child_link": child_href,
                "is_prr": is_prr,
                "checked": [{"href": h, "type": t} for (h, t) in subset],
                "success_flags": successes,
                # None (not 0.0) when there were no assets to check at all.
                "success_rate": (sum(successes) / len(successes)) if subset else None,
            }

            # Calculate Cloud Score
            # Score 1 if format is cloud native, else 0. Average over checked assets.
            if subset:
                cn_scores = [1 if t in CLOUD_NATIVE_FORMATS else 0 for (_, t) in subset]
                cloud_score = sum(cn_scores) / len(cn_scores)

        except Exception as e:
            # Loading/traversal failed entirely; record the error but keep
            # the rest of the audit result intact.
            asset_audit = {
                "child_link": child_href,
                "error": f"Failed to load items: {e}",
                "checked": [],
                "success_flags": [],
            }
            cloud_score = 0.0

    return ProductAuditResult(
        product_id=product_id,
        via_href=via_href,
        child_href=child_href,
        has_doc=has_doc,
        has_workflow=has_workflow,
        has_doi=has_doi,
        via_response_ok=via_ok,
        child_response_ok=child_ok,
        via_domain_ok=via_domain_ok,
        child_domain_ok=child_domain_ok,
        asset_audit=asset_audit,
        cloud_score=cloud_score
    )
383
+
384
def product_audit_to_fair_dict(result: ProductAuditResult):
    """
    Converts a ProductAuditResult object into a dictionary with specific 'fair:' keys.

    The accessible-files rate falls back to 0.0 when no asset audit ran or
    when no success rate was recorded.
    """
    audit = result.asset_audit
    accessible_files_rate = 0.0
    if audit is not None:
        measured_rate = audit.get("success_rate")
        if measured_rate is not None:
            accessible_files_rate = measured_rate

    return {

        "fair:product_url_resolves": result.via_response_ok,
        "fair:product_has_doi": result.has_doi,
        "fair:product_has_documentation": result.has_doc,
        "fair:product_approved_metadata_domain": result.child_domain_ok,
        "fair:product_approved_data_domain": result.via_domain_ok,

        "fair:file_access": result.child_response_ok,
        "fair:file_acessible_files_rate": accessible_files_rate,
        "fair:file_cloud_assets_rate": result.cloud_score,

        "fair:workflow_exists": result.has_workflow,
    }
408
+
409
+
410
+
411
def run_audit(
    catalog_path: str,
    max_checks: int = 10,
    seed: Optional[int] = None,
    timeout: int = 5,
) -> Dict[str, object]:
    """
    High-level orchestration:
    1) Load catalog
    2) Loop through products and call analyse_product on each
    3) Aggregate results into a summary dictionary

    Parameters
    ----------
    catalog_path : path or URL of the root catalog.json
    max_checks : max assets to sample per product
    seed : optional sampling seed for reproducible runs
    timeout : per-request HTTP timeout in seconds

    Raises
    ------
    ValueError if the catalog has no child catalog named 'products'.
    """
    catalog = pystac.Catalog.from_file(catalog_path)
    products_catalog = catalog.get_child("products")
    if products_catalog is None:
        raise ValueError("Catalog has no child named 'products'.")

    # Aggregation containers — all keyed by product id.
    access_responses: Dict[str, bool] = {}
    child_responses: Dict[str, bool] = {}
    data_domain_ok: Dict[str, bool] = {}
    metadata_domain_ok: Dict[str, bool] = {}
    has_doc_map: Dict[str, bool] = {}
    has_workflow_map: Dict[str, bool] = {}
    has_doi_map: Dict[str, bool] = {}
    per_child_asset_checks: Dict[str, Dict[str, object]] = {}
    cloud_assets_score: Dict[str, float] = {}
    num_products_with_via = 0
    num_products_with_child = 0

    # Main Loop
    for product in products_catalog.get_children():

        # Call the unified analysis function
        result = analyse_product(
            product,
            timeout=timeout,
            max_asset_checks=max_checks,
            seed=seed
        )

        # Aggregate Results — link-dependent maps are only filled when the
        # corresponding link existed on the product.
        if result.via_href:
            num_products_with_via += 1
            access_responses[result.product_id] = result.via_response_ok
            data_domain_ok[result.product_id] = result.via_domain_ok

        if result.child_href:
            num_products_with_child += 1
            child_responses[result.product_id] = result.child_response_ok
            metadata_domain_ok[result.product_id] = result.child_domain_ok

            if result.asset_audit:
                per_child_asset_checks[result.product_id] = result.asset_audit
                cloud_assets_score[result.product_id] = result.cloud_score

        # Flags are recorded for every product regardless of links.
        has_doc_map[result.product_id] = result.has_doc
        has_workflow_map[result.product_id] = result.has_workflow
        has_doi_map[result.product_id] = result.has_doi

    return {
        "summary": {
            "num_products_with_via": num_products_with_via,
            "num_products_with_child": num_products_with_child,
        },
        "access_ok": access_responses,
        "child_ok": child_responses,
        "data_domain_ok": data_domain_ok,
        "metadata_domain_ok": metadata_domain_ok,
        "has_documentation": has_doc_map,
        "has_workflow": has_workflow_map,
        "has_doi": has_doi_map,
        "per_child_asset_checks": per_child_asset_checks,
        "cloud_assets": cloud_assets_score,
    }
486
+
487
+
488
+
489
def generate_example_product_analysis():
    """
    Return a hard-coded ProductAuditResult fixture for the 'waposal-waves'
    product: every check passing, ten zarr assets all readable and
    cloud-native.
    """
    base = "https://s3.waw4-1.cloudferro.com/EarthCODE"
    zarr_names = [
        "CN-S3A", "BN-CS2", "FG-S3A", "MT-S3B", "FP-CS2",
        "BN-S3A", "MD-S3A", "CN-S3A", "FF-S3A", "FG-S3A",
    ]
    checked_assets = [
        {
            "href": f"{base}/OSCAssets/waposal/{name}.zarr",
            "type": "application/vnd+zarr",
        }
        for name in zarr_names
    ]
    child_link = f"{base}/Catalogs/waposal/collection.json"

    return ProductAuditResult(
        product_id="waposal-waves",
        via_href=f"{base}/OSCAssets/waposal/waposal_data.zip",
        child_href=child_link,
        has_doc=True,
        has_workflow=False,
        has_doi=True,
        via_response_ok=True,
        child_response_ok=True,
        via_domain_ok=True,
        child_domain_ok=True,
        asset_audit={
            "child_link": child_link,
            "is_prr": False,
            "checked": checked_assets,
            "success_flags": [True] * len(checked_assets),
            "success_rate": 1.0,
        },
        cloud_score=1.0,
    )
562
+
563
if __name__ == "__main__":
    # Minimal CLI wrapper for ad-hoc audit runs.
    cli = argparse.ArgumentParser()
    cli.add_argument("catalog_path", help="Path or URL to catalog.json")
    cli.add_argument("--max-checks", type=int, default=10, help="Max assets to sample per product")
    cli.add_argument("--timeout", type=int, default=5, help="HTTP timeout in seconds")
    opts = cli.parse_args()

    try:
        audit_report = run_audit(opts.catalog_path, max_checks=opts.max_checks, timeout=opts.timeout)
        # default=str lets non-JSON-native values (e.g. datetimes) serialize.
        print(json.dumps(audit_report, indent=2, default=str))
    except Exception as exc:
        logging.error("Audit failed: %s", exc)
        sys.exit(1)