ecological-agent-skills 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. package/AGENT_CONTEXT.md +191 -0
  2. package/CATALOG.md +329 -0
  3. package/LICENSE +692 -0
  4. package/README.md +347 -0
  5. package/bin/install.mjs +168 -0
  6. package/docs/comparison-with-alternatives.md +38 -0
  7. package/docs/global-examples-index.md +103 -0
  8. package/docs/repository-statistics.md +101 -0
  9. package/docs/theoretical-foundations.md +188 -0
  10. package/environment.yaml +106 -0
  11. package/examples/community/arctic_tundra_vegetation_example.md +247 -0
  12. package/examples/community/bird_landuse_example.md +63 -0
  13. package/examples/community/phytoplankton_reservoir_example.md +60 -0
  14. package/examples/community/reef_fish_indopacific_example.md +221 -0
  15. package/examples/impact/baci_road_example.md +57 -0
  16. package/examples/impact/ecosystem_services_atlantic_forest.md +83 -0
  17. package/examples/impact/forest_loss_borneo_timeseries_example.md +225 -0
  18. package/examples/occupancy/puma_camera_example.md +61 -0
  19. package/examples/occupancy/snow_leopard_himalayas_example.md +204 -0
  20. package/examples/reproducible/whittaker_biome_sdm_example.md +406 -0
  21. package/examples/sdm/anteater_cerrado_example.md +69 -0
  22. package/examples/sdm/jaguar_amazon_example.md +80 -0
  23. package/examples/sdm/koala_climate_change_example.md +170 -0
  24. package/examples/sdm/wolf_recolonization_europe_example.md +193 -0
  25. package/package.json +43 -0
  26. package/renv.lock +194 -0
  27. package/skills/SKILL_INDEX.json +1020 -0
  28. package/skills/acoustic-monitoring/SKILL.md +163 -0
  29. package/skills/acoustic-monitoring/examples/example-prompts.md +100 -0
  30. package/skills/acoustic-monitoring/examples/temperate_forest_birds_example.md +285 -0
  31. package/skills/acoustic-monitoring/resources/acoustic-indices-reference.md +93 -0
  32. package/skills/acoustic-monitoring/resources/soundscape-ecology-guide.md +90 -0
  33. package/skills/acoustic-monitoring/resources/species-id-tools-comparison.md +89 -0
  34. package/skills/acoustic-monitoring/scripts/batch_species_detection.py +360 -0
  35. package/skills/acoustic-monitoring/scripts/compute_acoustic_indices.R +235 -0
  36. package/skills/acoustic-monitoring/scripts/compute_acoustic_indices.py +374 -0
  37. package/skills/biostatistics-workbench/SKILL.md +140 -0
  38. package/skills/biostatistics-workbench/examples/example-prompts.md +39 -0
  39. package/skills/biostatistics-workbench/resources/effect-size-reference.md +81 -0
  40. package/skills/biostatistics-workbench/resources/glm-family-link-reference.md +47 -0
  41. package/skills/biostatistics-workbench/resources/test-selection-guide.md +93 -0
  42. package/skills/biostatistics-workbench/scripts/glm_pipeline.R +78 -0
  43. package/skills/biostatistics-workbench/scripts/glm_pipeline.py +210 -0
  44. package/skills/camera-trap-processing/SKILL.md +159 -0
  45. package/skills/camera-trap-processing/examples/example-prompts.md +103 -0
  46. package/skills/camera-trap-processing/examples/leopard_serengeti_example.md +231 -0
  47. package/skills/camera-trap-processing/resources/activity-patterns-reference.md +113 -0
  48. package/skills/camera-trap-processing/resources/camtrapR-workflow-guide.md +130 -0
  49. package/skills/camera-trap-processing/resources/detection-event-definition-guide.md +89 -0
  50. package/skills/camera-trap-processing/scripts/estimate_activity.R +169 -0
  51. package/skills/camera-trap-processing/scripts/process_camtrap_data.R +179 -0
  52. package/skills/camera-trap-processing/scripts/process_camtrap_data.py +192 -0
  53. package/skills/community-ecology-ordination/SKILL.md +133 -0
  54. package/skills/community-ecology-ordination/examples/example-prompts.md +35 -0
  55. package/skills/community-ecology-ordination/resources/dissimilarity-metric-guide.md +53 -0
  56. package/skills/community-ecology-ordination/resources/nmds-interpretation-guide.md +104 -0
  57. package/skills/community-ecology-ordination/scripts/__pycache__/community_analysis.cpython-311.pyc +0 -0
  58. package/skills/community-ecology-ordination/scripts/community_analysis.R +143 -0
  59. package/skills/community-ecology-ordination/scripts/community_analysis.py +231 -0
  60. package/skills/ecological-data-foundation/SKILL.md +129 -0
  61. package/skills/ecological-data-foundation/examples/example-prompts.md +40 -0
  62. package/skills/ecological-data-foundation/resources/coordinate-cleaning-flags.md +66 -0
  63. package/skills/ecological-data-foundation/resources/darwin-core-glossary.md +91 -0
  64. package/skills/ecological-data-foundation/resources/data-citation-guide.md +265 -0
  65. package/skills/ecological-data-foundation/resources/gbif-data-citation-guide.md +193 -0
  66. package/skills/ecological-data-foundation/resources/qa-checklist.md +83 -0
  67. package/skills/ecological-data-foundation/scripts/__pycache__/clean_occurrences.cpython-311.pyc +0 -0
  68. package/skills/ecological-data-foundation/scripts/__pycache__/download_from_ebird.cpython-311.pyc +0 -0
  69. package/skills/ecological-data-foundation/scripts/__pycache__/download_from_inat.cpython-311.pyc +0 -0
  70. package/skills/ecological-data-foundation/scripts/__pycache__/download_from_iucn.cpython-311.pyc +0 -0
  71. package/skills/ecological-data-foundation/scripts/__pycache__/download_from_obis.cpython-311.pyc +0 -0
  72. package/skills/ecological-data-foundation/scripts/clean_occurrences.R +230 -0
  73. package/skills/ecological-data-foundation/scripts/clean_occurrences.py +268 -0
  74. package/skills/ecological-data-foundation/scripts/download_from_ebird.R +251 -0
  75. package/skills/ecological-data-foundation/scripts/download_from_ebird.py +364 -0
  76. package/skills/ecological-data-foundation/scripts/download_from_gbif.R +315 -0
  77. package/skills/ecological-data-foundation/scripts/download_from_gbif.py +407 -0
  78. package/skills/ecological-data-foundation/scripts/download_from_inat.R +238 -0
  79. package/skills/ecological-data-foundation/scripts/download_from_inat.py +304 -0
  80. package/skills/ecological-data-foundation/scripts/download_from_iucn.R +273 -0
  81. package/skills/ecological-data-foundation/scripts/download_from_iucn.py +344 -0
  82. package/skills/ecological-data-foundation/scripts/download_from_obis.R +248 -0
  83. package/skills/ecological-data-foundation/scripts/download_from_obis.py +318 -0
  84. package/skills/ecological-impact-assessment/SKILL.md +123 -0
  85. package/skills/ecological-impact-assessment/examples/example-prompts.md +32 -0
  86. package/skills/ecological-impact-assessment/resources/baci-design-guide.md +55 -0
  87. package/skills/ecological-impact-assessment/resources/fragmentation-metrics-reference.md +86 -0
  88. package/skills/ecological-impact-assessment/resources/pressure-index-template.md +78 -0
  89. package/skills/ecological-impact-assessment/resources/study-design-guide.md +168 -0
  90. package/skills/ecological-impact-assessment/scripts/baci_analysis.R +161 -0
  91. package/skills/ecological-impact-assessment/scripts/fragmentation_analysis.py +141 -0
  92. package/skills/ecological-impact-assessment/scripts/power_analysis_baci.R +274 -0
  93. package/skills/ecosystem-services-assessment/SKILL.md +125 -0
  94. package/skills/ecosystem-services-assessment/examples/example-prompts.md +24 -0
  95. package/skills/ecosystem-services-assessment/resources/es-indicator-reference.md +45 -0
  96. package/skills/ecosystem-services-assessment/resources/invest-parameter-guide.md +86 -0
  97. package/skills/ecosystem-services-assessment/resources/rusle-coefficients.md +88 -0
  98. package/skills/ecosystem-services-assessment/scripts/__pycache__/compute_es.cpython-311.pyc +0 -0
  99. package/skills/ecosystem-services-assessment/scripts/compute_es.py +189 -0
  100. package/skills/ecosystem-services-assessment/scripts/tradeoff_analysis.R +161 -0
  101. package/skills/environmental-time-series/SKILL.md +125 -0
  102. package/skills/environmental-time-series/examples/example-prompts.md +33 -0
  103. package/skills/environmental-time-series/resources/anomaly-indices-reference.md +88 -0
  104. package/skills/environmental-time-series/resources/bfast-parameter-guide.md +69 -0
  105. package/skills/environmental-time-series/scripts/__pycache__/recovery_trajectory.cpython-311.pyc +0 -0
  106. package/skills/environmental-time-series/scripts/__pycache__/trend_analysis.cpython-311.pyc +0 -0
  107. package/skills/environmental-time-series/scripts/recovery_trajectory.R +305 -0
  108. package/skills/environmental-time-series/scripts/recovery_trajectory.py +178 -0
  109. package/skills/environmental-time-series/scripts/trend_analysis.R +192 -0
  110. package/skills/environmental-time-series/scripts/trend_analysis.py +184 -0
  111. package/skills/geoprocessing-for-ecology/SKILL.md +123 -0
  112. package/skills/geoprocessing-for-ecology/examples/example-prompts.md +32 -0
  113. package/skills/geoprocessing-for-ecology/resources/crs-reference.md +62 -0
  114. package/skills/geoprocessing-for-ecology/resources/global-predictor-sources.md +331 -0
  115. package/skills/geoprocessing-for-ecology/resources/resampling-methods.md +57 -0
  116. package/skills/geoprocessing-for-ecology/scripts/__pycache__/download_predictors.cpython-311.pyc +0 -0
  117. package/skills/geoprocessing-for-ecology/scripts/download_predictors.R +239 -0
  118. package/skills/geoprocessing-for-ecology/scripts/download_predictors.py +379 -0
  119. package/skills/geoprocessing-for-ecology/scripts/stack_and_extract.R +224 -0
  120. package/skills/geoprocessing-for-ecology/scripts/stack_and_extract.py +172 -0
  121. package/skills/landscape-connectivity/SKILL.md +170 -0
  122. package/skills/landscape-connectivity/examples/example-prompts.md +96 -0
  123. package/skills/landscape-connectivity/examples/jaguar_mesoamerica_corridor_example.md +271 -0
  124. package/skills/landscape-connectivity/resources/circuitscape-parameter-guide.md +155 -0
  125. package/skills/landscape-connectivity/resources/graph-theory-for-ecology.md +134 -0
  126. package/skills/landscape-connectivity/resources/resistance-surface-guide.md +141 -0
  127. package/skills/landscape-connectivity/scripts/connectivity_analysis.py +387 -0
  128. package/skills/landscape-connectivity/scripts/connectivity_metrics.R +274 -0
  129. package/skills/landscape-connectivity/scripts/resistance_surface.R +239 -0
  130. package/skills/model-validation-and-uncertainty/SKILL.md +131 -0
  131. package/skills/model-validation-and-uncertainty/examples/example-prompts.md +30 -0
  132. package/skills/model-validation-and-uncertainty/resources/extrapolation-risk-guide.md +236 -0
  133. package/skills/model-validation-and-uncertainty/resources/metric-selection-guide.md +52 -0
  134. package/skills/model-validation-and-uncertainty/resources/threshold-selection-guide.md +64 -0
  135. package/skills/model-validation-and-uncertainty/scripts/__pycache__/validate_model.cpython-311.pyc +0 -0
  136. package/skills/model-validation-and-uncertainty/scripts/extrapolation_risk.R +315 -0
  137. package/skills/model-validation-and-uncertainty/scripts/validate_model.py +226 -0
  138. package/skills/model-validation-and-uncertainty/scripts/validate_sdm.R +162 -0
  139. package/skills/occupancy-and-detection/SKILL.md +126 -0
  140. package/skills/occupancy-and-detection/examples/example-prompts.md +33 -0
  141. package/skills/occupancy-and-detection/resources/detection-history-format.md +100 -0
  142. package/skills/occupancy-and-detection/resources/occupancy-study-design.md +47 -0
  143. package/skills/occupancy-and-detection/scripts/__pycache__/occupancy_analysis.cpython-311.pyc +0 -0
  144. package/skills/occupancy-and-detection/scripts/occupancy_analysis.R +160 -0
  145. package/skills/occupancy-and-detection/scripts/occupancy_analysis.py +159 -0
  146. package/skills/population-viability-analysis/SKILL.md +161 -0
  147. package/skills/population-viability-analysis/examples/african_elephant_pva_example.md +266 -0
  148. package/skills/population-viability-analysis/examples/example-prompts.md +95 -0
  149. package/skills/population-viability-analysis/resources/extinction-risk-thresholds.md +128 -0
  150. package/skills/population-viability-analysis/resources/matrix-model-guide.md +139 -0
  151. package/skills/population-viability-analysis/resources/sensitivity-elasticity-reference.md +182 -0
  152. package/skills/population-viability-analysis/scripts/matrix_pva.R +258 -0
  153. package/skills/population-viability-analysis/scripts/pva_analysis.py +442 -0
  154. package/skills/population-viability-analysis/scripts/stochastic_pva.R +353 -0
  155. package/skills/predictive-modeling-best-practices/SKILL.md +136 -0
  156. package/skills/predictive-modeling-best-practices/examples/example-prompts.md +58 -0
  157. package/skills/predictive-modeling-best-practices/resources/collinearity-decision-tree.md +65 -0
  158. package/skills/predictive-modeling-best-practices/resources/sampling-bias-correction.md +267 -0
  159. package/skills/predictive-modeling-best-practices/resources/spatial-cv-guide.md +73 -0
  160. package/skills/predictive-modeling-best-practices/scripts/__pycache__/spatial_cv.cpython-311.pyc +0 -0
  161. package/skills/predictive-modeling-best-practices/scripts/collinearity_check.R +112 -0
  162. package/skills/predictive-modeling-best-practices/scripts/spatial_cv.py +182 -0
  163. package/skills/reproducible-ecology-pipeline/SKILL.md +139 -0
  164. package/skills/reproducible-ecology-pipeline/examples/example-prompts.md +35 -0
  165. package/skills/reproducible-ecology-pipeline/resources/directory-structure-template.md +94 -0
  166. package/skills/reproducible-ecology-pipeline/resources/params-yaml-template.yaml +84 -0
  167. package/skills/reproducible-ecology-pipeline/resources/reproducibility-checklist-template.md +66 -0
  168. package/skills/reproducible-ecology-pipeline/scripts/generate_file_manifest.py +110 -0
  169. package/skills/reproducible-ecology-pipeline/scripts/init_project.sh +53 -0
  170. package/skills/spatial-prioritization/SKILL.md +162 -0
  171. package/skills/spatial-prioritization/examples/biodiversity_hotspot_prioritization_example.md +289 -0
  172. package/skills/spatial-prioritization/examples/example-prompts.md +93 -0
  173. package/skills/spatial-prioritization/resources/cost-surface-reference.md +130 -0
  174. package/skills/spatial-prioritization/resources/marxan-vs-prioritizr-comparison.md +125 -0
  175. package/skills/spatial-prioritization/resources/prioritizr-formulation-guide.md +188 -0
  176. package/skills/spatial-prioritization/resources/representation-targets-guide.md +186 -0
  177. package/skills/spatial-prioritization/scripts/prioritization_sensitivity.R +320 -0
  178. package/skills/spatial-prioritization/scripts/run_prioritization.R +336 -0
  179. package/skills/species-distribution-modeling/SKILL.md +139 -0
  180. package/skills/species-distribution-modeling/examples/example-prompts.md +36 -0
  181. package/skills/species-distribution-modeling/resources/algorithm-comparison.md +25 -0
  182. package/skills/species-distribution-modeling/resources/calibration-area-guide.md +71 -0
  183. package/skills/species-distribution-modeling/resources/climate-scenario-preparation.md +170 -0
  184. package/skills/species-distribution-modeling/resources/maxent-calibration-guide.md +211 -0
  185. package/skills/species-distribution-modeling/resources/sdm-checklist.md +37 -0
  186. package/skills/species-distribution-modeling/scripts/predict_distribution.R +236 -0
  187. package/skills/species-distribution-modeling/scripts/predict_distribution.py +286 -0
  188. package/skills/species-distribution-modeling/scripts/prepare_future_layers.R +351 -0
  189. package/skills/species-distribution-modeling/scripts/project_scenarios.R +220 -0
  190. package/skills/species-distribution-modeling/scripts/run_ensemble_sdm.R +99 -0
  191. package/skills/species-distribution-modeling/scripts/sdm_pipeline.py +318 -0
  192. package/skills/species-distribution-modeling/scripts/tune_maxnet.R +344 -0
  193. package/templates/SKILL_TEMPLATE.md +225 -0
  194. package/templates/checklists/data-submission-checklist.md +38 -0
  195. package/templates/checklists/post-analysis-checklist.md +55 -0
  196. package/templates/checklists/pre-analysis-checklist.md +31 -0
  197. package/templates/prompts/debug-skill.md +47 -0
  198. package/templates/prompts/invoke-skill.md +34 -0
  199. package/templates/prompts/invoke-workflow.md +45 -0
  200. package/templates/reports/technical-report-template.md +80 -0
  201. package/templates/scripts/logger_setup.R +79 -0
  202. package/templates/scripts/logger_setup.py +119 -0
  203. package/templates/scripts/params_loader.R +28 -0
  204. package/templates/scripts/params_loader.py +38 -0
  205. package/workflows/analyze-community-structure/WORKFLOW.md +72 -0
  206. package/workflows/analyze-environmental-change/WORKFLOW.md +73 -0
  207. package/workflows/assess-ecological-impact/WORKFLOW.md +75 -0
  208. package/workflows/assess-ecosystem-services/WORKFLOW.md +68 -0
  209. package/workflows/assess-landscape-connectivity/WORKFLOW.md +84 -0
  210. package/workflows/build-fire-risk-map/WORKFLOW.md +79 -0
  211. package/workflows/produce-technical-report/WORKFLOW.md +113 -0
  212. package/workflows/run-camera-trap-occupancy/WORKFLOW.md +87 -0
  213. package/workflows/run-conservation-prioritization/WORKFLOW.md +89 -0
  214. package/workflows/run-multispecies-screening/WORKFLOW.md +197 -0
  215. package/workflows/run-occupancy-analysis/WORKFLOW.md +74 -0
  216. package/workflows/run-population-viability/WORKFLOW.md +90 -0
  217. package/workflows/run-sdm-study/WORKFLOW.md +99 -0
@@ -0,0 +1,268 @@
1
+ #!/usr/bin/env python3
2
+ # ecological-agent-skills / Copyright (C) 2026 Francisco Diego Barros Barata
3
+ # SPDX-License-Identifier: GPL-3.0-or-later
4
+
5
+ """
6
+ clean_occurrences.py
7
+ Standard occurrence cleaning pipeline using Python.
8
+ Usage: python clean_occurrences.py <input_csv> <output_dir>
9
+ Requires: pandas, numpy
10
+ """
11
+
12
+ import logging
13
+ import sys
14
+ from datetime import datetime
15
+ from pathlib import Path
16
+
17
# Skill identifier: embedded in every log line and in the log file name.
SKILL_NAME = "ecological-data-foundation"

# One log file per run under ./logs; the directory is created on import.
_LOG_DIR = Path("logs")
_LOG_DIR.mkdir(parents=True, exist_ok=True)
_log_file = _LOG_DIR / f"skill_{SKILL_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

# Every record is mirrored to stdout and to the UTF-8 encoded log file.
_log_handlers = [
    logging.StreamHandler(sys.stdout),
    logging.FileHandler(_log_file, encoding="utf-8"),
]
logging.basicConfig(
    handlers=_log_handlers,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="[%(asctime)s] [%(levelname)s] [" + SKILL_NAME + "] %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(SKILL_NAME)
31
+
32
def log_step(n: int, desc: str) -> None:
    """Emit an INFO record announcing pipeline step ``n`` and its description."""
    logger.log(logging.INFO, "-- STEP %d: %s", n, desc)
34
+
35
def log_decision(var: str, val, why: str) -> None:
    """Emit an INFO record documenting a choice: name, chosen value, rationale."""
    logger.log(logging.INFO, "DECISION | %s = %s | %s", var, val, why)
37
+
38
+
39
+ import os
40
+ import hashlib
41
+
42
+ import pandas as pd
43
+ import numpy as np
44
+
45
+
46
def load(path: str) -> pd.DataFrame:
    """Read the occurrence CSV at ``path`` into a DataFrame.

    On success the row and column counts are logged and the frame is
    returned. On any failure the problem is logged at ERROR level and the
    process terminates via ``sys.exit(1)``; no exception reaches the caller.

    Args:
        path: Location of the CSV file to read.

    Returns:
        The parsed table.

    Raises:
        SystemExit: If the file is missing or cannot be parsed.
    """
    try:
        frame = pd.read_csv(path, low_memory=False)
        n_rows, n_cols = len(frame), len(frame.columns)
        logger.info("Carregadas %d linhas, %d colunas de %s", n_rows, n_cols, path)
        return frame
    except FileNotFoundError:
        # Missing input: point the operator at the step expected to produce it.
        logger.error(
            "Input nao encontrado: %s\n Causa provavel: arquivo nao gerado pelo passo anterior.\n Verifique a saida de: ecological-data-foundation (download_from_gbif)\n Skill anterior: ecological-data-foundation",
            path,
        )
        sys.exit(1)
    except Exception as exc:
        # Any other read/parse problem is reported as an unreadable input.
        logger.error(
            "Falha ao ler CSV de entrada '%s': %s\n Causa provavel: arquivo corrompido ou formato invalido.\n Skill anterior: ecological-data-foundation",
            path, exc,
        )
        sys.exit(1)
63
+
64
+
65
def check_required_cols(df: pd.DataFrame, required: list) -> None:
    """Verify that every name in ``required`` is a column of ``df``.

    Args:
        df: Table to inspect.
        required: Column names that must be present.

    Raises:
        ValueError: If one or more required columns are absent; the missing
            names are logged at ERROR level first.
    """
    present = set(df.columns)
    # Preserve the order of ``required`` in the reported list.
    missing = [name for name in required if name not in present]
    if not missing:
        logger.info("Todas as colunas obrigatorias presentes: %s", required)
        return

    logger.error(
        "Colunas obrigatorias ausentes: %s\n Causa provavel: CSV gerado por fonte diferente ou schema alterado.\n Skill anterior: ecological-data-foundation",
        missing,
    )
    raise ValueError(f"Missing required columns: {missing}")
74
+
75
+
76
def flag_coordinate_issues(df: pd.DataFrame,
                           lat_col="decimalLatitude",
                           lon_col="decimalLongitude") -> pd.DataFrame:
    """Return a copy of ``df`` with a ``QA_status`` column describing coordinates.

    Every row starts as ``"OK"``. Coordinates are coerced to numbers
    (unparseable values become missing) and three checks are applied in
    order, later ones overwriting earlier ones on the same row:
    ``COORD_OUT_OF_RANGE`` (|lat| > 90 or |lon| > 180), ``COORD_ZERO``
    (lat == 0 and lon == 0) and ``MISSING_COORDS`` (either value missing).
    The input frame is left untouched.

    Args:
        df: Occurrence table containing the two coordinate columns.
        lat_col: Name of the latitude column.
        lon_col: Name of the longitude column.

    Returns:
        The flagged copy.
    """
    flagged = df.copy()
    flagged["QA_status"] = "OK"
    latitude = pd.to_numeric(flagged[lat_col], errors="coerce")
    longitude = pd.to_numeric(flagged[lon_col], errors="coerce")

    out_of_range = (latitude.abs() > 90) | (longitude.abs() > 180)
    at_origin = (latitude == 0) & (longitude == 0)
    no_coords = latitude.isna() | longitude.isna()

    # Order matters: a later label replaces an earlier one on the same row.
    checks = (
        (out_of_range, "COORD_OUT_OF_RANGE"),
        (at_origin, "COORD_ZERO"),
        (no_coords, "MISSING_COORDS"),
    )
    for mask, label in checks:
        flagged.loc[mask, "QA_status"] = label

    n_range, n_zero, n_na = (int(mask.sum()) for mask, _ in checks)

    logger.info(
        "Problemas de coordenadas — Fora do intervalo: %d | Zero: %d | Ausentes: %d",
        n_range, n_zero, n_na,
    )
    # One warning per problem category that actually occurred.
    if n_range > 0:
        logger.warning("Registros com coordenadas fora do intervalo valido: %d", n_range)
    if n_zero > 0:
        logger.warning("Registros com coordenadas zero (0,0): %d — possivelmente erros de digitacao", n_zero)
    if n_na > 0:
        logger.warning("Registros sem coordenadas (NA): %d", n_na)
    return flagged
111
+
112
+
113
def remove_exact_duplicates(df: pd.DataFrame,
                            cols=("scientificName","decimalLatitude","decimalLongitude","eventDate")
                            ) -> pd.DataFrame:
    """Drop rows that repeat an earlier row on the de-duplication columns.

    Only the names in ``cols`` that exist in ``df`` are used as the key; the
    first row of each duplicate group is kept. The key actually used is
    recorded through ``log_decision``.

    Args:
        df: Occurrence table.
        cols: Candidate key columns.

    Returns:
        The de-duplicated table.
    """
    key_cols = [name for name in cols if name in df.columns]
    log_decision(
        "dedup_cols",
        key_cols,
        "combinacao padrao para identificar duplicatas espaciotemporais",
    )
    deduped = df.drop_duplicates(subset=key_cols, keep="first")
    n_dropped = len(df) - len(deduped)
    if n_dropped == 0:
        logger.info("Nenhuma duplicata exata encontrada.")
    else:
        logger.warning("Duplicatas exatas removidas: %d", n_dropped)
    return deduped
130
+
131
+
132
def check_temporal(df: pd.DataFrame, date_col="eventDate") -> pd.DataFrame:
    """Flag records whose event date lies in the future.

    Dates are parsed leniently (unparseable values become ``NaT`` and are
    never flagged). Rows dated after the current instant and still marked
    ``"OK"`` in ``QA_status`` are relabelled ``"DATE_FUTURE"``; rows that
    already carry another flag keep it.

    Args:
        df: Occurrence table. If it has no ``QA_status`` column, one is
            created with every row set to ``"OK"``.
        date_col: Name of the column holding the event date.

    Returns:
        ``df`` itself when ``date_col`` is absent (check skipped); otherwise
        a flagged copy, so the caller's frame is not modified.
    """
    if date_col not in df.columns:
        logger.warning("Coluna '%s' ausente — verificacao temporal pulada.", date_col)
        return df

    # Work on a copy: avoids mutating the caller's frame and the
    # SettingWithCopyWarning pandas may raise on a filtered frame.
    df = df.copy()
    if "QA_status" not in df.columns:
        df["QA_status"] = "OK"

    # Parse everything as UTC and compare with a tz-aware "now": comparing
    # tz-aware dates (e.g. "...Z") to a tz-naive timestamp raises TypeError.
    dates = pd.to_datetime(df[date_col], errors="coerce", utc=True)
    future = dates > pd.Timestamp.now(tz="UTC")
    n_future = int(future.sum())
    df.loc[future & (df["QA_status"] == "OK"), "QA_status"] = "DATE_FUTURE"
    if n_future > 0:
        logger.warning("Registros com datas futuras sinalizados: %d", n_future)
    else:
        logger.info("Nenhum registro com data futura encontrado.")
    return df
145
+
146
+
147
def write_outputs(df: pd.DataFrame, output_dir: Path) -> None:
    """Write the cleaned data, the flagged records and a Markdown QA report.

    Creates ``output_dir`` if needed and writes three files into it:
    ``data_clean.csv`` (rows with ``QA_status == "OK"``),
    ``flagged_records.csv`` (all other rows) and ``qa_report.md`` (record
    totals plus a count per flag). A warning is logged when fewer than 30
    clean records remain.

    Args:
        df: Table carrying a ``QA_status`` column.
        output_dir: Destination directory.

    Raises:
        OSError: If any output file cannot be written (logged, then re-raised).
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    clean = df[df["QA_status"] == "OK"]
    flagged = df[df["QA_status"] != "OK"]

    try:
        clean.to_csv(output_dir / "data_clean.csv", index=False)
        logger.info("Gravado: %s", output_dir / "data_clean.csv")
        flagged.to_csv(output_dir / "flagged_records.csv", index=False)
        logger.info("Gravado: %s", output_dir / "flagged_records.csv")
    except OSError as e:
        logger.error(
            "Falha ao gravar arquivos de saida em '%s': %s\n Causa provavel: sem permissao de escrita no diretorio.\n Skill anterior: ecological-data-foundation",
            output_dir, e,
        )
        raise

    # QA report
    # NOTE(review): "Raw records" is len(df) as received here; in this file's
    # main() that frame has already been de-duplicated — confirm the label.
    flag_counts = flagged["QA_status"].value_counts().to_dict()
    lines = [
        "# QA Report — Occurrence Cleaning",
        f"- Raw records: {len(df):,}",
        f"- Clean records: {len(clean):,}",
        f"- Flagged records: {len(flagged):,}",
        "",
        "## Flag Counts",
    ] + [f"- `{k}`: {v}" for k, v in flag_counts.items()]

    try:
        # The title contains a non-ASCII dash: write UTF-8 explicitly rather
        # than relying on the platform default encoding, which may fail with
        # UnicodeEncodeError (not an OSError) or garble the report.
        (output_dir / "qa_report.md").write_text("\n".join(lines), encoding="utf-8")
        logger.info("Gravado: %s", output_dir / "qa_report.md")
    except OSError as e:
        logger.error(
            "Falha ao gravar relatorio QA: %s\n Causa provavel: sem permissao de escrita.\n Skill anterior: ecological-data-foundation",
            e,
        )
        raise

    if len(clean) < 30:
        logger.warning(
            "Apenas %d registros limpos apos todas as filtragens. SDMs requerem >= 30 registros confiaveis.",
            len(clean),
        )

    logger.info("Concluido. Limpos: %d | Sinalizados: %d", len(clean), len(flagged))
    logger.info("Saidas gravadas em: %s", output_dir)
193
+
194
+
195
def main():
    """Run the occurrence-cleaning pipeline from the command line.

    Positional arguments (both optional): the input CSV (default
    ``data/raw/occurrences.csv``) and the output directory (default
    ``data/processed``). Steps: load, check required columns, flag
    coordinate issues, remove exact duplicates, flag future dates, write
    outputs. A missing input or missing required column ends the process
    with exit status 1; failures in steps 3-5 are logged and re-raised.
    """

    def _guarded(stage, frame, failure_template):
        # Run one pipeline stage; log the failure with context, then re-raise.
        try:
            return stage(frame)
        except Exception as exc:
            logger.error(failure_template, exc)
            raise

    logger.info("Script: clean_occurrences.py | Skill: %s", SKILL_NAME)

    cli_args = sys.argv[1:]
    input_file = cli_args[0] if cli_args else "data/raw/occurrences.csv"
    output_dir = Path(cli_args[1] if len(cli_args) > 1 else "data/processed")

    logger.info("Input file : %s", input_file)
    logger.info("Output dir : %s", output_dir)

    # Input precondition check
    if not Path(input_file).exists():
        logger.error(
            "Input nao encontrado: %s\n Causa provavel: arquivo nao gerado pelo passo anterior.\n Verifique a saida de: ecological-data-foundation (download_from_gbif)\n Skill anterior: ecological-data-foundation",
            input_file,
        )
        sys.exit(1)

    log_decision("input_file", input_file, "caminho passado como argv[1] ou padrao")
    log_decision("output_dir", str(output_dir), "caminho passado como argv[2] ou padrao")

    log_step(1, "Carregar dados brutos de ocorrencias")
    df = load(input_file)

    # An empty input is reported but does not stop the pipeline.
    if len(df) == 0:
        logger.warning("Arquivo de entrada nao contem registros: %s", input_file)

    log_step(2, "Verificar colunas obrigatorias")
    try:
        check_required_cols(df, ["decimalLatitude", "decimalLongitude"])
    except ValueError as exc:
        logger.error("Verificacao de colunas falhou: %s", exc)
        sys.exit(1)

    log_step(3, "Sinalizar problemas de coordenadas")
    log_decision(
        "cc_tests",
        "out_of_range,zero,missing",
        "verificacoes basicas de qualidade de coordenadas sem dependencias externas",
    )
    df = _guarded(
        flag_coordinate_issues,
        df,
        "Falha na sinalizacao de coordenadas: %s\n Causa provavel: colunas de coordenadas com tipos inesperados.\n Skill anterior: ecological-data-foundation",
    )

    log_step(4, "Remover duplicatas exatas")
    df = _guarded(
        remove_exact_duplicates,
        df,
        "Falha ao remover duplicatas: %s\n Causa provavel: colunas de deduplicacao ausentes ou mal tipadas.\n Skill anterior: ecological-data-foundation",
    )

    log_step(5, "Verificar datas futuras")
    df = _guarded(
        check_temporal,
        df,
        "Falha na verificacao temporal: %s\n Causa provavel: coluna eventDate com formato inesperado.\n Skill anterior: ecological-data-foundation",
    )

    log_step(6, "Escrever arquivos de saida e relatorio QA")
    write_outputs(df, output_dir)


if __name__ == "__main__":
    main()
@@ -0,0 +1,251 @@
1
+ # ecological-agent-skills / Copyright (C) 2026 Francisco Diego Barros Barata
2
+ # SPDX-License-Identifier: GPL-3.0-or-later
3
+
4
+ # Usage: Rscript download_from_ebird.R <ebd_file> <species_name_or_list_csv> <output_dir> [year_from] [year_to] [country_code]
5
+
6
# ── Inline logger ─────────────────────────────────────────────────────────────
# Minimal logger: each helper formats its arguments with sprintf() and hands
# the result to message(), prefixed by a timestamp and a level tag.
SKILL_NAME <- "ecological-data-foundation"

# Current time rendered as "[YYYY-MM-DD HH:MM:SS]".
.log_ts <- function() {
  format(Sys.time(), "[%Y-%m-%d %H:%M:%S]")
}

# Shared emitter used by the level-specific helpers below. `level_tag` comes
# after `...` so it can only be supplied by its full name.
.log_emit <- function(..., level_tag) {
  message(.log_ts(), level_tag, sprintf(...))
}

# Level helpers: arguments are forwarded unchanged to sprintf().
log_info <- function(...) .log_emit(..., level_tag = " [INFO] ")
log_warn <- function(...) .log_emit(..., level_tag = " [WARN] ")
log_error <- function(...) .log_emit(..., level_tag = " [ERROR] ")

# Announce pipeline step number `n` with description `d`.
log_step <- function(n, d) {
  log_info("-- STEP %d: %s", n, d)
}

# Record an analysis decision: variable `v`, chosen value `val`, rationale `why`.
log_decision <- function(v, val, why) {
  log_info("DECISION | %s = %s | %s", v, val, why)
}

# Create ./logs (relative to the working directory) if it is absent.
dir.create("logs", showWarnings = FALSE, recursive = TRUE)
15
+
16
+ #
17
+ # Arguments:
18
+ # ebd_file : Path to the eBird Basic Dataset (EBD) text file
19
+ # (pre-downloaded from https://ebird.org/data/download)
20
+ # species_name_or_list_csv : Common or scientific name, or CSV with column "scientificName"
21
+ # output_dir : Directory to write outputs (created if absent)
22
+ # year_from : Minimum year of observation (optional, default: 2000)
23
+ # year_to : Maximum year of observation (optional, default: current year)
24
+ # country_code : ISO 3166-1 alpha-2 country code to filter (optional)
25
+ #
26
+ # Note: eBird data requires a pre-downloaded EBD file; this script parses it locally.
27
+ # Apply for access at: https://ebird.org/data/download
28
+ #
29
+ # Outputs (per species):
30
+ # occurrences_raw_eBird_{species}_{date}.csv — standardised occurrence records
31
+ # download_metadata_eBird_{species}.txt — download provenance and citation
32
+ #
33
+ # Standard output schema:
34
+ # species, decimalLatitude, decimalLongitude, eventDate, countryCode,
35
+ # basisOfRecord, coordinateUncertaintyInMeters, datasetName, occurrenceID,
36
+ # source, download_doi
37
+ # Extra eBird columns:
38
+ # effort_distance_km, duration_minutes, observer_id
39
+
40
+ suppressPackageStartupMessages(library(auk))
41
+ suppressPackageStartupMessages(library(dplyr))
42
+ suppressPackageStartupMessages(library(readr))
43
+
44
# ── 1. Parse arguments ───────────────────────────────────────────────────────
# Positional arguments:
#   <ebd_file> <species_name_or_list_csv> <output_dir>
#   [year_from] [year_to] [country_code]
# With fewer than three arguments the script falls back to built-in test
# defaults instead of aborting. An empty string ("") for an optional argument
# selects that argument's default, so a country code can be supplied without
# spelling out the years.
log_step(1, "Analisar argumentos da linha de comando")
args <- commandArgs(trailingOnly = TRUE)

default_year_from <- 2000L
current_year <- as.integer(format(Sys.Date(), "%Y"))

# Read the optional year at position `idx` of `args`.
# Returns `default` when the argument is absent or empty; logs and aborts when
# it is present but cannot be converted to an integer.
.parse_year_arg <- function(args, idx, default, label) {
  if (length(args) < idx || !nzchar(args[idx])) {
    return(default)
  }
  # The conversion warning is suppressed because the NA case is reported
  # explicitly right below, with the offending value in the message.
  year <- suppressWarnings(as.integer(args[idx]))
  if (is.na(year)) {
    log_error(
      "Argumento %s invalido: '%s'\nCausa provavel: ano nao numerico.\nSkill anterior: ecological-data-foundation",
      label, args[idx]
    )
    stop("Invalid ", label, ": '", args[idx], "'", call. = FALSE)
  }
  year
}

if (length(args) < 3) {
  ebd_file <- "data/ebird/ebd_sample.txt"
  species_input <- "Jabiru mycteria"
  output_dir <- "output/ebird"
  year_from <- default_year_from
  year_to <- current_year
  country_code <- NULL
  log_warn("Menos de 3 argumentos fornecidos. Usando valores padrao para teste.")
} else {
  ebd_file <- args[1]
  species_input <- args[2]
  output_dir <- args[3]
  year_from <- .parse_year_arg(args, 4L, default_year_from, "year_from")
  year_to <- .parse_year_arg(args, 5L, current_year, "year_to")
  country_code <- if (length(args) >= 6 && nzchar(args[6])) args[6] else NULL
}

# An inverted range would otherwise only fail later, inside the date filter.
if (year_from > year_to) {
  log_error(
    "Intervalo de anos invalido: year_from (%d) > year_to (%d).\nSkill anterior: ecological-data-foundation",
    year_from, year_to
  )
  stop("year_from must not be greater than year_to", call. = FALSE)
}

# Plain if/else: country_code is either NULL or a single string.
country_label <- if (is.null(country_code)) "nenhum" else country_code

log_info("Script: download_from_ebird.R | Skill: %s", SKILL_NAME)
log_info("EBD file : %s", ebd_file)
log_info("Species input : %s", species_input)
log_info("Output dir : %s", output_dir)
log_info("Year range : %d - %d", year_from, year_to)
log_info("Country code : %s", country_label)

log_decision("protocol", "STATIONARY,TRAVELING",
             "apenas protocolos quantificaveis para modelagem de avistamentos")
log_decision("approved", "TRUE",
             "apenas listas aprovadas pelo eBird (revisao de qualidade aplicada)")
# The EBD filter in step 5 also calls auk_complete(); record that decision too.
log_decision("complete_checklists", "TRUE",
             "apenas listas completas (auk_complete)")
log_decision("year_from", year_from, "filtro temporal; 2000 equilibra tamanho e qualidade")
77
+
78
# ── 2. Check EBD file exists ─────────────────────────────────────────────────
# The EBD must already be on disk; abort with guidance when it is not.
log_step(2, "Verificar existencia do arquivo EBD")
ebd_present <- file.exists(ebd_file)
if (ebd_present) {
  log_info("Arquivo EBD encontrado: %s", ebd_file)
} else {
  log_error(
    "Input nao encontrado: %s\nCausa provavel: arquivo EBD nao baixado do eBird.\nVerifique: https://ebird.org/data/download — solicite acesso e baixe o EBD.\nSkill anterior: ecological-data-foundation",
    ebd_file
  )
  stop("EBD file not found: ", ebd_file)
}

# ── 3. Create output directory ───────────────────────────────────────────────
# Creates missing parent directories as well; an existing directory is left
# as it is and no warning is shown.
log_step(3, "Criar diretorio de saida")
dir.create(path = output_dir, showWarnings = FALSE, recursive = TRUE)
92
+
93
# ── 4. Build species list ────────────────────────────────────────────────────
# `species_input` is either a single species name or the path to a CSV with a
# "scientificName" column (batch mode). The result is `species_list`, a
# character vector of unique, non-empty names.
log_step(4, "Construir lista de especies")
looks_like_csv <- grepl("\\.csv$", species_input, ignore.case = TRUE)

# A ".csv" argument that does not exist is a wrong path, not a species name.
# Failing here avoids a confusing "species not found" error from auk later.
if (looks_like_csv && !file.exists(species_input)) {
  log_error(
    "Input nao encontrado: %s\nCausa provavel: caminho do CSV de especies incorreto.\nSkill anterior: ecological-data-foundation",
    species_input
  )
  stop("Species list CSV not found: ", species_input, call. = FALSE)
}

if (looks_like_csv) {
  # Only the read itself is wrapped, so each failure is logged exactly once
  # and with the cause that applies to it.
  species_df <- tryCatch(
    read_csv(species_input, show_col_types = FALSE),
    error = function(e) {
      log_error(
        "Falha ao ler lista de especies: %s\nCausa provavel: CSV invalido.\nSkill anterior: ecological-data-foundation",
        conditionMessage(e)
      )
      stop(e)
    }
  )

  if (!"scientificName" %in% names(species_df)) {
    log_error(
      "Coluna 'scientificName' nao encontrada em: %s\nCausa provavel: CSV mal formatado.\nSkill anterior: ecological-data-foundation",
      species_input
    )
    stop("Missing column 'scientificName'")
  }

  # Drop missing and blank entries before de-duplicating.
  species_names <- trimws(as.character(species_df$scientificName))
  species_list <- unique(species_names[!is.na(species_names) & nzchar(species_names)])

  if (length(species_list) == 0) {
    log_error(
      "Nenhuma especie valida em: %s\nCausa provavel: coluna 'scientificName' vazia.\nSkill anterior: ecological-data-foundation",
      species_input
    )
    stop("No species names found in: ", species_input, call. = FALSE)
  }

  log_info("Modo batch: %d especies carregadas", length(species_list))
  log_decision("mode", "batch", "CSV valido com coluna scientificName")
} else {
  species_list <- trimws(species_input)
  if (!nzchar(species_list)) {
    log_error(
      "Nome de especie vazio.\nCausa provavel: argumento de especie em branco.\nSkill anterior: ecological-data-foundation"
    )
    stop("Species name is empty", call. = FALSE)
  }
  log_info("Modo especie unica: %s", species_list)
  log_decision("mode", "single_species", "argumento nao e arquivo CSV")
}
120
+
121
# ── 5. Filter and parse EBD ──────────────────────────────────────────────────
log_step(5, "Filtrar e carregar EBD com auk")

# One filtered extract covering every requested species is written to a
# temporary file and then read back.
tmp_filtered <- tempfile(fileext = ".txt")

# Error handlers: log the failure with its likely cause, then re-signal the
# same condition so the script still aborts.
.on_filter_setup_error <- function(e) {
  log_error(
    "Falha ao configurar filtros auk: %s\nCausa provavel: nome de especie nao encontrado no EBD ou parametros invalidos.\nVerifique nomes usando auk_species_codes().\nSkill anterior: ecological-data-foundation",
    conditionMessage(e)
  )
  stop(e)
}

.on_filter_run_error <- function(e) {
  log_error(
    "Falha ao filtrar ou ler EBD: %s\nCausa provavel: arquivo EBD corrompido ou formato incompativel com versao do auk.\nVerifique: auk::auk_version_requirements().\nSkill anterior: ecological-data-foundation",
    conditionMessage(e)
  )
  stop(e)
}

# Define the filters one at a time: species, date window, protocol and
# complete checklists, plus country when one was requested.
auk_filter_obj <- tryCatch(
  {
    flt <- auk_ebd(ebd_file)
    flt <- auk_species(flt, species_list)
    flt <- auk_date(
      flt,
      date = c(paste0(year_from, "-01-01"), paste0(year_to, "-12-31"))
    )
    flt <- auk_protocol(flt, c("Stationary", "Traveling"))
    flt <- auk_complete(flt)
    if (is.null(country_code)) {
      flt
    } else {
      flt <- auk_country(flt, country_code)
      flt
    }
  },
  error = .on_filter_setup_error
)

# Run the filter into the temp file, then load the result.
ebd_filtered <- tryCatch(
  {
    auk_filter(auk_filter_obj, file = tmp_filtered, overwrite = TRUE)
    read_ebd(tmp_filtered)
  },
  error = .on_filter_run_error
)

n_filtered <- nrow(ebd_filtered)
log_info("Registros apos filtragem EBD: %d", n_filtered)

# An empty result is reported but does not stop the script.
if (n_filtered == 0) {
  log_warn("Nenhum registro encontrado apos filtragem. Verifique nomes das especies e periodo.")
}
163
+
164
# ── 6. Standardise and save per-species ─────────────────────────────────────
# For every requested species: select its rows from the filtered EBD, map them
# to the standard occurrence schema (plus eBird effort columns), write the CSV,
# then write a provenance/metadata text file next to it.
log_step(6, "Padronizar e gravar CSVs por especie")
today_str <- format(Sys.Date(), "%Y%m%d")

# The citation must name the EBD release actually used, not the date of this
# run. Assumes the standard eBird download naming, where the file name carries
# a "relMon-YYYY" tag (e.g. ebd_relMar-2024.txt); when it is absent the version
# is reported as unknown rather than invented.
ebd_basename <- basename(ebd_file)
ebd_release <- regmatches(
  ebd_basename,
  regexpr("rel[A-Za-z]{3}-[0-9]{4}", ebd_basename)
)
if (length(ebd_release) == 0) {
  log_warn(
    "Versao do EBD nao identificada no nome do arquivo: %s. Informe a versao manualmente ao citar.",
    ebd_basename
  )
  citation_line <- paste0(
    "Citation: eBird Basic Dataset. Version: unknown (not encoded in EBD file name). ",
    "Cornell Lab of Ornithology, Ithaca, New York."
  )
} else {
  release_label <- sub("-", " ", sub("^rel", "", ebd_release))
  citation_line <- paste0(
    "Citation: eBird Basic Dataset. Version: EBD_", ebd_release,
    ". Cornell Lab of Ornithology, Ithaca, New York. ", release_label, "."
  )
}

# Plain if/else: country_code is either NULL or a single string.
country_label <- if (is.null(country_code)) "none" else country_code

for (sp in species_list) {
  # %in% never returns NA, so records with a missing scientific or common name
  # cannot be selected as all-NA rows (which `==` would do).
  is_target <- ebd_filtered$scientific_name %in% sp |
    ebd_filtered$common_name %in% sp
  sp_data <- ebd_filtered[is_target, ]
  n_sp <- nrow(sp_data)
  log_info("Especie '%s': %d registros", sp, n_sp)

  if (n_sp == 0) {
    log_warn("Nenhum registro para '%s'. Pulando.", sp)
    next
  }

  if (n_sp < 30) {
    log_warn(
      "Registros insuficientes para SDM confiavel para '%s' (n = %d). Considere ampliar periodo ou area geografica.",
      sp, n_sp
    )
  }

  # File-name-safe species label: whitespace and characters that are path
  # separators or reserved in file names become "_" (names may contain "/").
  safe_name <- gsub("[[:space:]/\\\\:*?\"<>|]", "_", sp)

  # Standardised schema + extra eBird columns
  std <- data.frame(
    species = sp,
    decimalLatitude = as.numeric(sp_data$latitude),
    decimalLongitude = as.numeric(sp_data$longitude),
    eventDate = as.character(sp_data$observation_date),
    countryCode = as.character(sp_data$country_code),
    basisOfRecord = "HUMAN_OBSERVATION",
    coordinateUncertaintyInMeters = NA_real_,
    datasetName = "eBird Basic Dataset",
    occurrenceID = as.character(sp_data$sampling_event_identifier),
    source = "eBird",
    download_doi = NA_character_,
    effort_distance_km = as.numeric(sp_data$effort_distance_km),
    duration_minutes = as.numeric(sp_data$duration_minutes),
    observer_id = as.character(sp_data$observer_id),
    stringsAsFactors = FALSE
  )

  # Records without usable coordinates are dropped.
  std <- std[!is.na(std$decimalLatitude) & !is.na(std$decimalLongitude), ]

  csv_path <- file.path(
    output_dir,
    paste0("occurrences_raw_eBird_", safe_name, "_", today_str, ".csv")
  )
  # A failed CSV write is fatal: log the context, then re-signal the error.
  tryCatch({
    write_csv(std, csv_path)
    log_info("Gravado: %s (%d registros)", csv_path, nrow(std))
  }, error = function(e) {
    log_error(
      "Falha ao gravar CSV para '%s': %s\nCausa provavel: sem permissao de escrita.\nSkill anterior: ecological-data-foundation",
      sp, conditionMessage(e)
    )
    stop(e)
  })

  # Metadata
  meta_lines <- c(
    paste("Species:", sp),
    "Source: eBird Basic Dataset (https://ebird.org/data/download)",
    "Protocols: Stationary, Traveling",
    "Approved only: TRUE",
    # Records the auk_complete() filter applied when the EBD was filtered.
    "Complete checklists only: TRUE",
    paste("Year range:", year_from, "-", year_to),
    paste("Country filter:", country_label),
    paste("n_records:", nrow(std)),
    paste("Download date:", Sys.Date()),
    citation_line,
    "Note: eBird data requires a signed Data Use Agreement. Cite the dataset version used."
  )
  meta_path <- file.path(
    output_dir,
    paste0("download_metadata_eBird_", safe_name, ".txt")
  )
  # Metadata is best-effort: a failure is logged but does not stop the loop.
  tryCatch({
    writeLines(meta_lines, meta_path)
    log_info("Metadados gravados: %s", meta_path)
  }, error = function(e) {
    log_error(
      "Falha ao gravar metadados para '%s': %s\nSkill anterior: ecological-data-foundation",
      sp, conditionMessage(e)
    )
  })
}

# Clean up temp file. unlink() returns invisibly, so nothing is auto-printed
# to stdout under Rscript, and a file that is already gone is not an error.
unlink(tmp_filtered)

log_info("Todos os downloads eBird concluidos. Verifique: %s", output_dir)