ecological-agent-skills 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. package/AGENT_CONTEXT.md +191 -0
  2. package/CATALOG.md +329 -0
  3. package/LICENSE +692 -0
  4. package/README.md +347 -0
  5. package/bin/install.mjs +168 -0
  6. package/docs/comparison-with-alternatives.md +38 -0
  7. package/docs/global-examples-index.md +103 -0
  8. package/docs/repository-statistics.md +101 -0
  9. package/docs/theoretical-foundations.md +188 -0
  10. package/environment.yaml +106 -0
  11. package/examples/community/arctic_tundra_vegetation_example.md +247 -0
  12. package/examples/community/bird_landuse_example.md +63 -0
  13. package/examples/community/phytoplankton_reservoir_example.md +60 -0
  14. package/examples/community/reef_fish_indopacific_example.md +221 -0
  15. package/examples/impact/baci_road_example.md +57 -0
  16. package/examples/impact/ecosystem_services_atlantic_forest.md +83 -0
  17. package/examples/impact/forest_loss_borneo_timeseries_example.md +225 -0
  18. package/examples/occupancy/puma_camera_example.md +61 -0
  19. package/examples/occupancy/snow_leopard_himalayas_example.md +204 -0
  20. package/examples/reproducible/whittaker_biome_sdm_example.md +406 -0
  21. package/examples/sdm/anteater_cerrado_example.md +69 -0
  22. package/examples/sdm/jaguar_amazon_example.md +80 -0
  23. package/examples/sdm/koala_climate_change_example.md +170 -0
  24. package/examples/sdm/wolf_recolonization_europe_example.md +193 -0
  25. package/package.json +43 -0
  26. package/renv.lock +194 -0
  27. package/skills/SKILL_INDEX.json +1020 -0
  28. package/skills/acoustic-monitoring/SKILL.md +163 -0
  29. package/skills/acoustic-monitoring/examples/example-prompts.md +100 -0
  30. package/skills/acoustic-monitoring/examples/temperate_forest_birds_example.md +285 -0
  31. package/skills/acoustic-monitoring/resources/acoustic-indices-reference.md +93 -0
  32. package/skills/acoustic-monitoring/resources/soundscape-ecology-guide.md +90 -0
  33. package/skills/acoustic-monitoring/resources/species-id-tools-comparison.md +89 -0
  34. package/skills/acoustic-monitoring/scripts/batch_species_detection.py +360 -0
  35. package/skills/acoustic-monitoring/scripts/compute_acoustic_indices.R +235 -0
  36. package/skills/acoustic-monitoring/scripts/compute_acoustic_indices.py +374 -0
  37. package/skills/biostatistics-workbench/SKILL.md +140 -0
  38. package/skills/biostatistics-workbench/examples/example-prompts.md +39 -0
  39. package/skills/biostatistics-workbench/resources/effect-size-reference.md +81 -0
  40. package/skills/biostatistics-workbench/resources/glm-family-link-reference.md +47 -0
  41. package/skills/biostatistics-workbench/resources/test-selection-guide.md +93 -0
  42. package/skills/biostatistics-workbench/scripts/glm_pipeline.R +78 -0
  43. package/skills/biostatistics-workbench/scripts/glm_pipeline.py +210 -0
  44. package/skills/camera-trap-processing/SKILL.md +159 -0
  45. package/skills/camera-trap-processing/examples/example-prompts.md +103 -0
  46. package/skills/camera-trap-processing/examples/leopard_serengeti_example.md +231 -0
  47. package/skills/camera-trap-processing/resources/activity-patterns-reference.md +113 -0
  48. package/skills/camera-trap-processing/resources/camtrapR-workflow-guide.md +130 -0
  49. package/skills/camera-trap-processing/resources/detection-event-definition-guide.md +89 -0
  50. package/skills/camera-trap-processing/scripts/estimate_activity.R +169 -0
  51. package/skills/camera-trap-processing/scripts/process_camtrap_data.R +179 -0
  52. package/skills/camera-trap-processing/scripts/process_camtrap_data.py +192 -0
  53. package/skills/community-ecology-ordination/SKILL.md +133 -0
  54. package/skills/community-ecology-ordination/examples/example-prompts.md +35 -0
  55. package/skills/community-ecology-ordination/resources/dissimilarity-metric-guide.md +53 -0
  56. package/skills/community-ecology-ordination/resources/nmds-interpretation-guide.md +104 -0
  57. package/skills/community-ecology-ordination/scripts/__pycache__/community_analysis.cpython-311.pyc +0 -0
  58. package/skills/community-ecology-ordination/scripts/community_analysis.R +143 -0
  59. package/skills/community-ecology-ordination/scripts/community_analysis.py +231 -0
  60. package/skills/ecological-data-foundation/SKILL.md +129 -0
  61. package/skills/ecological-data-foundation/examples/example-prompts.md +40 -0
  62. package/skills/ecological-data-foundation/resources/coordinate-cleaning-flags.md +66 -0
  63. package/skills/ecological-data-foundation/resources/darwin-core-glossary.md +91 -0
  64. package/skills/ecological-data-foundation/resources/data-citation-guide.md +265 -0
  65. package/skills/ecological-data-foundation/resources/gbif-data-citation-guide.md +193 -0
  66. package/skills/ecological-data-foundation/resources/qa-checklist.md +83 -0
  67. package/skills/ecological-data-foundation/scripts/__pycache__/clean_occurrences.cpython-311.pyc +0 -0
  68. package/skills/ecological-data-foundation/scripts/__pycache__/download_from_ebird.cpython-311.pyc +0 -0
  69. package/skills/ecological-data-foundation/scripts/__pycache__/download_from_inat.cpython-311.pyc +0 -0
  70. package/skills/ecological-data-foundation/scripts/__pycache__/download_from_iucn.cpython-311.pyc +0 -0
  71. package/skills/ecological-data-foundation/scripts/__pycache__/download_from_obis.cpython-311.pyc +0 -0
  72. package/skills/ecological-data-foundation/scripts/clean_occurrences.R +230 -0
  73. package/skills/ecological-data-foundation/scripts/clean_occurrences.py +268 -0
  74. package/skills/ecological-data-foundation/scripts/download_from_ebird.R +251 -0
  75. package/skills/ecological-data-foundation/scripts/download_from_ebird.py +364 -0
  76. package/skills/ecological-data-foundation/scripts/download_from_gbif.R +315 -0
  77. package/skills/ecological-data-foundation/scripts/download_from_gbif.py +407 -0
  78. package/skills/ecological-data-foundation/scripts/download_from_inat.R +238 -0
  79. package/skills/ecological-data-foundation/scripts/download_from_inat.py +304 -0
  80. package/skills/ecological-data-foundation/scripts/download_from_iucn.R +273 -0
  81. package/skills/ecological-data-foundation/scripts/download_from_iucn.py +344 -0
  82. package/skills/ecological-data-foundation/scripts/download_from_obis.R +248 -0
  83. package/skills/ecological-data-foundation/scripts/download_from_obis.py +318 -0
  84. package/skills/ecological-impact-assessment/SKILL.md +123 -0
  85. package/skills/ecological-impact-assessment/examples/example-prompts.md +32 -0
  86. package/skills/ecological-impact-assessment/resources/baci-design-guide.md +55 -0
  87. package/skills/ecological-impact-assessment/resources/fragmentation-metrics-reference.md +86 -0
  88. package/skills/ecological-impact-assessment/resources/pressure-index-template.md +78 -0
  89. package/skills/ecological-impact-assessment/resources/study-design-guide.md +168 -0
  90. package/skills/ecological-impact-assessment/scripts/baci_analysis.R +161 -0
  91. package/skills/ecological-impact-assessment/scripts/fragmentation_analysis.py +141 -0
  92. package/skills/ecological-impact-assessment/scripts/power_analysis_baci.R +274 -0
  93. package/skills/ecosystem-services-assessment/SKILL.md +125 -0
  94. package/skills/ecosystem-services-assessment/examples/example-prompts.md +24 -0
  95. package/skills/ecosystem-services-assessment/resources/es-indicator-reference.md +45 -0
  96. package/skills/ecosystem-services-assessment/resources/invest-parameter-guide.md +86 -0
  97. package/skills/ecosystem-services-assessment/resources/rusle-coefficients.md +88 -0
  98. package/skills/ecosystem-services-assessment/scripts/__pycache__/compute_es.cpython-311.pyc +0 -0
  99. package/skills/ecosystem-services-assessment/scripts/compute_es.py +189 -0
  100. package/skills/ecosystem-services-assessment/scripts/tradeoff_analysis.R +161 -0
  101. package/skills/environmental-time-series/SKILL.md +125 -0
  102. package/skills/environmental-time-series/examples/example-prompts.md +33 -0
  103. package/skills/environmental-time-series/resources/anomaly-indices-reference.md +88 -0
  104. package/skills/environmental-time-series/resources/bfast-parameter-guide.md +69 -0
  105. package/skills/environmental-time-series/scripts/__pycache__/recovery_trajectory.cpython-311.pyc +0 -0
  106. package/skills/environmental-time-series/scripts/__pycache__/trend_analysis.cpython-311.pyc +0 -0
  107. package/skills/environmental-time-series/scripts/recovery_trajectory.R +305 -0
  108. package/skills/environmental-time-series/scripts/recovery_trajectory.py +178 -0
  109. package/skills/environmental-time-series/scripts/trend_analysis.R +192 -0
  110. package/skills/environmental-time-series/scripts/trend_analysis.py +184 -0
  111. package/skills/geoprocessing-for-ecology/SKILL.md +123 -0
  112. package/skills/geoprocessing-for-ecology/examples/example-prompts.md +32 -0
  113. package/skills/geoprocessing-for-ecology/resources/crs-reference.md +62 -0
  114. package/skills/geoprocessing-for-ecology/resources/global-predictor-sources.md +331 -0
  115. package/skills/geoprocessing-for-ecology/resources/resampling-methods.md +57 -0
  116. package/skills/geoprocessing-for-ecology/scripts/__pycache__/download_predictors.cpython-311.pyc +0 -0
  117. package/skills/geoprocessing-for-ecology/scripts/download_predictors.R +239 -0
  118. package/skills/geoprocessing-for-ecology/scripts/download_predictors.py +379 -0
  119. package/skills/geoprocessing-for-ecology/scripts/stack_and_extract.R +224 -0
  120. package/skills/geoprocessing-for-ecology/scripts/stack_and_extract.py +172 -0
  121. package/skills/landscape-connectivity/SKILL.md +170 -0
  122. package/skills/landscape-connectivity/examples/example-prompts.md +96 -0
  123. package/skills/landscape-connectivity/examples/jaguar_mesoamerica_corridor_example.md +271 -0
  124. package/skills/landscape-connectivity/resources/circuitscape-parameter-guide.md +155 -0
  125. package/skills/landscape-connectivity/resources/graph-theory-for-ecology.md +134 -0
  126. package/skills/landscape-connectivity/resources/resistance-surface-guide.md +141 -0
  127. package/skills/landscape-connectivity/scripts/connectivity_analysis.py +387 -0
  128. package/skills/landscape-connectivity/scripts/connectivity_metrics.R +274 -0
  129. package/skills/landscape-connectivity/scripts/resistance_surface.R +239 -0
  130. package/skills/model-validation-and-uncertainty/SKILL.md +131 -0
  131. package/skills/model-validation-and-uncertainty/examples/example-prompts.md +30 -0
  132. package/skills/model-validation-and-uncertainty/resources/extrapolation-risk-guide.md +236 -0
  133. package/skills/model-validation-and-uncertainty/resources/metric-selection-guide.md +52 -0
  134. package/skills/model-validation-and-uncertainty/resources/threshold-selection-guide.md +64 -0
  135. package/skills/model-validation-and-uncertainty/scripts/__pycache__/validate_model.cpython-311.pyc +0 -0
  136. package/skills/model-validation-and-uncertainty/scripts/extrapolation_risk.R +315 -0
  137. package/skills/model-validation-and-uncertainty/scripts/validate_model.py +226 -0
  138. package/skills/model-validation-and-uncertainty/scripts/validate_sdm.R +162 -0
  139. package/skills/occupancy-and-detection/SKILL.md +126 -0
  140. package/skills/occupancy-and-detection/examples/example-prompts.md +33 -0
  141. package/skills/occupancy-and-detection/resources/detection-history-format.md +100 -0
  142. package/skills/occupancy-and-detection/resources/occupancy-study-design.md +47 -0
  143. package/skills/occupancy-and-detection/scripts/__pycache__/occupancy_analysis.cpython-311.pyc +0 -0
  144. package/skills/occupancy-and-detection/scripts/occupancy_analysis.R +160 -0
  145. package/skills/occupancy-and-detection/scripts/occupancy_analysis.py +159 -0
  146. package/skills/population-viability-analysis/SKILL.md +161 -0
  147. package/skills/population-viability-analysis/examples/african_elephant_pva_example.md +266 -0
  148. package/skills/population-viability-analysis/examples/example-prompts.md +95 -0
  149. package/skills/population-viability-analysis/resources/extinction-risk-thresholds.md +128 -0
  150. package/skills/population-viability-analysis/resources/matrix-model-guide.md +139 -0
  151. package/skills/population-viability-analysis/resources/sensitivity-elasticity-reference.md +182 -0
  152. package/skills/population-viability-analysis/scripts/matrix_pva.R +258 -0
  153. package/skills/population-viability-analysis/scripts/pva_analysis.py +442 -0
  154. package/skills/population-viability-analysis/scripts/stochastic_pva.R +353 -0
  155. package/skills/predictive-modeling-best-practices/SKILL.md +136 -0
  156. package/skills/predictive-modeling-best-practices/examples/example-prompts.md +58 -0
  157. package/skills/predictive-modeling-best-practices/resources/collinearity-decision-tree.md +65 -0
  158. package/skills/predictive-modeling-best-practices/resources/sampling-bias-correction.md +267 -0
  159. package/skills/predictive-modeling-best-practices/resources/spatial-cv-guide.md +73 -0
  160. package/skills/predictive-modeling-best-practices/scripts/__pycache__/spatial_cv.cpython-311.pyc +0 -0
  161. package/skills/predictive-modeling-best-practices/scripts/collinearity_check.R +112 -0
  162. package/skills/predictive-modeling-best-practices/scripts/spatial_cv.py +182 -0
  163. package/skills/reproducible-ecology-pipeline/SKILL.md +139 -0
  164. package/skills/reproducible-ecology-pipeline/examples/example-prompts.md +35 -0
  165. package/skills/reproducible-ecology-pipeline/resources/directory-structure-template.md +94 -0
  166. package/skills/reproducible-ecology-pipeline/resources/params-yaml-template.yaml +84 -0
  167. package/skills/reproducible-ecology-pipeline/resources/reproducibility-checklist-template.md +66 -0
  168. package/skills/reproducible-ecology-pipeline/scripts/generate_file_manifest.py +110 -0
  169. package/skills/reproducible-ecology-pipeline/scripts/init_project.sh +53 -0
  170. package/skills/spatial-prioritization/SKILL.md +162 -0
  171. package/skills/spatial-prioritization/examples/biodiversity_hotspot_prioritization_example.md +289 -0
  172. package/skills/spatial-prioritization/examples/example-prompts.md +93 -0
  173. package/skills/spatial-prioritization/resources/cost-surface-reference.md +130 -0
  174. package/skills/spatial-prioritization/resources/marxan-vs-prioritizr-comparison.md +125 -0
  175. package/skills/spatial-prioritization/resources/prioritizr-formulation-guide.md +188 -0
  176. package/skills/spatial-prioritization/resources/representation-targets-guide.md +186 -0
  177. package/skills/spatial-prioritization/scripts/prioritization_sensitivity.R +320 -0
  178. package/skills/spatial-prioritization/scripts/run_prioritization.R +336 -0
  179. package/skills/species-distribution-modeling/SKILL.md +139 -0
  180. package/skills/species-distribution-modeling/examples/example-prompts.md +36 -0
  181. package/skills/species-distribution-modeling/resources/algorithm-comparison.md +25 -0
  182. package/skills/species-distribution-modeling/resources/calibration-area-guide.md +71 -0
  183. package/skills/species-distribution-modeling/resources/climate-scenario-preparation.md +170 -0
  184. package/skills/species-distribution-modeling/resources/maxent-calibration-guide.md +211 -0
  185. package/skills/species-distribution-modeling/resources/sdm-checklist.md +37 -0
  186. package/skills/species-distribution-modeling/scripts/predict_distribution.R +236 -0
  187. package/skills/species-distribution-modeling/scripts/predict_distribution.py +286 -0
  188. package/skills/species-distribution-modeling/scripts/prepare_future_layers.R +351 -0
  189. package/skills/species-distribution-modeling/scripts/project_scenarios.R +220 -0
  190. package/skills/species-distribution-modeling/scripts/run_ensemble_sdm.R +99 -0
  191. package/skills/species-distribution-modeling/scripts/sdm_pipeline.py +318 -0
  192. package/skills/species-distribution-modeling/scripts/tune_maxnet.R +344 -0
  193. package/templates/SKILL_TEMPLATE.md +225 -0
  194. package/templates/checklists/data-submission-checklist.md +38 -0
  195. package/templates/checklists/post-analysis-checklist.md +55 -0
  196. package/templates/checklists/pre-analysis-checklist.md +31 -0
  197. package/templates/prompts/debug-skill.md +47 -0
  198. package/templates/prompts/invoke-skill.md +34 -0
  199. package/templates/prompts/invoke-workflow.md +45 -0
  200. package/templates/reports/technical-report-template.md +80 -0
  201. package/templates/scripts/logger_setup.R +79 -0
  202. package/templates/scripts/logger_setup.py +119 -0
  203. package/templates/scripts/params_loader.R +28 -0
  204. package/templates/scripts/params_loader.py +38 -0
  205. package/workflows/analyze-community-structure/WORKFLOW.md +72 -0
  206. package/workflows/analyze-environmental-change/WORKFLOW.md +73 -0
  207. package/workflows/assess-ecological-impact/WORKFLOW.md +75 -0
  208. package/workflows/assess-ecosystem-services/WORKFLOW.md +68 -0
  209. package/workflows/assess-landscape-connectivity/WORKFLOW.md +84 -0
  210. package/workflows/build-fire-risk-map/WORKFLOW.md +79 -0
  211. package/workflows/produce-technical-report/WORKFLOW.md +113 -0
  212. package/workflows/run-camera-trap-occupancy/WORKFLOW.md +87 -0
  213. package/workflows/run-conservation-prioritization/WORKFLOW.md +89 -0
  214. package/workflows/run-multispecies-screening/WORKFLOW.md +197 -0
  215. package/workflows/run-occupancy-analysis/WORKFLOW.md +74 -0
  216. package/workflows/run-population-viability/WORKFLOW.md +90 -0
  217. package/workflows/run-sdm-study/WORKFLOW.md +99 -0
@@ -0,0 +1,193 @@
1
+ # GBIF Data Citation Guide
2
+
3
+ How to download, cite, and document GBIF occurrence data correctly for scientific publications.
4
+
5
+ ---
6
+
7
+ ## 1. Why Citation Method Matters
8
+
9
+ GBIF provides two download mechanisms with fundamentally different citability:
10
+
11
+ | Method | DOI generated | Citable for publication | Reproducible | Use case |
12
+ |---|---|---|---|---|
13
+ | `occ_search()` / `pygbif.occurrences.search()` | **No** | No — not recommended for peer review | No (results may change) | Exploration, pilot analysis, dashboards |
14
+ | `occ_download()` / `pygbif.occurrences.download()` | **Yes** | Yes — required for publication | Yes (snapshot frozen) | All published analyses |
15
+
16
+ **Rule:** If your analysis will appear in a publication or technical report, always
17
+ use the download API (`occ_download` / `occurrences.download`) to obtain a citable DOI.
18
+
19
+ ---
20
+
21
+ ## 2. How to Cite GBIF Data Correctly
22
+
23
+ ### Standard GBIF citation format (APA-style)
24
+
25
+ ```
26
+ GBIF.org (YEAR) GBIF Occurrence Download. https://doi.org/10.15468/dl.XXXXXXX
27
+ Accessed on YYYY-MM-DD.
28
+ ```
29
+
30
+ **Example:**
31
+
32
+ ```
33
+ GBIF.org (2024) GBIF Occurrence Download.
34
+ https://doi.org/10.15468/dl.abc123
35
+ Accessed on 2024-03-15.
36
+ ```
37
+
38
+ ### Fields required in every citation
39
+
40
+ | Field | Example | Notes |
41
+ |---|---|---|
42
+ | Portal name | `GBIF.org` | Always "GBIF.org", not the institution name |
43
+ | Year of download | `(2024)` | Year the download was created |
44
+ | Record type | `GBIF Occurrence Download` | Fixed string |
45
+ | DOI | `https://doi.org/10.15468/dl.XXXXXXX` | Full URL, not just the suffix |
46
+ | Access date | `Accessed on 2024-03-15` | ISO 8601 date format |
47
+
48
+ ---
49
+
50
+ ## 3. Retrieving the DOI from a Download in R
51
+
52
+ ```r
53
+ suppressPackageStartupMessages(library(rgbif))
54
+
55
+ # Initiate download (runs asynchronously on GBIF servers)
56
+ download_key <- occ_download(
57
+ pred("taxonKey", 2435098), # GBIF taxon key for your species
58
+ pred("hasCoordinate", TRUE),
59
+ pred("occurrenceStatus", "PRESENT"),
60
+ pred_in("basisOfRecord", c("HUMAN_OBSERVATION",
61
+ "MACHINE_OBSERVATION",
62
+ "PRESERVED_SPECIMEN")),
63
+ pred_lt("coordinateUncertaintyInMeters", 10000),
64
+ format = "SIMPLE_CSV"
65
+ )
66
+
67
+ # Wait for completion (polls every 5 seconds by default; change with status_ping)
68
+ occ_download_wait(download_key)
69
+
70
+ # Retrieve metadata (includes DOI)
71
+ meta <- occ_download_meta(download_key)
72
+ doi <- meta$doi # e.g., "10.15468/dl.abc123"
73
+ cat("Download DOI:", doi, "\n")
74
+
75
+ # Save DOI to metadata file for citation
76
+ writeLines(
77
+ c(
78
+ paste("GBIF download key:", download_key),
79
+ paste("DOI:", doi),
80
+ paste0("Citation: GBIF.org (", format(Sys.Date(), "%Y"), ") GBIF Occurrence Download. ",
81
+ "https://doi.org/", doi,
82
+ " Accessed on ", format(Sys.Date(), "%Y-%m-%d")),
83
+ paste("Download date:", Sys.Date()),
84
+ paste("Species:", "Panthera onca") # replace with your species
85
+ ),
86
+ "download_metadata.txt"
87
+ )
88
+
89
+ # Import the data
90
+ occ_data <- occ_download_get(download_key) |> occ_download_import()
91
+ ```
92
+
93
+ ---
94
+
95
+ ## 4. Retrieving the DOI from a Download in Python
96
+
97
+ ```python
98
+ import pygbif.occurrences as occ
99
+ import time
100
+ from pathlib import Path
101
+ from datetime import date
102
+
103
+ # Initiate download
104
+ download_key = occ.download([ # all predicates go in ONE list (the `queries` argument)
105
+ "taxonKey = 2435098", # replace with actual taxon key
106
+ "hasCoordinate = TRUE",
107
+ "occurrenceStatus = PRESENT",
108
+ "basisOfRecord in HUMAN_OBSERVATION,MACHINE_OBSERVATION,PRESERVED_SPECIMEN",
109
+ "coordinateUncertaintyInMeters <= 10000"
110
+ ])
111
+
112
+ # Poll until complete
113
+ while True:
114
+ status = occ.download_meta(download_key[0])["status"]
115
+ print(f"Status: {status}")
116
+ if status == "SUCCEEDED":
117
+ break
118
+ elif status in ("FAILED", "KILLED", "CANCELLED"):
119
+ raise RuntimeError(f"GBIF download ended with status {status}")
120
+ time.sleep(30)
121
+
122
+ # Get DOI from metadata
123
+ meta = occ.download_meta(download_key[0])
124
+ doi = meta.get("doi", "")
125
+ print(f"Download DOI: {doi}")
126
+
127
+ # Save metadata
128
+ output_dir = Path("output/gbif")
129
+ output_dir.mkdir(parents=True, exist_ok=True)
130
+
131
+ with open(output_dir / "download_metadata.txt", "w") as f:
132
+ f.write(f"GBIF download key: {download_key[0]}\n")
133
+ f.write(f"DOI: {doi}\n")
134
+ f.write(f"Citation: GBIF.org ({date.today().year}) GBIF Occurrence Download. "
135
+ f"https://doi.org/{doi} Accessed on {date.today().isoformat()}\n")
136
+ f.write(f"Download date: {date.today().isoformat()}\n")
137
+ ```
138
+
139
+ ---
140
+
141
+ ## 5. Registering the DOI in data_provenance.md
142
+
143
+ Every project using GBIF data must have a `data_provenance.md` at the project root.
144
+ Add an entry like:
145
+
146
+ ```markdown
147
+ ## GBIF Occurrence Data
148
+
149
+ | Species | GBIF Taxon Key | Download Key | DOI | Download Date | Filters Applied |
150
+ |---|---|---|---|---|---|
151
+ | *Panthera onca* | 2435098 | 0001234-240101 | [10.15468/dl.abc123](https://doi.org/10.15468/dl.abc123) | 2024-03-15 | hasCoordinate, PRESENT, uncertainty < 10 km |
152
+ | *Chrysocyon brachyurus* | 2441050 | 0001235-240101 | [10.15468/dl.def456](https://doi.org/10.15468/dl.def456) | 2024-03-15 | hasCoordinate, PRESENT, uncertainty < 10 km |
153
+ ```
154
+
155
+ ---
156
+
157
+ ## 6. occ_search vs occ_download — Decision Guide
158
+
159
+ | Situation | Use |
160
+ |---|---|
161
+ | Exploring data availability, checking record counts | `occ_search` (no DOI needed) |
162
+ | Pilot/exploratory analysis not for publication | `occ_search` acceptable |
163
+ | Analysis to be included in a paper, report, or thesis | **`occ_download` required** |
164
+ | Dataset with > 100,000 records | **`occ_download` required** (occ_search limited to 100k) |
165
+ | Reproducible analysis shared with collaborators | **`occ_download` required** |
166
+ | Training an SDM for conservation planning | **`occ_download` required** |
167
+
168
+ ---
169
+
170
+ ## 7. Common Pitfalls
171
+
172
+ - **Citing the GBIF portal URL instead of the DOI:** `https://www.gbif.org/occurrence/search?...` is
173
+ not citable. Always use the download DOI.
174
+ - **Using `occ_search` for final analysis:** results from `occ_search` are not frozen;
175
+ re-running the same query months later may return different records. Only `occ_download`
176
+ creates a reproducible, citable snapshot.
177
+ - **Forgetting to record the download date:** required even when DOI is present.
178
+ - **Not saving `download_metadata.txt`:** always save alongside the occurrence CSV.
179
+ - **Using taxon name instead of taxon key:** names can be ambiguous. Use the GBIF
180
+ backbone taxon key (in rgbif: `name_backbone(name = "...")$usageKey`) for unambiguous queries.
181
+ - **Not filtering `coordinateUncertaintyInMeters`:** records with large uncertainty
182
+ (> 10 km) should be excluded or handled explicitly.
183
+
184
+ ---
185
+
186
+ ## 8. References
187
+
188
+ | Resource | URL |
189
+ |---|---|
190
+ | GBIF citation guidelines | https://www.gbif.org/citation-guidelines |
191
+ | rgbif R package | https://docs.ropensci.org/rgbif/ |
192
+ | pygbif Python package | https://pygbif.readthedocs.io/ |
193
+ | GBIF DOI minting policy | https://www.gbif.org/faq?question=what-doi-does-gbif-assign-to-downloaded-data |
@@ -0,0 +1,83 @@
1
+ # QA Checklist — Ecological Data Foundation
2
+
3
+ Use this checklist for every new dataset before proceeding to analysis.
4
+
5
+ ## 1. File and Ingest
6
+
7
+ - [ ] Source file preserved in `data/raw/` (never overwrite)
8
+ - [ ] File encoding confirmed (UTF-8 preferred)
9
+ - [ ] Delimiter and quoting confirmed
10
+ - [ ] Row count matches expected (check for truncation)
11
+ - [ ] Column names documented
12
+
13
+ ## 2. Schema and Types
14
+
15
+ - [ ] All fields mapped to Darwin Core (or equivalent standard)
16
+ - [ ] Dates parsed as ISO-8601 strings
17
+ - [ ] Coordinates as decimal degrees (float)
18
+ - [ ] Country codes as ISO 3166-1 alpha-2
19
+ - [ ] Categorical fields enumerated and valid entries listed
20
+
21
+ ## 3. Duplicates
22
+
23
+ - [ ] Exact duplicates identified and counted
24
+ - [ ] Spatial-temporal near-duplicates checked (same species, same coords ± X km, same date ± Y days)
25
+ - [ ] Resolution strategy documented and applied
26
+
27
+ ## 4. Coordinates
28
+
29
+ - [ ] Latitude in range [-90, 90]
30
+ - [ ] Longitude in range [-180, 180]
31
+ - [ ] Zero coordinates (0, 0) flagged
32
+ - [ ] Country centroid coordinates flagged
33
+ - [ ] Capital city coordinates flagged
34
+ - [ ] Coordinates fall within stated country/region polygon
35
+ - [ ] Coordinate uncertainty recorded where available
36
+
37
+ ## 5. Taxonomy
38
+
39
+ - [ ] Species names checked against reference backbone (GBIF / Catalogue of Life)
40
+ - [ ] Synonyms resolved to accepted name
41
+ - [ ] Misspellings corrected (with original preserved)
42
+ - [ ] Higher-rank-only identifications flagged
43
+ - [ ] Hybrids and cultivars handled according to study scope
44
+
45
+ ## 6. Temporal
46
+
47
+ - [ ] No dates in the future
48
+ - [ ] No dates before plausible survey era for the taxon
49
+ - [ ] Temporal precision meets study requirements
50
+ - [ ] Records with year-only precision flagged if day-level is needed
51
+
52
+ ## 7. Attribute Ranges
53
+
54
+ - [ ] Numeric fields checked for biologically impossible values
55
+ - [ ] Missing value rate per field computed and documented
56
+ - [ ] Fields exceeding missing value threshold (default 20%) flagged for decision
57
+
58
+ ## 8. Outputs
59
+
60
+ - [ ] `data_clean.csv` written with `QA_status` column
61
+ - [ ] `flagged_records.csv` written with reason codes
62
+ - [ ] `qa_report.md` summarises issue counts and resolutions
63
+ - [ ] `schema.yaml` documents all field definitions
64
+ - [ ] `metadata.xml` (EML or Dublin Core) completed
65
+
66
+ ## QA Status Codes
67
+
68
+ | Code | Meaning |
69
+ |------|---------|
70
+ | `OK` | Record passed all checks |
71
+ | `COORD_CENTROID` | Coordinates at country/institution centroid |
72
+ | `COORD_ZERO` | Coordinates are (0, 0) |
73
+ | `COORD_OUT_OF_RANGE` | lat or lon outside valid bounds |
74
+ | `COORD_OUTSIDE_COUNTRY` | Point falls outside declared country polygon |
75
+ | `DATE_FUTURE` | Event date is in the future |
76
+ | `DATE_UNLIKELY` | Event date before plausible survey era |
77
+ | `DUPLICATE_EXACT` | Identical to another record |
78
+ | `DUPLICATE_SPATIOTEMPORAL` | Near-duplicate (spatial-temporal proximity) |
79
+ | `TAXON_SYNONYM` | Name is a synonym; resolved to accepted name |
80
+ | `TAXON_MISSPELLING` | Misspelling detected and corrected |
81
+ | `TAXON_HIGH_RANK` | Identified only to genus or higher |
82
+ | `MISSING_COORDS` | No coordinate information |
83
+ | `REMOVED` | Record excluded from clean dataset |
@@ -0,0 +1,230 @@
1
+ # ecological-agent-skills / Copyright (C) 2026 Francisco Diego Barros Barata
2
+ # SPDX-License-Identifier: GPL-3.0-or-later
3
+
4
+ # Usage: Rscript clean_occurrences.R <raw_occurrences.csv> <output_dir> [country_code]  (country_code is accepted but currently ignored by this script)
5
+
6
+ # ── Inline logger ─────────────────────────────────────────────────────────────
7
# Name of the skill this script belongs to; reported in the start-up banner.
SKILL_NAME <- "ecological-data-foundation"

# Current wall-clock time rendered as "[YYYY-MM-DD HH:MM:SS]".
.log_ts <- function() {
  format(Sys.time(), "[%Y-%m-%d %H:%M:%S]")
}

# Build a logger for one severity tag. The returned function forwards all of
# its arguments to sprintf() (first argument = format string) and emits the
# result through message(), prefixed by the timestamp and the tag.
.make_logger <- function(tag) {
  force(tag)
  function(...) {
    message(.log_ts(), tag, sprintf(...))
  }
}

# Severity-specific loggers; all three share the sprintf()-style interface.
log_info <- .make_logger(" [INFO] ")
log_warn <- .make_logger(" [WARN] ")
log_error <- .make_logger(" [ERROR] ")

# Announce the start of pipeline step `n` with description `d`.
log_step <- function(n, d) {
  log_info("-- STEP %d: %s", n, d)
}

# Record a parameter choice: name `v`, chosen value `val`, rationale `why`.
log_decision <- function(v, val, why) {
  log_info("DECISION | %s = %s | %s", v, val, why)
}

# Make sure a logs/ directory exists under the working directory. This script
# does not write into it itself.
dir.create("logs", showWarnings = FALSE, recursive = TRUE)
15
+
16
+ # Standard occurrence cleaning pipeline
17
+ # Usage: Rscript clean_occurrences.R <input_csv> <output_dir>
18
+ # Requires: dplyr, readr, CoordinateCleaner, janitor (taxize is not loaded or used by this script)
19
+
20
+ suppressPackageStartupMessages({
21
+ library(dplyr)
22
+ library(readr)
23
+ library(CoordinateCleaner)
24
+ library(janitor)
25
+ })
26
+
27
# Resolve the script inputs from the command line, falling back to the
# project-relative defaults when an argument is not supplied:
#   args[1]  path to the raw occurrence CSV
#   args[2]  directory that receives the cleaned outputs
args <- commandArgs(trailingOnly = TRUE)

# These are scalar choices, so plain if/else is used rather than the
# vectorised ifelse().
input_file <- if (length(args) >= 1) args[1] else "data/raw/occurrences.csv"
output_dir <- if (length(args) >= 2) args[2] else "data/processed"

# Only the first two positional arguments are consumed. Report anything extra
# instead of dropping it silently.
if (length(args) > 2) {
  log_warn(
    "Argumentos extras ignorados: %s",
    paste(args[-(1:2)], collapse = ", ")
  )
}

log_info("Script: clean_occurrences.R | Skill: %s", SKILL_NAME)
log_info("Input file : %s", input_file)
log_info("Output dir : %s", output_dir)

# ── Input precondition check ──────────────────────────────────────────────────
# Abort before doing any work when the input CSV does not exist; the log entry
# points the operator at the upstream step expected to have produced it.
if (!file.exists(input_file)) {
  log_error(
    "Input nao encontrado: %s\nCausa provavel: arquivo nao gerado pelo passo anterior.\nVerifique a saida de: ecological-data-foundation (download_from_gbif)\nSkill anterior: ecological-data-foundation",
    input_file
  )
  stop("Missing: ", input_file)
}

# Create the output directory (and any missing parents); a directory that
# already exists is not an error.
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

log_decision("input_file", input_file, "caminho passado como args[1] ou padrao")
log_decision("output_dir", output_dir, "caminho passado como args[2] ou padrao")
48
+
49
+ # ── 1. Ingest ──────────────────────────────────────────────────────────────
50
log_step(1, "Ingerir dados brutos de ocorrencias")

# Read the raw CSV and normalise its column names with janitor::clean_names().
# `raw` is assigned in the script's top-level environment because tryCatch()
# evaluates its expression in the calling frame. Any read failure is logged
# with troubleshooting hints and then re-raised, which stops the script.
tryCatch(
  expr = {
    raw <- clean_names(read_csv(input_file, show_col_types = FALSE))
    log_info(
      "Registros brutos lidos: %d | Colunas: %d",
      nrow(raw), ncol(raw)
    )
  },
  error = \(err) {
    log_error(
      "Falha ao ler CSV de entrada: %s\nCausa provavel: arquivo corrompido ou nao e CSV valido.\nVerifique: %s\nSkill anterior: ecological-data-foundation",
      conditionMessage(err), input_file
    )
    stop(err)
  }
)

# An empty table is only reported here; processing continues.
if (nrow(raw) < 1) {
  log_warn("Arquivo de entrada nao contem registros: %s", input_file)
}
66
+
67
+ # ── 2. Require minimum columns ─────────────────────────────────────────────
68
log_step(2, "Verificar colunas obrigatorias")

# Columns (named as they appear after clean_names()) that the rest of the
# pipeline reads unconditionally.
required_cols <- c("decimal_latitude", "decimal_longitude", "species")
missing_req <- setdiff(required_cols, names(raw))

# Fail fast with a single, actionable log entry. The check is not wrapped in a
# tryCatch(): a handler that logs and re-raises would only report this same
# failure a second time.
if (length(missing_req) > 0) {
  log_error(
    "Colunas obrigatorias ausentes: %s\nCausa provavel: CSV gerado por fonte diferente ou com nomes de colunas alterados.\nVerifique o schema do arquivo: %s\nSkill anterior: ecological-data-foundation",
    paste(missing_req, collapse = ", "), input_file
  )
  stop("Missing required columns: ", paste(missing_req, collapse = ", "))
}

log_info(
  "Todas as colunas obrigatorias presentes: %s",
  paste(required_cols, collapse = ", ")
)
84
+
85
+ # ── 3. Remove records with missing coordinates ─────────────────────────────
86
log_step(3, "Remover registros sem coordenadas e converter para numerico")

# Drop every record that lacks a usable coordinate pair.
#   n_before_na   row count before this step (the true raw record count)
#   n_removed_na  rows dropped by this step
tryCatch({
  n_before_na <- nrow(raw)
  raw <- raw |>
    # Coerce BEFORE filtering: a coordinate stored as unparseable text only
    # turns into NA during as.numeric(), so filtering first would let such
    # rows through with NA coordinates.
    mutate(
      decimal_latitude = as.numeric(decimal_latitude),
      decimal_longitude = as.numeric(decimal_longitude)
    ) |>
    filter(!is.na(decimal_latitude), !is.na(decimal_longitude))
  n_removed_na <- n_before_na - nrow(raw)
  if (n_removed_na > 0) {
    log_warn("Registros removidos por coordenadas ausentes: %d", n_removed_na)
  } else {
    log_info("Nenhum registro removido por coordenadas ausentes.")
  }
}, error = function(e) {
  log_error(
    "Falha ao filtrar coordenadas ausentes: %s\nCausa provavel: tipos de coluna inesperados no CSV.\nVerifique: %s\nSkill anterior: ecological-data-foundation",
    conditionMessage(e), input_file
  )
  stop(e)
})

# Advisory only: the pipeline continues even with very few records.
if (nrow(raw) < 30) {
  log_warn(
    "Poucos registros apos remocao de NAs (%d). Minimo recomendado para SDM: 30.",
    nrow(raw)
  )
}
112
+
113
+ # ── 4. Coordinate cleaning ─────────────────────────────────────────────────
114
log_step(4, "Limpeza de coordenadas com CoordinateCleaner")

# Single source of truth for the test suite: the same vector feeds both the
# logged decision and the clean_coordinates() call, so they cannot drift apart.
cc_tests <- c(
  "capitals", "centroids", "equal", "gbif",
  "institutions", "validity", "zeros"
)
log_decision(
  "cc_tests",
  paste(cc_tests, collapse = ","),
  "conjunto padrao de testes para detectar registros suspeitos"
)

# Run the tests and split `raw` into:
#   clean    rows that passed every test (QA_status = "OK")
#   flagged  all other rows (QA_status = failed flag columns joined by "|")
# `flags` is kept because the QA report step reads its dot-prefixed columns.
tryCatch({
  flags <- clean_coordinates(
    x = raw,
    lon = "decimal_longitude",
    lat = "decimal_latitude",
    species = "species",
    tests = cc_tests,
    verbose = FALSE
  )

  # Treat anything that is not an explicit TRUE as "not passed"; indexing with
  # an NA would otherwise inject all-NA rows into `clean`.
  passed <- flags$.summary %in% TRUE

  # Per-test flag columns are the dot-prefixed ones. `.summary` is the overall
  # verdict rather than a reason, so it is left out of the reason codes.
  test_cols <- setdiff(grep("^\\.", names(flags), value = TRUE), ".summary")
  flag_matrix <- as.matrix(flags[!passed, test_cols, drop = FALSE])
  failed <- !is.na(flag_matrix) & !flag_matrix
  reasons <- vapply(
    seq_len(nrow(failed)),
    function(i) paste(test_cols[failed[i, ]], collapse = "|"),
    character(1)
  )

  clean <- raw[passed, ] |> mutate(QA_status = "OK")
  flagged <- raw[!passed, ] |> mutate(QA_status = reasons)

  log_info(
    "Registros limpos: %d | Registros sinalizados: %d",
    nrow(clean), nrow(flagged)
  )
  if (nrow(flagged) > 0) {
    log_warn(
      "%d registros sinalizados pelo CoordinateCleaner (%.1f%% do total).",
      nrow(flagged), 100 * nrow(flagged) / nrow(raw)
    )
  }
}, error = function(e) {
  log_error(
    "Falha na limpeza de coordenadas (CoordinateCleaner): %s\nCausa provavel: dados malformados ou pacote CoordinateCleaner nao instalado.\nVerifique: install.packages('CoordinateCleaner')\nSkill anterior: ecological-data-foundation",
    conditionMessage(e)
  )
  stop(e)
})
150
+
151
+ # ── 5. Remove exact duplicates ─────────────────────────────────────────────
152
log_step(5, "Remover duplicatas exatas")

# Collapse records that share species, coordinates and (when available) event
# date, keeping the first occurrence of each combination.
#   n_dup  number of rows removed; read later by the QA report step
tryCatch({
  n_before <- nrow(clean)

  # `event_date` is not among the minimum columns validated earlier in the
  # script, so tolerate its absence: deduplicate on the key columns that exist
  # and say so, rather than aborting on an input that already passed validation.
  dedup_cols <- c(
    "species", "decimal_latitude", "decimal_longitude", "event_date"
  )
  absent_cols <- setdiff(dedup_cols, names(clean))
  if (length(absent_cols) > 0) {
    log_warn(
      "Colunas ausentes para deduplicacao (ignoradas): %s",
      paste(absent_cols, collapse = ", ")
    )
    dedup_cols <- intersect(dedup_cols, names(clean))
  }

  clean <- clean |>
    distinct(across(all_of(dedup_cols)), .keep_all = TRUE)
  n_dup <- n_before - nrow(clean)

  # Log the key that was actually applied.
  log_decision(
    "dedup_cols",
    paste(dedup_cols, collapse = ","),
    "combinacao padrao para identificar duplicatas espaciotemporais"
  )
  if (n_dup > 0) {
    log_warn("Duplicatas exatas removidas: %d", n_dup)
  } else {
    log_info("Nenhuma duplicata exata encontrada.")
  }
}, error = function(e) {
  log_error(
    "Falha ao remover duplicatas: %s\nCausa provavel: coluna event_date ausente ou mal formatada.\nVerifique o schema do CSV.\nSkill anterior: ecological-data-foundation",
    conditionMessage(e)
  )
  stop(e)
})

# Advisory only: warn when too few trustworthy records remain.
if (nrow(clean) < 30) {
  log_warn(
    "Apenas %d registros limpos apos todas as filtragens. SDMs requerem >= 30 registros confiaveis.",
    nrow(clean)
  )
}
182
+
183
+ # ── 6. Write outputs ───────────────────────────────────────────────────────
184
log_step(6, "Escrever arquivos de saida")

# Persist the two record tables under `output_dir`. Both files are written
# before either write is reported; any failure is logged and re-raised.
tryCatch(
  expr = {
    clean_path <- file.path(output_dir, "data_clean.csv")
    flagged_path <- file.path(output_dir, "flagged_records.csv")

    write_csv(clean, clean_path)
    write_csv(flagged, flagged_path)

    log_info("Gravado: %s", clean_path)
    log_info("Gravado: %s", flagged_path)
  },
  error = \(err) {
    log_error(
      "Falha ao gravar arquivos de saida: %s\nCausa provavel: sem permissao de escrita em '%s'.\nVerifique permissoes do diretorio.\nSkill anterior: ecological-data-foundation",
      conditionMessage(err), output_dir
    )
    stop(err)
  }
)
197
+
198
+ # ── 7. QA report ──────────────────────────────────────────────────────────
199
log_step(7, "Gerar relatorio de QA")

# Write a Markdown summary of the run to <output_dir>/qa_report.md.
# The counts reconcile:
#   raw = removed for missing coordinates + flagged + duplicates + clean
tryCatch({
  report <- c(
    "# QA Report — Occurrence Cleaning",
    "",
    paste("- Input file:", input_file),
    # `raw` was overwritten by the missing-coordinate filter and still contains
    # the flagged rows, so nrow(raw) + nrow(flagged) is not the raw count. The
    # count captured before any filtering is the real one.
    paste("- Raw records:", n_before_na),
    paste("- Records removed for missing coordinates:", n_removed_na),
    paste("- Exact duplicates removed:", n_dup),
    paste("- Records flagged by CoordinateCleaner:", nrow(flagged)),
    paste("- Clean records written:", nrow(clean)),
    "",
    "## Flag Counts",
    ""
  )

  # One bullet per dot-prefixed flag column that has at least one failure.
  # sprintf() returns a zero-length vector when there is nothing to report, so
  # no empty bullet is emitted.
  flag_cols <- grep("^\\.", names(flags), value = TRUE)
  n_fail <- vapply(
    flag_cols,
    function(fc) sum(!flags[[fc]], na.rm = TRUE),
    integer(1)
  )
  has_fail <- n_fail > 0
  report <- c(
    report,
    sprintf("- `%s`: %d", flag_cols[has_fail], n_fail[has_fail])
  )

  writeLines(report, file.path(output_dir, "qa_report.md"))
  log_info("Gravado: %s", file.path(output_dir, "qa_report.md"))
}, error = function(e) {
  log_error(
    "Falha ao gerar relatorio QA: %s\nCausa provavel: problema ao escrever no diretorio '%s'.\nSkill anterior: ecological-data-foundation",
    conditionMessage(e), output_dir
  )
  stop(e)
})

log_info(
  "Concluido. Registros limpos: %d | Sinalizados: %d",
  nrow(clean), nrow(flagged)
)