npm - ecological-agent-skills - Versions diffs - 3.1.0 - Mend

ecological-agent-skills 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (217) hide show

package/skills/model-validation-and-uncertainty/scripts/validate_model.py ADDED Viewed

@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+# ecological-agent-skills / Copyright (C) 2026 Francisco Diego Barros Barata
+# SPDX-License-Identifier: GPL-3.0-or-later
+"""
+validate_model.py
+Compute AUC, TSS, calibration for binary predictions.
+Usage: python validate_model.py <predictions_csv> <output_dir>
+Requires: pandas, numpy, scikit-learn, matplotlib
+"""
+import logging
+import sys
+from datetime import datetime
+from pathlib import Path
+# Skill identifier: used as the logger name, in the log-line prefix and in the log file name.
+SKILL_NAME = "model-validation-and-uncertainty"
+# Log files go to ./logs, resolved against the current working directory.
+_LOG_DIR   = Path("logs")
+# Created at import time so the FileHandler below can open its file.
+_LOG_DIR.mkdir(parents=True, exist_ok=True)
+# One log file per run, suffixed with the local start time (second resolution).
+_log_file  = _LOG_DIR / f"skill_{SKILL_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
+# Root logger: INFO and above, written to both stdout and the per-run log file.
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] [%(levelname)s] [" + SKILL_NAME + "] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler(_log_file, encoding="utf-8"),
+    ],
+)
+# Module-level logger used by the helpers and by main().
+logger = logging.getLogger(SKILL_NAME)
+def log_step(n: int, desc: str) -> None:
+    """Log the start of pipeline step number *n* with its description at INFO level."""
+    logger.info("-- STEP %d: %s", n, desc)
+def log_decision(var: str, val, why: str) -> None:
+    """Log an analysis decision (name, chosen value, rationale) at INFO level."""
+    logger.info("DECISION | %s = %s | %s", var, val, why)
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.metrics import roc_auc_score, roc_curve
+def compute_tss(y_true, y_pred_prob, n_thresholds=101):
+    """Return the maximum True Skill Statistic and the threshold achieving it.
+
+    TSS = sensitivity + specificity - 1 is evaluated at ``n_thresholds``
+    evenly spaced cut-offs in [0, 1]; a prediction counts as positive when it
+    is ``>=`` the cut-off.
+
+    Args:
+        y_true: Array-like of observed labels. Entries equal to 1 count as
+            positives, entries equal to 0 as negatives; other values are
+            ignored.
+        y_pred_prob: Array-like of predicted scores, same length as ``y_true``.
+        n_thresholds: Number of evenly spaced thresholds to scan. The default
+            of 101 gives steps of 0.01.
+
+    Returns:
+        Tuple ``(best_tss, best_threshold)``. Among equal maxima the lowest
+        threshold wins. When a class is absent its rate is taken as 0, so the
+        returned TSS cannot exceed 0. Empty input returns ``(-1, 0)``.
+    """
+    # Coerce once so plain lists and pandas Series work as well as arrays.
+    y_true = np.asarray(y_true)
+    y_pred_prob = np.asarray(y_pred_prob)
+    # Class membership does not depend on the threshold: hoist it out of the loop.
+    is_pos = y_true == 1
+    is_neg = y_true == 0
+    n_pos = is_pos.sum()
+    n_neg = is_neg.sum()
+    best_tss, best_thresh = -1, 0
+    for th in np.linspace(0, 1, n_thresholds):
+        called_pos = y_pred_prob >= th
+        tp = (called_pos & is_pos).sum()
+        tn = (~called_pos & is_neg).sum()
+        sens = tp / n_pos if n_pos > 0 else 0
+        spec = tn / n_neg if n_neg > 0 else 0
+        tss = sens + spec - 1
+        # Strict ">" keeps the first (lowest) threshold among equal maxima.
+        if tss > best_tss:
+            best_tss, best_thresh = tss, th
+    return best_tss, best_thresh
+def calibration_plot(y_true, y_pred, output_path, n_bins=10):
+    """Save a calibration plot: observed rate against mean predicted value per bin.
+
+    Predictions are grouped into ``n_bins`` equal-width bins on [0, 1]. Bins
+    that contain no predictions are skipped. Each point is coloured by the
+    number of records in its bin.
+
+    Args:
+        y_true: Array-like of observed outcomes, same length as ``y_pred``.
+        y_pred: Array-like of predicted values.
+        output_path: Destination passed to ``Figure.savefig``.
+        n_bins: Number of equal-width bins (default 10).
+    """
+    # Coerce once so boolean-mask indexing below also works for plain lists.
+    y_true = np.asarray(y_true)
+    y_pred = np.asarray(y_pred)
+    bins = np.linspace(0, 1, n_bins + 1)
+    # digitize() is 1-based and places 1.0 (and anything above) past the last
+    # edge; clipping folds those into the last bin and values below 0 into the first.
+    bin_ids = np.clip(np.digitize(y_pred, bins) - 1, 0, n_bins - 1)
+    mean_pred, obs_rate, counts = [], [], []
+    for b in range(n_bins):
+        mask = bin_ids == b
+        n_in_bin = mask.sum()
+        if n_in_bin > 0:
+            mean_pred.append(y_pred[mask].mean())
+            obs_rate.append(y_true[mask].mean())
+            counts.append(n_in_bin)
+    fig, ax = plt.subplots(figsize=(6, 5))
+    try:
+        ax.plot([0, 1], [0, 1], "k--", label="Perfect calibration")
+        sc = ax.scatter(mean_pred, obs_rate, c=counts, cmap="Blues", s=80, edgecolor="navy", zorder=3)
+        ax.plot(mean_pred, obs_rate, color="steelblue")
+        fig.colorbar(sc, ax=ax, label="n per bin")
+        ax.set_xlabel("Mean predicted probability")
+        ax.set_ylabel("Observed rate")
+        ax.set_title("Calibration Plot")
+        ax.legend()
+        fig.tight_layout()
+        fig.savefig(output_path, dpi=150)
+    finally:
+        # Always release the figure, even when drawing or saving fails.
+        plt.close(fig)
+def roc_plot(y_true, y_pred, auc_val, output_path):
+    """Draw the ROC curve, show ``auc_val`` in the legend and save to ``output_path``."""
+    false_pos_rate, true_pos_rate, _unused_thresholds = roc_curve(y_true, y_pred)
+    fig, ax = plt.subplots(figsize=(5, 5))
+    ax.plot(false_pos_rate, true_pos_rate, label=f"AUC = {auc_val:.3f}", color="steelblue")
+    # Dashed reference diagonal from (0, 0) to (1, 1).
+    ax.plot([0, 1], [0, 1], "k--")
+    ax.set_xlabel("False Positive Rate")
+    ax.set_ylabel("True Positive Rate")
+    ax.set_title("ROC Curve")
+    ax.legend()
+    fig.tight_layout()
+    fig.savefig(output_path, dpi=150)
+    plt.close(fig)
+def main():
+    """Validate binary predictions from a CSV and write metrics and plots.
+
+    Command line: ``validate_model.py [predictions_csv] [output_dir]``; the
+    defaults are ``outputs/predictions.csv`` and ``outputs/validation``.
+    The CSV must contain the columns ``observed`` and ``predicted``.
+
+    Writes ``performance_metrics.csv``, ``calibration_plot.png`` and
+    ``roc_curve.png`` into the output directory.
+
+    Exits with status 1 when the input file or a required column is missing.
+    Any other failure is logged and the original exception is re-raised.
+    """
+    pred_file  = sys.argv[1] if len(sys.argv) > 1 else "outputs/predictions.csv"
+    output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("outputs/validation")
+    log_step(1, "Validate inputs")
+    if not Path(pred_file).exists():
+        logger.error(
+            "Predictions file not found: %s\n"
+            "Causa provavel: caminho incorreto ou modelo nao gerou predicoes ainda\n"
+            "Verifique: o argumento predictions_csv e que o modelo foi ajustado\n"
+            "Skill anterior: species-distribution-modelling",
+            pred_file
+        )
+        sys.exit(1)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    log_step(2, "Load predictions data")
+    try:
+        dat = pd.read_csv(pred_file)
+    except Exception as e:
+        logger.error(
+            "Unexpected error in load data: %s\n"
+            "Causa provavel: CSV malformado ou permissoes insuficientes\n"
+            "Verifique: encoding e estrutura do arquivo de predicoes\n"
+            "Skill anterior: species-distribution-modelling",
+            e
+        )
+        raise
+    # Every later step needs both columns, so stop early when one is missing.
+    if "observed" not in dat.columns or "predicted" not in dat.columns:
+        logger.error(
+            "Required columns missing. Expected: 'observed', 'predicted'. Found: %s\n"
+            "Causa provavel: cabecalho do CSV nao padronizado\n"
+            "Verifique: que o arquivo tem colunas 'observed' (0/1) e 'predicted' (probabilidade)\n"
+            "Skill anterior: species-distribution-modelling",
+            list(dat.columns)
+        )
+        sys.exit(1)
+    y_true = dat["observed"].values
+    y_pred = dat["predicted"].values
+    logger.info("Loaded %d predictions. Prevalence: %.3f", len(dat), y_true.mean())
+    n_pos = (y_true == 1).sum()
+    n_neg = (y_true == 0).sum()
+    log_decision("evaluation_metrics", "AUC + MaxTSS + calibration plot", "standard triad for binary SDM evaluation")
+    logger.info("Presences: %d | Absences: %d", n_pos, n_neg)
+    # Small classes and out-of-range predictions are only warned about;
+    # the run continues.
+    if n_pos < 10:
+        logger.warning(
+            "Only %d presence records. AUC and TSS estimates will be highly uncertain.", n_pos
+        )
+    if n_neg < 10:
+        logger.warning(
+            "Only %d absence/background records. Consider increasing background sample size.", n_neg
+        )
+    if np.any((y_pred < 0) | (y_pred > 1)):
+        logger.warning("Some predicted values are outside [0, 1]. Check that predictions are probabilities.")
+    log_step(3, "Compute AUC-ROC")
+    try:
+        auc = roc_auc_score(y_true, y_pred)
+        logger.info("AUC-ROC:  %.4f", auc)
+        if auc < 0.7:
+            logger.warning(
+                "AUC = %.4f is below 0.70. Model discrimination is poor. "
+                "Consider revisiting predictors or sampling design.", auc
+            )
+    except Exception as e:
+        logger.error(
+            "Unexpected error in AUC-ROC: %s\n"
+            "Causa provavel: apenas uma classe em 'observed' ou valores NA\n"
+            "Verifique: que 'observed' contem tanto 0 quanto 1 e 'predicted' nao tem NA\n"
+            "Skill anterior: species-distribution-modelling",
+            e
+        )
+        raise
+    log_step(4, "Compute MaxTSS and optimal threshold")
+    log_decision("threshold_method", "MaxTSS", "maximises sensitivity + specificity; robust for SDMs")
+    try:
+        tss, thresh = compute_tss(y_true, y_pred)
+        logger.info("Max TSS:  %.4f  (threshold = %.2f)", tss, thresh)
+        if tss < 0.4:
+            logger.warning(
+                "MaxTSS = %.4f is low. Model may have poor predictive performance.", tss
+            )
+    except Exception as e:
+        logger.error(
+            "Unexpected error in TSS computation: %s\n"
+            "Causa provavel: valores NA ou classe unica em 'observed'\n"
+            "Verifique: que 'observed' contem tanto 0 quanto 1\n"
+            "Skill anterior: species-distribution-modelling",
+            e
+        )
+        raise
+    log_step(5, "Save performance metrics")
+    try:
+        # Round the threshold like the other two metrics: the grid comes from
+        # np.linspace and can carry float noise (e.g. 0.21000000000000002).
+        metrics = pd.DataFrame({"metric": ["AUC-ROC", "MaxTSS", "Threshold_MaxTSS"],
+                                "value":  [round(auc, 4), round(tss, 4), round(float(thresh), 4)]})
+        metrics.to_csv(output_dir / "performance_metrics.csv", index=False)
+        logger.info("Performance metrics saved.")
+    except Exception as e:
+        logger.error(
+            "Unexpected error in save metrics: %s\n"
+            "Causa provavel: diretorio sem permissao de escrita\n"
+            "Verifique: output_dir e permissoes do sistema de arquivos\n"
+            "Skill anterior: model-validation-and-uncertainty (metrics computation)",
+            e
+        )
+        raise
+    log_step(6, "Generate diagnostic plots")
+    try:
+        calibration_plot(y_true, y_pred, output_dir / "calibration_plot.png")
+        logger.info("Calibration plot saved.")
+        roc_plot(y_true, y_pred, auc, output_dir / "roc_curve.png")
+        logger.info("ROC curve saved.")
+    except Exception as e:
+        logger.error(
+            "Unexpected error in diagnostic plots: %s\n"
+            "Causa provavel: dados insuficientes por bin ou backend matplotlib indisponivel\n"
+            "Verifique: distribuicao dos valores preditos e configuracao do matplotlib\n"
+            "Skill anterior: model-validation-and-uncertainty (metrics computation)",
+            e
+        )
+        raise
+    logger.info("Outputs written to: %s", output_dir)
+# Run the validation pipeline only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()

package/skills/model-validation-and-uncertainty/scripts/validate_sdm.R ADDED Viewed

@@ -0,0 +1,162 @@
+# ecological-agent-skills / Copyright (C) 2026 Francisco Diego Barros Barata
+# SPDX-License-Identifier: GPL-3.0-or-later
+# Compute AUC-ROC, MaxTSS and a calibration plot for SDM predictions
+# Usage: Rscript validate_sdm.R <predictions_csv> <output_dir>
+#   (only these two arguments are read; the script takes no <model.rds>, <test_data.csv> or [threshold_method])
+# Requires: dplyr, ggplot2
+#   (the Boyce index is not computed by this script, and PresenceAbsence / ecospat are not loaded)
+# ── Inline logger ─────────────────────────────────────────────────────────────
+# Skill identifier. NOTE(review): not referenced by the rest of this script as shown.
+SKILL_NAME <- "model-validation-and-uncertainty"
+# Timestamp prefix for every log line, formatted as "[YYYY-MM-DD HH:MM:SS]".
+.log_ts  <- function() format(Sys.time(), "[%Y-%m-%d %H:%M:%S]")
+# Level-tagged loggers: arguments go to sprintf(), the line is emitted with message().
+log_info <- function(...) message(.log_ts(), " [INFO]  ", sprintf(...))
+log_warn <- function(...) message(.log_ts(), " [WARN]  ", sprintf(...))
+log_error<- function(...) message(.log_ts(), " [ERROR] ", sprintf(...))
+# Marks the start of numbered pipeline step `n` with description `d`.
+log_step <- function(n, d) log_info("-- STEP %d: %s", n, d)
+# Records an analysis decision: variable `v`, chosen value `val`, rationale `why`.
+log_decision <- function(v, val, why) log_info("DECISION | %s = %s | %s", v, val, why)
+# Creates ./logs. NOTE(review): nothing in this script writes into it; confirm whether a log file was intended.
+dir.create("logs", recursive=TRUE, showWarnings=FALSE)
+suppressPackageStartupMessages({
+  library(dplyr)
+  library(ggplot2)
+})
+# Positional CLI arguments: [1] predictions CSV, [2] output directory (both optional).
+args <- commandArgs(trailingOnly = TRUE)
+pred_file <- if (length(args) >= 1) args[1] else "outputs/predictions.csv"
+output_dir <- if (length(args) >= 2) args[2] else "outputs/validation"
+log_step(1, "Validate inputs")
+predictions_missing <- !file.exists(pred_file)
+if (predictions_missing) {
+  log_error(
+    "Falha em validate inputs: arquivo de predicoes nao encontrado: %s\nCausa provavel: caminho incorreto ou modelo nao gerou predicoes ainda\nVerifique: o argumento predictions_csv e que o modelo foi ajustado\nSkill anterior: species-distribution-modelling",
+    pred_file
+  )
+  stop("Predictions file not found.")
+}
+# Make sure the output directory exists before any result is written.
+dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
+log_step(2, "Load predictions data")
+# Read the predictions CSV; on failure log a diagnostic and re-raise the original error.
+tryCatch({
+  dat <- read.csv(pred_file)
+}, error = function(e) {
+  log_error(
+    "Falha em load data: %s\nCausa provavel: CSV malformado ou permissoes insuficientes\nVerifique: encoding e estrutura do arquivo de predicoes\nSkill anterior: species-distribution-modelling",
+    conditionMessage(e)
+  )
+  stop(e)
+})
+# Every later step uses both columns, so stop early when either is missing.
+if (!all(c("observed", "predicted") %in% names(dat))) {
+  log_error(
+    "Falha em validate columns: colunas obrigatorias ausentes. Esperado: 'observed', 'predicted'. Encontrado: %s\nCausa provavel: cabecalho do CSV nao padronizado\nVerifique: que o arquivo tem colunas 'observed' (0/1) e 'predicted' (probabilidade)\nSkill anterior: species-distribution-modelling",
+    paste(names(dat), collapse = ", ")
+  )
+  stop("Required columns 'observed' and 'predicted' not found.")
+}
+log_info("Loaded %d predictions. Prevalence: %.3f", nrow(dat), mean(dat$observed))
+# Class counts; reused by the AUC step below.
+n_pos <- sum(dat$observed == 1)
+n_neg <- sum(dat$observed == 0)
+log_decision("evaluation_approach", "AUC + MaxTSS + calibration", "standard triad for binary SDM evaluation")
+log_info("Presences: %d | Absences: %d", n_pos, n_neg)
+# Fewer than 10 records in a class only triggers a warning; the run continues.
+if (n_pos < 10) {
+  log_warn("Only %d presence records. AUC and TSS estimates will be highly uncertain with so few presences.", n_pos)
+}
+if (n_neg < 10) {
+  log_warn("Only %d absence/background records. Consider increasing background sample size.", n_neg)
+}
+# Out-of-range predictions are warned about, not rejected; NA entries are ignored by this check.
+if (any(dat$predicted < 0 | dat$predicted > 1, na.rm = TRUE)) {
+  log_warn("Some predicted values are outside [0, 1]. Check that predictions are probabilities.")
+}
+log_step(3, "Compute AUC-ROC")
+tryCatch({
+  # Fail fast with a clear message instead of a cryptic NA/NaN error further down.
+  if (anyNA(dat$observed) || anyNA(dat$predicted)) {
+    stop("NA values found in 'observed' or 'predicted'.")
+  }
+  if (n_pos == 0 || n_neg == 0) {
+    stop("'observed' must contain both presences (1) and absences (0).")
+  }
+  # Rank-based (Mann-Whitney U) AUC. Average ranks give tied scores half
+  # credit, so the result does not depend on the row order of ties.
+  # Only rows labelled 0 or 1 take part, matching n_pos and n_neg.
+  is_labelled <- dat$observed %in% c(0, 1)
+  score_ranks <- rank(dat$predicted[is_labelled], ties.method = "average")
+  pos_ranks   <- score_ranks[dat$observed[is_labelled] == 1]
+  # as.numeric() avoids integer overflow in n_pos * n_neg for large data sets.
+  auc <- (sum(pos_ranks) - n_pos * (n_pos + 1) / 2) / (as.numeric(n_pos) * n_neg)
+  log_info("AUC-ROC: %.3f", auc)
+  if (auc < 0.7) {
+    log_warn("AUC = %.3f is below 0.70. Model discrimination is poor. Consider revisiting predictors or sampling design.", auc)
+  }
+}, error = function(e) {
+  log_error(
+    "Falha em AUC-ROC: %s\nCausa provavel: valores NA em 'observed' ou 'predicted', ou apenas uma classe\nVerifique: que 'observed' contem 0 e 1 e 'predicted' nao tem NA\nSkill anterior: species-distribution-modelling",
+    conditionMessage(e)
+  )
+  stop(e)
+})
+log_step(4, "Compute MaxTSS and optimal threshold")
+log_decision("threshold_method", "MaxTSS", "maximises sensitivity + specificity; robust for SDMs")
+tryCatch({
+  # Scan cut-offs 0.00, 0.01, ..., 1.00; a prediction is positive when >= cut-off.
+  thresholds <- seq(0, 1, by = 0.01)
+  # TSS (sensitivity + specificity - 1) at a single cut-off.
+  tss_at <- function(cutoff) {
+    called_pos <- dat$predicted >= cutoff
+    n_tp <- sum(called_pos & dat$observed == 1)
+    n_fp <- sum(called_pos & dat$observed == 0)
+    n_tn <- sum(!called_pos & dat$observed == 0)
+    n_fn <- sum(!called_pos & dat$observed == 1)
+    sensitivity <- 0
+    if ((n_tp + n_fn) > 0) sensitivity <- n_tp / (n_tp + n_fn)
+    specificity <- 0
+    if ((n_tn + n_fp) > 0) specificity <- n_tn / (n_tn + n_fp)
+    sensitivity + specificity - 1
+  }
+  tss_vals <- sapply(thresholds, tss_at)
+  # which.max() returns the first maximum, i.e. the lowest cut-off among ties.
+  best_tss_idx <- which.max(tss_vals)
+  best_thresh  <- thresholds[best_tss_idx]
+  best_tss     <- tss_vals[best_tss_idx]
+  log_info("MaxTSS: %.3f at threshold: %.2f", best_tss, best_thresh)
+  if (best_tss < 0.4) {
+    log_warn("MaxTSS = %.3f is low. Model may have poor predictive performance.", best_tss)
+  }
+}, error = function(e) {
+  log_error(
+    "Falha em TSS computation: %s\nCausa provavel: valores NA ou classe unica em 'observed'\nVerifique: que 'observed' contem tanto 0 quanto 1\nSkill anterior: species-distribution-modelling",
+    conditionMessage(e)
+  )
+  stop(e)
+})
+log_step(5, "Save performance metrics")
+# Write the three summary metrics (rounded to 4 decimals) as a two-column CSV.
+tryCatch({
+  metrics <- data.frame(
+    metric = c("AUC-ROC", "MaxTSS", "Threshold_MaxTSS"),
+    value  = round(c(auc, best_tss, best_thresh), 4)
+  )
+  write.csv(metrics, file.path(output_dir, "performance_metrics.csv"), row.names = FALSE)
+  log_info("Performance metrics saved.")
+}, error = function(e) {
+  # Log a diagnostic, then re-raise the original error.
+  log_error(
+    "Falha em save metrics: %s\nCausa provavel: diretorio sem permissao de escrita\nVerifique: output_dir e permissoes do sistema de arquivos\nSkill anterior: model-validation-and-uncertainty (metrics computation)",
+    conditionMessage(e)
+  )
+  stop(e)
+})
+log_step(6, "Generate calibration plot")
+tryCatch({
+  # Ten equal-width bins on [0, 1]; include.lowest keeps predictions of exactly 0 in the first bin.
+  # Predictions outside [0, 1] fall in no bin and get an NA bin label.
+  dat$bin <- cut(dat$predicted, breaks = seq(0, 1, by = 0.1), include.lowest = TRUE)
+  # Per bin: mean prediction, observed rate and number of records.
+  cal <- dat |>
+    group_by(bin) |>
+    summarise(mean_pred = mean(predicted), obs_rate = mean(observed), n = n(), .groups = "drop")
+  # Dashed 1:1 line marks perfect calibration; point size encodes the bin count.
+  p_cal <- ggplot(cal, aes(x = mean_pred, y = obs_rate)) +
+    geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "grey50") +
+    geom_point(aes(size = n), colour = "#2166ac") +
+    geom_line(colour = "#2166ac") +
+    scale_size_area(max_size = 8) +
+    labs(title = "Calibration Plot", x = "Mean Predicted Probability", y = "Observed Rate",
+         size = "n") +
+    theme_bw()
+  ggsave(file.path(output_dir, "calibration_plot.png"), p_cal, width = 6, height = 5, dpi = 150)
+  log_info("Calibration plot written.")
+}, error = function(e) {
+  # Log a diagnostic, then re-raise the original error.
+  log_error(
+    "Falha em calibration plot: %s\nCausa provavel: dados insuficientes por bin ou valores extremos de predicao\nVerifique: distribuicao dos valores preditos e numero de registros\nSkill anterior: model-validation-and-uncertainty (metrics computation)",
+    conditionMessage(e)
+  )
+  stop(e)
+})

package/skills/occupancy-and-detection/SKILL.md ADDED Viewed

@@ -0,0 +1,126 @@
+---
+name: occupancy-and-detection
+description: "Fits single-season and dynamic occupancy models that account for imperfect detection in wildlife survey data. Use this skill when the user mentions occupancy estimation, detection probability, imperfect detection, detection histories, repeated visits, MacKenzie models, psi estimation, dynamic occupancy (colonization/extinction), goodness-of-fit testing (c-hat), site occupancy, or unmarked package analyses."
+skill_version: 1.0.0
+---
+# Skill: occupancy-and-detection
+**Domain:** Occupancy models · Imperfect detection · Replicate surveys
+**Phase:** 3 — Specialist
+**Used by:** run-occupancy-analysis
+---
+## Purpose
+Guides the agent through the design and analysis of occupancy studies that account for imperfect detection. Covers single-season and dynamic occupancy models, covariate specification, goodness-of-fit testing, and result interpretation.
+---
+## When to Invoke
+- Species were surveyed at multiple sites with repeated visits
+- Detection probability is likely < 1 and must be estimated separately from occupancy
+- The goal is to estimate ψ (occupancy) and p (detection) and their covariates
+- Designing a new monitoring protocol where detection needs to be modelled
+---
+## Inputs
+| Input | Format | Required |
+|-------|--------|----------|
+| Detection history matrix (sites × occasions) | CSV (1/0/NA) | Yes |
+| Site-level covariates (ψ covariates) | CSV | Recommended |
+| Observation-level covariates (p covariates) | CSV or 3D array | Recommended |
+| Number of seasons (for dynamic models) | Integer | Conditional |
+---
+## Outputs
+| Output | Description |
+|--------|-------------|
+| `occupancy_estimates.csv` | ψ estimates per site (if site-level) |
+| `detection_estimates.csv` | p estimates per occasion |
+| `model_selection_table.csv` | AIC table for all candidate models |
+| `covariate_effects.csv` | Beta coefficients with 95% CIs |
+| `gof_report.md` | MacKenzie-Bailey χ² goodness-of-fit |
+| `occupancy_map.tif` | Predicted occupancy surface (if spatial) |
+---
+## Steps
+### 1. Assess Study Design
+- Confirm: multiple sites, multiple repeat surveys per site within a season
+- Confirm: population is closed within season (single-season) or document seasons
+- Calculate naive occupancy (proportion of sites with ≥1 detection) as a baseline
+- Report detection rates per occasion
+### 2. Format the Detection History
+- Rows = sites, columns = survey occasions
+- Values: 1 (detected), 0 (surveyed, not detected), NA (not surveyed)
+- Standardise continuous covariates (mean = 0, SD = 1)
+### 3. Define Candidate Models
+- Build candidate model set based on a priori ecological hypotheses
+- Include a null model (ψ(.), p(.)) as baseline
+- Typical covariate hypotheses for ψ: habitat quality, elevation, disturbance index
+- Typical covariate hypotheses for p: observer, time of day, weather, survey effort
+- Avoid all-subsets model selection; limit to ≤ K candidates (K = sample size / 10)
+### 4. Fit Models
+- Use maximum likelihood (unmarked package) or Bayesian (JAGS/Stan) estimation
+- For single-season: `occu(~p_covariates ~psi_covariates)`
+- For dynamic (multi-season): specify colonisation (γ) and extinction (ε) parameters
+- Check for convergence warnings
+### 5. Goodness-of-Fit
+- Apply MacKenzie-Bailey χ² test (parametric bootstrap, n = 1000 iterations)
+- Report ĉ (overdispersion factor); if ĉ > 1.5, use QAICc instead of AICc
+- Visualise observed vs expected detection frequencies
+### 6. Model Selection
+- Rank by AICc (or QAICc)
+- Report ΔAIC and Akaike weights
+- If top models are within ΔAIC < 2, use model averaging
+### 7. Interpret Results
+- Report ψ with 95% CI on the probability scale
+- Report p with 95% CI; discuss implications for survey design
+- Report covariate effects as odds ratios or backtransformed probabilities
+- Compute minimum number of surveys needed to confirm absence (given estimated p)
+---
+## Key Decisions to Document
+- Closure assumption justification
+- Candidate model set rationale
+- Goodness-of-fit result and action taken (e.g., use QAICc)
+- Model averaging vs. best-model inference
+---
+## Tools and Libraries
+**R:** `unmarked`, `RPresence` (R interface to the standalone PRESENCE software), `jagsUI`, `rstan`
+**Python:** `pyoccupancy` (limited), interface to JAGS via `pyjags`
+---
+## Resources
+- `resources/occupancy-study-design.md` — required replicates for target power
+- `resources/detection-history-format.md` — how to format the input matrix
+- `examples/` — worked single-season and dynamic occupancy examples
+---
+## Notes
+- At least 3 repeat surveys per site are recommended for reliable p estimation
+- Spatial replication (many sites) is more important than temporal replication per site
+- Dynamic models require the closure assumption to hold within each season; changes in occupancy are modelled only between seasons

package/skills/occupancy-and-detection/examples/example-prompts.md ADDED Viewed

@@ -0,0 +1,33 @@
+# Example Invocation Prompts — occupancy-and-detection
+## Single-Season Occupancy
+```
+Load skill: occupancy-and-detection
+Task: Single-season occupancy analysis for puma (Puma concolor) from camera trap data.
+Files:
+  - data/detection_history.csv   (80 sites × 6 survey occasions; 1/0/NA)
+  - data/site_covariates.csv     (elevation, forest_cover, dist_to_road)
+  - data/obs_covariates.csv      (effort_nights per occasion, observer_id)
+Candidate models (occupancy ~ ..., detection ~ ...):
+  ψ(forest_cover), p(effort)
+  ψ(forest_cover + dist_to_road), p(effort)
+  ψ(elevation + forest_cover), p(effort + observer)
+  ψ(.), p(.)   ← null model
+Run goodness-of-fit (MacKenzie-Bailey χ², 1000 bootstraps).
+Select by AICc. If ĉ > 1.5, use QAICc.
+Report ψ and p estimates with 95% CIs on probability scale.
+```
+## Power Analysis
+```
+Load skill: occupancy-and-detection
+Task: Power analysis for a proposed camera trap study.
+Expected occupancy (ψ): 0.4. Expected detection per occasion (p): 0.25.
+Target: 80% power to detect a 20% decline in occupancy.
+How many sites and survey occasions are needed?
+```

package/skills/occupancy-and-detection/resources/detection-history-format.md ADDED Viewed

@@ -0,0 +1,100 @@
+# Detection History Format Reference
+The detection history matrix is the primary input for all occupancy models. Correct formatting is essential.
+## Matrix Structure
+- **Rows** = sites (sampling units)
+- **Columns** = survey occasions (within season)
+- **Values**: `1` (detected), `0` (surveyed, not detected), `NA` (not surveyed)
+```
+         occ1  occ2  occ3  occ4  occ5  occ6
+site_01     1     0     1     1     0    NA
+site_02     0     0     0     0     0     0
+site_03    NA     1     1     0     1     1
+site_04     0    NA     0    NA     0     0
+site_05     0     0     0     0     0     0
+```
+## Critical Rules
+1. `0` means the site WAS surveyed but species was NOT detected — not the same as `NA`
+2. `NA` means the site was NOT surveyed on that occasion (equipment failure, weather, etc.)
+3. A row of all zeros = site was surveyed on all occasions, never detected
+4. A row of all `NA` = site was never surveyed; **remove this row**
+5. Occasions must be within the closure period (population assumed closed)
+## Building the Matrix in R
+```r
+library(dplyr)
+library(tidyr)
+# Assuming raw_data has columns: site_id, occasion, detected (0/1), surveyed (TRUE/FALSE)
+det_history <- raw_data |>
+  # Occasions that were not surveyed become NA rather than 0.
+  mutate(value = ifelse(!surveyed, NA, detected)) |>
+  pivot_wider(id_cols = site_id, names_from = occasion, values_from = value,
+              names_prefix = "occ") |>
+  # column_to_rownames() comes from tibble, which dplyr and tidyr do not attach,
+  # so it is called with an explicit namespace.
+  tibble::column_to_rownames("site_id") |>
+  as.matrix()
+# Sanity checks
+cat("Sites:", nrow(det_history), "\n")
+cat("Occasions:", ncol(det_history), "\n")
+cat("Detection rate:", mean(det_history, na.rm = TRUE), "\n")
+cat("Sites with ≥1 detection:", sum(rowSums(det_history, na.rm=TRUE) > 0), "\n")
+cat("Sites never surveyed (remove):", sum(rowSums(!is.na(det_history)) == 0), "\n")
+```
+## Site and Observation Covariates
+Site-level covariates (for ψ): one row per site, one column per variable.
+Observation-level covariates (for p): same dimensions as detection history matrix, or a list of matrices.
+```r
+# Site covariates (same row order as detection history)
+site_covs <- data.frame(
+  forest_cover = c(0.82, 0.45, 0.91, 0.33, 0.71),  # 0–1
+  elevation_m  = c(450, 230, 680, 150, 520),
+  row.names    = rownames(det_history)
+)
+# Observation covariates (matrix: same dimensions as detection history)
+# byrow = TRUE below: each line of values is one site, one value per occasion.
+effort_nights <- matrix(
+  c(3, 3, NA, 3, 3, 3,   # site 1
+    3, 3, 3, 3, 3, 3,    # site 2
+    ...),                # placeholder, not runnable as written: supply the remaining sites' rows
+  nrow = nrow(det_history), byrow = TRUE
+)
+# Build unmarkedFrame
+library(unmarked)
+umf <- unmarkedFrameOccu(
+  y        = det_history,
+  siteCovs = site_covs,
+  obsCovs  = list(effort = effort_nights)
+)
+# Print a summary of the assembled frame to check it before fitting.
+summary(umf)
+```
+## Standardising Covariates
+Always standardise continuous covariates to mean = 0, SD = 1 before modelling:
+```r
+# scale() returns a one-column matrix; as.vector() flattens it back to a plain numeric column.
+site_covs_std <- site_covs |>
+  mutate(across(where(is.numeric), ~ as.vector(scale(.))))
+```
+This improves numerical stability and allows direct comparison of coefficient magnitudes.
+## Common Formatting Errors
+| Error | Symptom | Fix |
+|-------|---------|-----|
+| Using -9 or 999 as NA code | Model fails to converge | Replace with `NA` |
+| Occasions in wrong order | Apparent temporal patterns are artefacts | Sort by date within site |
+| Site covariate row order mismatched | Covariates assigned to wrong sites | Use row names to match |
+| Mixing detection probability with occupancy | Overestimates p | Only use surveys within closure period |
+| Zero variance in a covariate | Model rank deficiency | Remove constant covariates |