npm - ecological-agent-skills - Versions diffs - 3.1.0 - Mend

ecological-agent-skills 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (217) hide show

package/skills/biostatistics-workbench/scripts/glm_pipeline.py ADDED Viewed

@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+# ecological-agent-skills / Copyright (C) 2026 Francisco Diego Barros Barata
+# SPDX-License-Identifier: GPL-3.0-or-later
+"""
+glm_pipeline.py
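+Fit candidate GLMs, rank them by AIC, and save a summary and diagnostic plots for the best model.
+Usage: python glm_pipeline.py [data_csv] [response_var] [output_dir]
+Defaults: data/processed/data.csv, richness, outputs/stats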
+Requires: pandas, numpy, statsmodels, scipy, matplotlib, seaborn
+"""
+import logging
+import sys
+from datetime import datetime
+from pathlib import Path
+# Skill identifier; it is embedded in the log file name, in every log line, and in the logger name.
+SKILL_NAME = "biostatistics-workbench"
+# Log files live in a "logs" directory relative to the current working directory.
+# The directory is created as a side effect of importing/running this module.
+_LOG_DIR   = Path("logs")
+_LOG_DIR.mkdir(parents=True, exist_ok=True)
+# One log file per run, named with the start time (naive local clock, second resolution).
+_log_file  = _LOG_DIR / f"skill_{SKILL_NAME}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
+# Root logging configuration: INFO and above go both to stdout and to the UTF-8 log file.
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] [%(levelname)s] [" + SKILL_NAME + "] %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler(_log_file, encoding="utf-8"),
+    ],
+)
+# Module-wide logger used by every function below.
+logger = logging.getLogger(SKILL_NAME)
+def log_step(n: int, desc: str) -> None:
+    logger.info("-- STEP %d: %s", n, desc)
+def log_decision(var: str, val, why: str) -> None:
+    logger.info("DECISION | %s = %s | %s", var, val, why)
+import numpy as np
+import pandas as pd
+import statsmodels.formula.api as smf
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+import scipy.stats as stats
+def vif_check(df: pd.DataFrame, predictors: list) -> pd.DataFrame:
+    """Compute VIF for each predictor via auxiliary regressions."""
+    from statsmodels.stats.outliers_influence import variance_inflation_factor
+    X = sm.add_constant(df[predictors].dropna())
+    vif_data = pd.DataFrame({
+        "predictor": predictors,
+        "VIF": [variance_inflation_factor(X.values, i+1) for i in range(len(predictors))]
+    }).sort_values("VIF", ascending=False)
+    return vif_data
+def fit_candidates(data: pd.DataFrame, response: str, candidates: dict, family) -> list:
+    results = []
+    for label, formula_str in candidates.items():
+        try:
+            m = smf.glm(formula_str, data=data, family=family).fit(disp=0)
+            results.append({"label": label, "formula": formula_str, "AIC": m.aic,
+                            "deviance": m.deviance, "df_resid": m.df_resid, "model": m})
+            logger.info("  %s: AIC = %.2f", label, m.aic)
+        except Exception as e:
+            logger.error(
+                "Unexpected error in fit_candidates [%s]: %s\n"
+                "Causa provavel: formula invalida, colunas ausentes, ou familia incompativel\n"
+                "Verifique: nomes das colunas no CSV e formula definida\n"
+                "Skill anterior: data-cleaning",
+                label, e
+            )
+    return results
+def model_selection_table(results: list) -> pd.DataFrame:
+    tbl = pd.DataFrame([{k: v for k, v in r.items() if k != "model"} for r in results])
+    tbl = tbl.sort_values("AIC").reset_index(drop=True)
+    tbl["deltaAIC"] = tbl["AIC"] - tbl["AIC"].min()
+    tbl["weight"]   = np.exp(-0.5 * tbl["deltaAIC"])
+    tbl["weight"]  /= tbl["weight"].sum()
+    return tbl
+def diagnostic_plots(model, label: str, output_dir: Path) -> None:
+    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
+    # Residuals vs fitted
+    fitted = model.fittedvalues
+    resid  = model.resid_pearson
+    axes[0].scatter(fitted, resid, alpha=0.5, s=20)
+    axes[0].axhline(0, color="red", linestyle="--")
+    axes[0].set_xlabel("Fitted values"); axes[0].set_ylabel("Pearson residuals")
+    axes[0].set_title("Residuals vs Fitted")
+    # QQ plot
+    stats.probplot(resid, dist="norm", plot=axes[1])
+    axes[1].set_title("QQ Plot of Pearson Residuals")
+    fig.suptitle(f"Diagnostics: {label}")
+    plt.tight_layout()
+    plt.savefig(output_dir / f"diagnostics_{label}.png", dpi=150)
+    plt.close()
+def main():
+    data_file    = sys.argv[1] if len(sys.argv) > 1 else "data/processed/data.csv"
+    response_var = sys.argv[2] if len(sys.argv) > 2 else "richness"
+    output_dir   = Path(sys.argv[3]) if len(sys.argv) > 3 else Path("outputs/stats")
+    log_step(1, "Validate inputs and load data")
+    if not Path(data_file).exists():
+        logger.error(
+            "Input file not found: %s\n"
+            "Causa provavel: caminho incorreto ou arquivo nao gerado ainda\n"
+            "Verifique: o argumento data_csv e o diretorio de trabalho\n"
+            "Skill anterior: data-cleaning",
+            data_file
+        )
+        sys.exit(1)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    try:
+        dat = pd.read_csv(data_file)
+    except Exception as e:
+        logger.error(
+            "Unexpected error in load data: %s\n"
+            "Causa provavel: arquivo CSV malformado ou permissoes insuficientes\n"
+            "Verifique: encoding e estrutura do arquivo CSV\n"
+            "Skill anterior: data-cleaning",
+            e
+        )
+        raise
+    logger.info("Loaded %d rows. Response: %s", len(dat), response_var)
+    if response_var not in dat.columns:
+        logger.error(
+            "Response variable '%s' not found in columns: %s\n"
+            "Causa provavel: nome da variavel resposta incorreto\n"
+            "Verifique: cabecalho do CSV e o argumento response_var\n"
+            "Skill anterior: data-cleaning",
+            response_var, list(dat.columns)
+        )
+        sys.exit(1)
+    n_missing = dat[response_var].isna().sum()
+    if n_missing > 0:
+        log_warn_msg = (
+            "Response variable '%s' has %d missing values (%.1f%%). "
+            "Rows with NA will be dropped by statsmodels."
+        )
+        logger.warning(log_warn_msg, response_var, n_missing, 100 * n_missing / len(dat))
+    log_step(2, "Define candidate models and family")
+    # --- Define your candidate models here ---
+    candidates = {
+        "null":    f"{response_var} ~ 1",
+        "model1":  f"{response_var} ~ C(group)",
+        "model2":  f"{response_var} ~ C(group) + elevation",
+        "model3":  f"{response_var} ~ C(group) + elevation + forest_cover",
+    }
+    family = sm.families.NegativeBinomial()
+    log_decision("family", "NegativeBinomial", "count response variable; NB handles overdispersion")
+    log_decision("n_candidates", len(candidates), "null + 3 increasingly complex models for AIC comparison")
+    log_step(3, "Fit candidate models")
+    logger.info("Fitting candidate models:")
+    results = fit_candidates(dat, response_var, candidates, family)
+    if not results:
+        logger.error(
+            "No models converged successfully.\n"
+            "Causa provavel: dados insuficientes ou preditores com NA em todas as linhas\n"
+            "Verifique: completude dos dados e formulas dos candidatos\n"
+            "Skill anterior: data-cleaning"
+        )
+        sys.exit(1)
+    log_step(4, "Build model selection table")
+    try:
+        tbl = model_selection_table(results)
+        logger.info("Model selection table:\n%s", tbl[['label','AIC','deltaAIC','weight']].to_string(index=False))
+        tbl.drop(columns=["model"], errors="ignore").to_csv(output_dir / "model_selection.csv", index=False)
+    except Exception as e:
+        logger.error(
+            "Unexpected error in model selection table: %s\n"
+            "Causa provavel: nenhum modelo ajustado com sucesso\n"
+            "Verifique: etapa de fitting para mensagens de erro anteriores\n"
+            "Skill anterior: biostatistics-workbench (fitting)",
+            e
+        )
+        raise
+    log_step(5, "Summarise best model and save diagnostics")
+    try:
+        best_result = min(results, key=lambda x: x["AIC"])
+        best_model  = best_result["model"]
+        log_decision("best_model", best_result["label"], "lowest AIC among converged candidates")
+        logger.info("Best model: %s (AIC = %.2f)", best_result["label"], best_result["AIC"])
+        logger.info(str(best_model.summary()))
+        (output_dir / "best_model_summary.txt").write_text(str(best_model.summary()))
+        diagnostic_plots(best_model, best_result["label"], output_dir)
+        logger.info("Outputs written to: %s", output_dir)
+    except Exception as e:
+        logger.error(
+            "Unexpected error in best model summary/diagnostics: %s\n"
+            "Causa provavel: objeto de modelo invalido ou diretorio sem permissao de escrita\n"
+            "Verifique: output_dir e o modelo selecionado\n"
+            "Skill anterior: biostatistics-workbench (fitting)",
+            e
+        )
+        raise
+if __name__ == "__main__":
+    main()

package/skills/camera-trap-processing/SKILL.md ADDED Viewed

@@ -0,0 +1,159 @@
+---
+name: camera-trap-processing
+description: "Processes camera trap image records into structured detection data, activity patterns, and trap effort summaries. Use this skill when the user mentions camera traps, wildlife cameras, trap nights, detection events, diel activity patterns, camtrapR, temporal overlap indices (Dhat), RAI (relative abundance index), camera station data, detection history generation, or independence thresholds for photo events."
+skill_version: 1.0.0
+---
+# Skill: camera-trap-processing
+**Domain:** Camera traps · Detection events · Activity patterns · Occupancy · Abundance indices
+---
+## Purpose
+Guides the agent through processing raw camera trap image data into validated detection records, calculating activity pattern metrics, and preparing outputs for occupancy and abundance estimation. Covers detection event definition, independence filtering, trap effort computation, diel activity analysis, and integration with the occupancy-and-detection skill via detection history matrices.
+---
+## When to Invoke
+Invoke this skill when:
+- A user provides a directory of camera trap images or a raw detection CSV for processing
+- The goal is to calculate wildlife activity patterns, diel overlap, or relative activity indices
+- Detection history matrices are needed as input for occupancy models
+- The user asks about independent detection events, trap effort, or camera operation
+- Photographic mark-recapture or N-mixture models are planned
+**trigger_keywords:** `camera trap`, `wildlife camera`, `detection event`, `trap night`, `activity pattern`, `diel activity`, `photographic index`, `camtrapR`, `detection history`, `occupancy camera`, `camera operation`, `independent detection`
+---
+## Inputs
+| Input | Format | Required |
+|---|---|---|
+| Image directory (organised by camera station) | Directory tree | Required |
+| Camera metadata CSV (station, lat, lon, setup date, retrieval date) | CSV | Required |
+| Species list for filtering | TXT or CSV | Recommended |
+| Independence threshold (minutes) | Integer (default: 30) | Optional |
+| Detection record table (if images already processed) | CSV | Conditional |
+---
+## Outputs
+| Output | Description |
+|---|---|
+| `record_table.csv` | One row per independent detection event per species per camera |
+| `detection_history.csv` | Binary site × occasion matrix ready for `unmarked` |
+| `camera_operation.csv` | Daily operation status per camera (1=active, 0=inactive) |
+| `trap_effort_summary.csv` | Trap-nights per station and per species |
+| `records_per_species.csv` | Count of independent events per species |
+| `activity_plot.png` | Kernel density of diel activity with 95% CI |
+| `activity_overlap.csv` | Diel overlap index Δ between two groups |
+| `circular_stats.csv` | Mean activity time, concentration (κ), Rayleigh test |
+---
+## Steps
+1. **Validate camera metadata**
+   Confirm all stations have setup and retrieval dates, coordinates, and unique IDs.
+   Run `process_camtrap_data.R` with the image directory and metadata CSV.
+   The script creates the camera operation matrix from station dates.
+2. **Extract EXIF and build record table**
+   `camtrapR::recordTable()` reads image timestamps and species labels from directory structure.
+   Check that directory hierarchy matches: `<station>/<species>/<images>`.
+   Output: `record_table.csv` with columns station, species, datetime, filename.
+3. **Apply independence filter**
+   Events from the same species at the same station within `indep_threshold_min` (default: 30)
+   are collapsed into a single event. Record the threshold in `decision_log.md`.
+   If threshold is changed from 30 min, justify the choice using home range data.
+4. **Calculate trap effort**
+   Compute trap-nights per station from the camera operation matrix.
+   Flag stations with < 100 trap-nights in `trap_effort_summary.csv`.
+   Do not use stations with < 20 trap-nights in occupancy estimation.
+5. **Generate detection history matrix**
+   Use `camtrapR::detectionHistory()` with the chosen occasion length (default: 1 week).
+   Output: binary matrix with rows = stations, columns = occasions.
+   Pass to `occupancy-and-detection` skill for model fitting.
+6. **Estimate diel activity patterns** *(optional — invoke `estimate_activity.R`)*
+   Fit von Mises kernel to circular time-of-day data.
+   Calculate Dhat4 overlap index between two groups (e.g., dry vs. wet season).
+   If `n_independent_events < 10` per species, report relative activity index only.
+7. **Validate outputs**
+   Confirm all output files are non-empty. Check that detection history dimensions match
+   (n_stations × n_occasions). Verify no station has all-NA rows.
+   Record decisions in `decision_log.md`.
+---
+## Decision Points
+| Condition | Diagnosis | Recommended Action |
+|---|---|---|
+| Time between photos < independence threshold | Events are not independent — same animal | Collapse to one event; use 30 min default unless home range data justifies otherwise |
+| `n_independent_events` < 10 per species | Insufficient data for occupancy estimation | Report relative activity index (RAI) only; do not fit occupancy model |
+| Camera operational time < 100 trap-nights | Insufficient sampling effort | Flag station; exclude from occupancy analysis; include in effort summary |
+| Fewer than 15 stations | Below minimum for occupancy model | Use naive occupancy with explicit caveat; see occupancy-and-detection skill |
+| Missing EXIF timestamps | Images cannot be time-ordered | Request re-processing; use filename timestamps as fallback if consistent |
+---
+## Key Decisions to Document
+Record the following in `decision_log.md` after running this skill:
+- Which independence threshold (minutes) was used and why
+- How many events were collapsed due to non-independence
+- Which stations were excluded due to low effort and why
+- What occasion length was used for detection history and why
+- Whether any species had insufficient events for occupancy (RAI reported instead)
+---
+## Tools and Libraries
+**R**
+```r
+suppressPackageStartupMessages(library(camtrapR))   # record table, detection history
+suppressPackageStartupMessages(library(overlap))    # diel overlap index (Dhat4)
+suppressPackageStartupMessages(library(circular))   # circular statistics
+suppressPackageStartupMessages(library(dplyr))      # data manipulation
+suppressPackageStartupMessages(library(ggplot2))    # plotting
+suppressPackageStartupMessages(library(lubridate))  # date handling
+```
+**Python**
+```python
+import pandas as pd          # data manipulation
+import numpy as np           # numerical operations
+import matplotlib.pyplot as plt  # plotting
+from pathlib import Path     # file system operations
+```
+---
+## Resources
+- [`skills/camera-trap-processing/resources/detection-event-definition-guide.md`](resources/detection-event-definition-guide.md) — Independence thresholds by taxon and how the choice affects estimates
+- [`skills/camera-trap-processing/resources/camtrapR-workflow-guide.md`](resources/camtrapR-workflow-guide.md) — Directory structure, EXIF extraction, and key camtrapR functions
+- [`skills/camera-trap-processing/resources/activity-patterns-reference.md`](resources/activity-patterns-reference.md) — Diel overlap index Δ, circular statistics, and seasonal stratification
+---
+## Notes
+- **Directory structure is mandatory for camtrapR:** images must be organised as `<station>/<species>/<images>`. Flat directories will cause `recordTable()` to fail. If images are flat, reorganise before processing.
+- **Independence threshold inflates occupancy if too short:** Using 5 min instead of 30 min can increase apparent detection events by 2–5×, biasing occupancy upward. Document the threshold and perform sensitivity analysis if in doubt.
+- **Trap-nights ≠ camera-nights if cameras malfunction:** Always use the camera operation matrix, not simply `retrieval_date - setup_date`. Cameras with SD card failures, battery death, or tampering must be accounted for.
+- **RAI is not equivalent to occupancy or density:** The Relative Activity Index (RAI, also widely called the relative abundance index; detections per 100 trap-nights) is a naive index that conflates occupancy, detection probability, and activity. It is useful for relative comparisons only.
+- **Circular time requires conversion to radians:** Activity times must be converted from clock hours to radians (`time_rad = hour * (2*pi/24)`) before fitting kernel density or calculating Δ.

package/skills/camera-trap-processing/examples/example-prompts.md ADDED Viewed

@@ -0,0 +1,103 @@
+---
+skill_id: camera-trap-processing
+example_count: 5
+---
+# Camera Trap Processing — Example Prompts
+## Scenario 1: Multi-Species Occupancy Survey
+**Context:** Savanna park, 40 stations deployed for 60 days. Goal: build detection histories for all medium-to-large mammals for an occupancy modelling exercise.
+**Prompt:**
+> "I have 60 days of camera trap data from 40 stations in a savanna park. Process all images, define independent detection events (30-min threshold for mammals), build a weekly detection history matrix, and flag stations with less than 100 trap-nights."
+**Expected workflow:**
+1. `process_camtrap_data.R <image_dir> <metadata.csv> outputs/ 30`
+2. Inspect `trap_effort_summary.csv` — exclude stations < 100 trap-nights
+3. Load `detection_history.csv` into `unmarked` for single-season occupancy modelling
+4. Check `records_per_species.csv` — exclude species with < 10 events from occupancy
+**Key decision points:**
+- Stations excluded for effort < 100 trap-nights: re-visit in next field season
+- Species with < 10 events: report RAI only, not occupancy
+---
+## Scenario 2: Predator–Prey Temporal Overlap
+**Context:** African savanna. Compare diel activity patterns of lions and wildebeest. Determine the degree of temporal overlap (Δ) and assess whether lions are more active when wildebeest are active.
+**Prompt:**
+> "I have 6 months of camera trap data for lions and wildebeest at a savanna site. Estimate diel activity curves for both species and compute temporal overlap (Dhat4). Test whether overlap is significantly higher than expected by chance using a bootstrap permutation test."
+**Expected workflow:**
+1. `estimate_activity.R record_table.csv "Lion" outputs/`
+2. `estimate_activity.R record_table.csv "Wildebeest" outputs/`
+3. Load both activity CSV outputs; compare Dhat4 estimates with bootstrap CI
+4. Report interpretation: Δ > 0.75 = high overlap → consistent with lions tracking prey activity; low Δ would instead suggest temporal avoidance
+**Key decision points:**
+- If n Lion < 75 → bootstrap CI will be wide; interpret cautiously
+- If the observed Δ lies outside the 95% interval of the permutation (null) distribution → significant temporal association; note that Δ has no fixed "random" value such as 0.5
+---
+## Scenario 3: Relative Activity Index (RAI) Across a Disturbance Gradient
+**Context:** Forest–agricultural edge transect, 3 habitat types (interior, edge, farmland). Compare RAI of 5 focal species across habitat zones.
+**Prompt:**
+> "Compare the Relative Activity Index (RAI) for tapir, peccary, deer, ocelot, and armadillo across three habitat types (forest interior, forest edge, farmland) using 90 days of camera trap data. Test for significant habitat effects on RAI."
+**Expected workflow:**
+1. `process_camtrap_data.R` for entire dataset; join with habitat zone from metadata
+2. Compute RAI = detections / trap-nights × 100 per station × species
+3. Kruskal-Wallis test on station-level RAI, or a negative binomial GLMM on the underlying counts: `detections ~ habitat_type + offset(log(trap_nights)) + (1 | station)`
+4. Post-hoc Dunn test for pairwise habitat comparisons
+5. Plot faceted bar chart (species × habitat) with SE bars
+**Expected findings:**
+- Tapir likely shows strong avoidance of farmland
+- Armadillo may show edge preference
+- Report effect sizes (η²) alongside p-values
+---
+## Scenario 4: Camera Trap Population Index Trend
+**Context:** Long-term monitoring (5 years). Track RAI of a threatened ungulate across annual survey periods to detect population decline.
+**Prompt:**
+> "I have 5 years of annual camera trap surveys (60-day seasons, same 25 stations each year) for a threatened deer species. Calculate RAI per station per year, test for a significant linear trend, and assess whether the population is declining."
+**Expected workflow:**
+1. `process_camtrap_data.R` for each year; combine record tables with year column
+2. Compute station-level RAI per year
+3. Linear mixed model: `log(RAI + 0.01) ~ year + (1 | station)`
+4. If slope < 0 and p < 0.05 → declining trend; estimate annual rate of change
+5. Plot RAI time series per station; highlight trend line with 95% CI
+**Key decision points:**
+- If station composition differs between years → use only stations sampled all 5 years
+- If RAI changes > 50% between consecutive years → check for recording malfunctions
+---
+## Scenario 5: Camera Trap Data for Occupancy Modelling Integration
+**Context:** Joint camera trap + acoustic monitoring survey. Need to prepare occupancy inputs for 12 focal bird species detected in both data streams.
+**Prompt:**
+> "I have 45 days of camera trap data (for ground-dwelling birds) alongside acoustic monitoring results from the same stations. Prepare weekly detection history matrices for 12 focal species, combine with acoustic detection histories, and run a multi-method occupancy model in unmarked."
+**Expected workflow:**
+1. `process_camtrap_data.R` → `detection_history.csv` (camera)
+2. Acoustic detections filtered to ≥ 0.7 confidence → binary weekly matrix
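+3. Stack camera and acoustic histories as survey occasions in `unmarked::occu()` with observation method as a detection covariate (`unmarked::occuMS()` fits multi-state, not multi-method, models)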
+4. Compare detection probability estimates between methods
+5. Report false-absence risk for each method per species
+**Key decision points:**
+- If camera detection probability < 0.10 for arboreal species → remove camera data for that species
+- If acoustic and camera occupancy estimates differ > 20% → investigate site-level covariates