PyPI - voidaccess - Versions diffs - 1.3.0__py3-none-any.whl - Mend

voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (142) hide show

analysis/__init__.py +49 -0
analysis/opsec.py +454 -0
analysis/patterns.py +202 -0
analysis/temporal.py +201 -0
api/__init__.py +1 -0
api/auth.py +163 -0
api/main.py +509 -0
api/routes/__init__.py +1 -0
api/routes/admin.py +214 -0
api/routes/auth.py +157 -0
api/routes/entities.py +871 -0
api/routes/export.py +359 -0
api/routes/investigations.py +2567 -0
api/routes/monitors.py +405 -0
api/routes/search.py +157 -0
api/routes/settings.py +851 -0
auth/__init__.py +1 -0
auth/token_blacklist.py +108 -0
cli/__init__.py +3 -0
cli/adapters/__init__.py +1 -0
cli/adapters/sqlite.py +273 -0
cli/browser.py +376 -0
cli/commands/__init__.py +1 -0
cli/commands/configure.py +185 -0
cli/commands/enrich.py +154 -0
cli/commands/export.py +158 -0
cli/commands/investigate.py +601 -0
cli/commands/show.py +87 -0
cli/config.py +180 -0
cli/display.py +212 -0
cli/main.py +154 -0
cli/tor_detect.py +71 -0
config.py +180 -0
crawler/__init__.py +28 -0
crawler/dedup.py +97 -0
crawler/frontier.py +115 -0
crawler/spider.py +462 -0
crawler/utils.py +122 -0
db/__init__.py +47 -0
db/migrations/__init__.py +0 -0
db/migrations/env.py +80 -0
db/migrations/versions/0001_initial_schema.py +270 -0
db/migrations/versions/0002_add_investigation_status_column.py +27 -0
db/migrations/versions/0002_add_missing_tables.py +33 -0
db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
db/migrations/versions/0004_add_page_posted_at.py +41 -0
db/migrations/versions/0005_add_extraction_method.py +32 -0
db/migrations/versions/0006_add_monitor_alerts.py +26 -0
db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
db/migrations/versions/0008_add_users_table.py +47 -0
db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
db/migrations/versions/0013_add_graph_status.py +31 -0
db/migrations/versions/0015_add_progress_fields.py +41 -0
db/migrations/versions/0016_backfill_graph_status.py +33 -0
db/migrations/versions/0017_add_user_api_keys.py +44 -0
db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
db/migrations/versions/0019_add_content_safety_log.py +46 -0
db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
db/models.py +618 -0
db/queries.py +841 -0
db/session.py +270 -0
export/__init__.py +34 -0
export/misp.py +257 -0
export/sigma.py +342 -0
export/stix.py +418 -0
extractor/__init__.py +21 -0
extractor/llm_extract.py +372 -0
extractor/ner.py +512 -0
extractor/normalizer.py +638 -0
extractor/pipeline.py +401 -0
extractor/regex_patterns.py +325 -0
fingerprint/__init__.py +33 -0
fingerprint/profiler.py +240 -0
fingerprint/stylometry.py +249 -0
graph/__init__.py +73 -0
graph/builder.py +894 -0
graph/export.py +225 -0
graph/model.py +83 -0
graph/queries.py +297 -0
graph/visualize.py +178 -0
i18n/__init__.py +24 -0
i18n/detect.py +76 -0
i18n/query_expand.py +72 -0
i18n/translate.py +210 -0
monitor/__init__.py +27 -0
monitor/_db.py +74 -0
monitor/alerts.py +345 -0
monitor/config.py +118 -0
monitor/diff.py +75 -0
monitor/jobs.py +247 -0
monitor/scheduler.py +184 -0
scraper/__init__.py +0 -0
scraper/scrape.py +857 -0
scraper/scrape_js.py +272 -0
search/__init__.py +318 -0
search/circuit_breaker.py +240 -0
search/search.py +334 -0
sources/__init__.py +96 -0
sources/blockchain.py +444 -0
sources/cache.py +93 -0
sources/cisa.py +108 -0
sources/dns_enrichment.py +557 -0
sources/domain_reputation.py +643 -0
sources/email_reputation.py +635 -0
sources/engines.py +244 -0
sources/enrichment.py +1244 -0
sources/github_scraper.py +589 -0
sources/gitlab_scraper.py +624 -0
sources/hash_reputation.py +856 -0
sources/historical_intel.py +253 -0
sources/ip_reputation.py +521 -0
sources/paste_scraper.py +484 -0
sources/pastes.py +278 -0
sources/rss_scraper.py +576 -0
sources/seed_manager.py +373 -0
sources/seeds.py +368 -0
sources/shodan.py +103 -0
sources/telegram.py +199 -0
sources/virustotal.py +113 -0
utils/__init__.py +0 -0
utils/async_utils.py +89 -0
utils/content_safety.py +193 -0
utils/defang.py +94 -0
utils/encryption.py +34 -0
utils/ioc_freshness.py +124 -0
utils/user_keys.py +33 -0
vector/__init__.py +39 -0
vector/embedder.py +100 -0
vector/model_singleton.py +49 -0
vector/search.py +87 -0
vector/store.py +514 -0
voidaccess/__init__.py +0 -0
voidaccess/llm.py +717 -0
voidaccess/llm_utils.py +696 -0
voidaccess-1.3.0.dist-info/METADATA +395 -0
voidaccess-1.3.0.dist-info/RECORD +142 -0
voidaccess-1.3.0.dist-info/WHEEL +5 -0
voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
voidaccess-1.3.0.dist-info/top_level.txt +19 -0

api/routes/monitors.py ADDED Viewed

@@ -0,0 +1,405 @@
+"""
+api/routes/monitors.py — Monitor/watch management endpoints.
+GET    /monitors              — list all watches from monitors.yaml
+POST   /monitors              — create a new watch (writes to monitors.yaml)
+DELETE /monitors/{watch_name} — delete a watch from monitors.yaml
+POST   /monitors/{watch_name}/trigger — trigger a specific watch immediately
+GET    /monitors/status       — job status for all scheduled watches
+"""
+from __future__ import annotations
+import asyncio
+import logging
+import os
+from pathlib import Path
+from typing import Optional
+from fastapi import APIRouter, Depends, HTTPException, Query
+from filelock import FileLock
+from pydantic import BaseModel
+# Cross-platform file lock strategy:
+# - Uses the `filelock` library (works on Linux/Windows/macOS)
+# - Replaces fcntl.flock(), which is Unix-only (not just Linux) and silently failed on Windows
+# - FileLock creates a .lock file alongside monitors.yaml for inter-process locking
+# - Serialises concurrent config writes between processes/deployments that share the same file
+from db.queries import (
+    acknowledge_alerts,
+    get_alert_counts_by_monitor,
+    get_alerts_for_monitor,
+    get_monitor_stats,
+    get_unacknowledged_alert_count,
+)
+from api.auth import require_password_not_reset_pending, CurrentUser
+from db.session import get_session
+logger = logging.getLogger(__name__)  # module logger, named after this module
+router = APIRouter()  # the route decorators below register their endpoints on this router
+# Module-level scheduler reference (populated externally if running with scheduler)
+_scheduler = None  # replaced via set_scheduler(); stays None until that is called
+def _get_monitor_config_path() -> Path:
+    """Get the path to monitors.yaml, configurable via MONITORS_CONFIG_PATH env var."""
+    env_path = os.getenv("MONITORS_CONFIG_PATH")
+    if env_path:
+        return Path(env_path)
+    return Path(__file__).resolve().parents[2] / "data" / "monitors.yaml"
+def _ensure_monitors_yaml_exists() -> None:
+    """Create default empty monitors.yaml if it doesn't exist."""
+    path = _get_monitor_config_path()
+    if not path.exists():
+        try:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            import yaml
+            path.write_text(
+                yaml.dump({"watches": []}, default_flow_style=False),
+                encoding="utf-8",
+            )
+            logger.info(f"Created default monitors.yaml at {path}")
+        except Exception as e:
+            logger.warning(f"Could not create monitors.yaml: {e}")
+_ensure_monitors_yaml_exists()
+def set_scheduler(scheduler) -> None:
+    """Inject the APScheduler instance into this module."""
+    global _scheduler
+    _scheduler = scheduler
+_monitors_lock = asyncio.Lock()
+async def _load_monitors_no_lock() -> list[dict]:
+    """Load monitors.yaml safely, return [] if file missing. NOT thread-safe on its own."""
+    path = _get_monitor_config_path()
+    if not path.exists():
+        import yaml
+        try:
+            await asyncio.to_thread(
+                path.write_text,
+                yaml.dump({"watches": []}, default_flow_style=False),
+                encoding="utf-8"
+            )
+        except Exception as e:
+            logger.error(f"Failed to create default monitors.yaml: {e}")
+        return []
+    try:
+        import yaml
+        content = await asyncio.to_thread(path.read_text, encoding="utf-8")
+        data = yaml.safe_load(content)
+        if not data or not isinstance(data, dict):
+            return []
+        watches = data.get("watches", [])
+        return watches if isinstance(watches, list) else []
+    except Exception as e:
+        logger.error(f"Failed to load monitors.yaml: {e}")
+        return []
+async def _save_monitors_no_lock(watches: list[dict]) -> None:
+    """Save monitors.yaml safely with fsync. NOT thread-safe on its own."""
+    import yaml
+    path = _get_monitor_config_path()
+    content = yaml.dump({"watches": watches}, default_flow_style=False, allow_unicode=True)
+    def _sync_save():
+        tmp_path = path.with_suffix(".tmp")
+        with open(tmp_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+            f.flush()
+            os.fsync(f.fileno())
+        os.replace(tmp_path, path)
+    await asyncio.to_thread(_sync_save)
+async def _load_monitors() -> list[dict]:
+    """Thread-safe YAML load."""
+    async with _monitors_lock:
+        return await _load_monitors_no_lock()
+async def _save_monitors(watches: list[dict]) -> None:
+    """Thread-safe YAML save."""
+    async with _monitors_lock:
+        await _save_monitors_no_lock(watches)
+# ---------------------------------------------------------------------------
+# Request models
+# ---------------------------------------------------------------------------
+class AcknowledgeAlertsBody(BaseModel):  # optional JSON body of the acknowledge endpoint
+    alert_ids: list[int] | None = None  # None is forwarded to acknowledge_alerts as-is (documented there as "acknowledge all")
+class CreateMonitorRequest(BaseModel):  # JSON body accepted by POST create_monitor
+    name: str  # watch name; create_monitor strips it and rejects blank values and duplicates
+    type: str  # "keyword" | "url"
+    query: Optional[str] = None  # required by create_monitor when type == "keyword"
+    url: Optional[str] = None  # required by create_monitor when type == "url"
+    interval_hours: float = 48.0  # create_monitor rejects values below 0.5
+    alert_on: str = "new_results"  # create_monitor accepts "new_results", "any_change" or "any_appearance"
+    webhook_url: Optional[str] = None  # stored in the watch entry; empty values are stored as None
+    telegram_chat_id: Optional[str] = None  # stored in the watch entry; empty values are stored as None
+    email: Optional[str] = None  # stored in the watch entry; empty values are stored as None
+    enabled: bool = True  # stored in the watch entry unchanged
+# ---------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------
+@router.get("")
+async def list_monitors() -> list[dict]:
+    """
+    Return all watches defined in monitors.yaml with aggregate stats from DB.
+    """
+    watches = await _load_monitors()
+    if not watches:
+        return watches
+    with get_session() as session:
+        result = []
+        for watch in watches:
+            name = watch.get("name", "")
+            stats = get_monitor_stats(session, name)
+            enriched = {**watch, **stats}
+            result.append(enriched)
+        return result
+@router.get("/alerts/count")
+async def get_alert_count() -> dict:
+    """
+    Total unacknowledged alert count across all monitors.
+    Used by MonitorNavBadge for the live count.
+    """
+    with get_session() as session:
+        count = get_unacknowledged_alert_count(session)
+        by_monitor = get_alert_counts_by_monitor(session)
+    return {
+        "total_unacknowledged": count,
+        "by_monitor": by_monitor,
+    }
+@router.get("/status")
+async def monitors_status() -> list[dict]:
+    """Return job status for all scheduled watches."""
+    try:
+        from monitor.scheduler import get_job_status  # noqa: PLC0415
+        status = get_job_status(_scheduler)
+        result = []
+        for s in status:
+            result.append({
+                "name": s.get("name"),
+                "next_run_time": (
+                    s["next_run_time"].isoformat()
+                    if s.get("next_run_time") else None
+                ),
+                "last_run_time": (
+                    s["last_run_time"].isoformat()
+                    if s.get("last_run_time") else None
+                ),
+            })
+        return result
+    except Exception as exc:
+        logger.warning("monitors_status failed: %s", exc)
+        return []
+@router.get("/{monitor_name}/alerts")
+async def get_monitor_alerts(
+    monitor_name: str,
+    limit: int = Query(20, ge=1, le=200),
+    include_acknowledged: bool = Query(True),
+) -> dict:
+    """
+    Alert history for a specific monitor.
+    Used by MonitorDetail inline panel.
+    """
+    with get_session() as session:
+        alerts = get_alerts_for_monitor(
+            session,
+            monitor_name=monitor_name,
+            limit=limit,
+            include_acknowledged=include_acknowledged,
+        )
+    return {
+        "monitor_name": monitor_name,
+        "alerts": [
+            {
+                "id": a.id,
+                "triggered_at": a.triggered_at.isoformat(),
+                "change_type": a.change_type,
+                "summary": a.summary,
+                "severity": str(a.severity),
+                "entity_count_delta": a.entity_count_delta,
+                "delivered": a.delivered,
+                "delivery_channels": a.delivery_channels or [],
+                "acknowledged": a.acknowledged,
+                "acknowledged_at": (
+                    a.acknowledged_at.isoformat() if a.acknowledged_at else None
+                ),
+                "diff_data": a.diff_data,
+            }
+            for a in alerts
+        ],
+        "total": len(alerts),
+    }
+@router.post("/{monitor_name}/alerts/acknowledge")
+async def acknowledge_monitor_alerts(
+    monitor_name: str,
+    body: AcknowledgeAlertsBody | None = None,
+) -> dict:
+    """
+    Mark alerts as acknowledged.
+    Body: {"alert_ids": [1, 2, 3]} or empty body to acknowledge all.
+    """
+    alert_ids = body.alert_ids if body else None
+    with get_session() as session:
+        count = acknowledge_alerts(session, monitor_name, alert_ids)
+    return {"acknowledged": count}
+@router.post("")
+async def create_monitor(
+    req: CreateMonitorRequest,
+    current_user: CurrentUser = Depends(require_password_not_reset_pending),
+) -> dict:
+    """Create a new watch and append it to monitors.yaml."""
+    if req.type not in ("keyword", "url"):
+        raise HTTPException(status_code=422, detail="type must be 'keyword' or 'url'")
+    if req.type == "keyword" and not req.query:
+        raise HTTPException(status_code=422, detail="query is required for keyword watches")
+    if req.type == "url" and not req.url:
+        raise HTTPException(status_code=422, detail="url is required for url watches")
+    if req.interval_hours < 0.5:
+        raise HTTPException(status_code=422, detail="interval_hours must be >= 0.5")
+    valid_alert_on = {"new_results", "any_change", "any_appearance"}
+    if req.alert_on not in valid_alert_on:
+        raise HTTPException(
+            status_code=422,
+            detail=f"alert_on must be one of {sorted(valid_alert_on)}",
+        )
+    if not req.name or not req.name.strip():
+        raise HTTPException(status_code=422, detail="name is required")
+    name = req.name.strip()
+    try:
+        path = _get_monitor_config_path()
+        if not path.exists():
+            await _load_monitors()
+        lock_path = str(path) + ".lock"
+        def _sync_create():
+            with FileLock(lock_path, timeout=10):
+                with open(path, 'r+', encoding='utf-8') as f:
+                    content = f.read()
+                    import yaml
+                    data = yaml.safe_load(content) or {"watches": []}
+                    watches = data.get("watches", [])
+                    if not isinstance(watches, list):
+                        watches = []
+                    if any(w.get("name") == name for w in watches if isinstance(w, dict)):
+                        return "duplicate"
+                    entry: dict = {
+                        "name": name,
+                        "type": req.type,
+                        "interval_hours": req.interval_hours,
+                        "alert_on": req.alert_on,
+                        "enabled": req.enabled,
+                        "webhook_url": req.webhook_url or None,
+                        "telegram_chat_id": req.telegram_chat_id or None,
+                        "email": req.email or None,
+                    }
+                    if req.type == "keyword":
+                        entry["query"] = req.query.strip()
+                    else:
+                        entry["url"] = req.url.strip()
+                    watches.append(entry)
+                    f.seek(0)
+                    f.truncate()
+                    f.write(yaml.dump({"watches": watches}, default_flow_style=False, allow_unicode=True))
+                    f.flush()
+                    os.fsync(f.fileno())
+                    return "ok"
+        res = await asyncio.to_thread(_sync_create)
+        if res == "duplicate":
+            raise HTTPException(
+                status_code=409, detail=f"Monitor {name!r} already exists"
+            )
+        return {"created": True, "name": name}
+    except HTTPException:
+        raise
+    except Exception as exc:
+        logger.error("create_monitor failed: %s", exc)
+        raise HTTPException(status_code=500, detail=str(exc))
+@router.delete("/{watch_name}")
+async def delete_monitor(watch_name: str) -> dict:
+    """Remove a watch from monitors.yaml by name."""
+    try:
+        logger.debug(f"Lock ID: {id(_monitors_lock)}")
+        async with _monitors_lock:
+            watches = await _load_monitors_no_lock()
+            before = len(watches)
+            watches = [w for w in watches if not (isinstance(w, dict) and w.get("name") == watch_name)]
+            if len(watches) == before:
+                raise HTTPException(status_code=404, detail=f"Watch {watch_name!r} not found")
+            await _save_monitors_no_lock(watches)
+        return {"deleted": True, "name": watch_name}
+    except HTTPException:
+        raise
+    except Exception as exc:
+        logger.error("delete_monitor failed: %s", exc)
+        raise HTTPException(status_code=500, detail=str(exc))
+@router.post("/{watch_name}/trigger")
+async def trigger_monitor(watch_name: str) -> dict:
+    """Trigger a specific watch immediately."""
+    try:
+        from monitor.config import get_watch_by_name  # noqa: PLC0415
+        from monitor.scheduler import trigger_job_now  # noqa: PLC0415
+        watch = get_watch_by_name(watch_name)
+        if watch is None:
+            raise HTTPException(status_code=404, detail=f"Watch {watch_name!r} not found")
+        triggered = trigger_job_now(_scheduler, watch_name)
+        return {"triggered": triggered, "watch_name": watch_name}
+    except HTTPException:
+        raise
+    except Exception as exc:
+        logger.warning("trigger_monitor failed: %s", exc)
+        return {"triggered": False, "watch_name": watch_name}

api/routes/search.py ADDED Viewed

@@ -0,0 +1,157 @@
+"""
+api/routes/search.py — Semantic and full-text search endpoints.
+POST /search/semantic   — vector similarity search against scraped pages
+POST /search/entities   — full-text search across entity values in DB
+"""
+from __future__ import annotations
+import logging
+import os
+from typing import Optional
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel
+from api.auth import CurrentUser, get_current_user
+logger = logging.getLogger(__name__)  # module logger, named after this module
+router = APIRouter()  # the two search endpoints below register on this router
+# ---------------------------------------------------------------------------
+# Request schemas
+# ---------------------------------------------------------------------------
+class SemanticSearchRequest(BaseModel):  # JSON body of the semantic search endpoint
+    query: str  # text handed to find_related_pages
+    n_results: int = 10  # number of hits requested by the client
+    offset: int = 0  # pagination offset requested by the client
+class EntitySearchRequest(BaseModel):  # JSON body of the entity search endpoint
+    query: str  # substring matched against Entity.value
+    entity_types: Optional[list[str]] = None  # when non-empty, restricts results to these Entity.entity_type values
+    offset: int = 0  # search_entities treats negative values as 0
+    limit: int = 50  # search_entities clamps this to the range 1..200
+# ---------------------------------------------------------------------------
+# Routes
+# ---------------------------------------------------------------------------
+@router.post("/semantic")
+async def semantic_search(
+    body: SemanticSearchRequest,
+    current_user: CurrentUser = Depends(get_current_user),
+) -> dict:
+    """
+    Return semantically similar pages from the vector store.
+    Uses ChromaDB + sentence-transformers embeddings.
+    Supports pagination via offset/n_results.
+    """
+    try:
+        from vector.search import find_related_pages
+        from vector.store import count_pages
+        results = find_related_pages(body.query, n_results=body.n_results)
+        total = count_pages()
+        if not isinstance(results, list):
+            results = []
+        user_inv_ids: set[str] = set()
+        if os.getenv("DATABASE_URL"):
+            try:
+                from db.session import get_session  # noqa: PLC0415
+                from db.models import Investigation  # noqa: PLC0415
+                with get_session() as session:
+                    rows = (
+                        session.query(Investigation.id)
+                        .filter(Investigation.user_id == current_user.user.id)
+                        .all()
+                    )
+                    user_inv_ids = {str(r[0]) for r in rows}
+            except Exception as exc:
+                logger.warning("semantic_search: failed to load user inv IDs: %s", exc)
+        results = [
+            r for r in results
+            if str(r.get("metadata", {}).get("investigation_id", "")) in user_inv_ids
+        ]
+        return {
+            "items": results,
+            "total": total,
+            "offset": body.offset,
+            "n_results": body.n_results,
+        }
+    except Exception as exc:
+        logger.warning("semantic_search failed: %s", exc)
+        return {"items": [], "total": 0, "offset": 0, "n_results": 10}
+@router.post("/entities")
+async def search_entities(
+    body: EntitySearchRequest,
+    current_user: CurrentUser = Depends(get_current_user),
+) -> list[dict]:
+    """
+    Full-text search across entity values in DB.
+    Optionally filter by entity_types list.
+    Supports pagination via offset/limit.
+    """
+    if not os.getenv("DATABASE_URL"):
+        return []
+    try:
+        from db.session import get_session  # noqa: PLC0415
+        from db.models import Entity, Investigation, InvestigationEntityLink  # noqa: PLC0415
+        import sqlalchemy as sa  # noqa: PLC0415
+        limit = max(1, min(body.limit, 200))
+        offset = max(0, body.offset)
+        with get_session() as session:
+            user_inv_ids = (
+                session.query(Investigation.id)
+                .filter(Investigation.user_id == current_user.user.id)
+                .subquery()
+            )
+            linked_entity_ids = (
+                session.query(InvestigationEntityLink.entity_id)
+                .filter(InvestigationEntityLink.investigation_id.in_(user_inv_ids))
+                .subquery()
+            )
+            q = session.query(Entity).filter(
+                sa.or_(
+                    Entity.investigation_id.in_(user_inv_ids),
+                    Entity.id.in_(linked_entity_ids),
+                ),
+                Entity.value.contains(body.query),
+            )
+            if body.entity_types:
+                q = q.filter(Entity.entity_type.in_(body.entity_types))
+            total = q.count()
+            entities = q.order_by(Entity.created_at.desc()).offset(offset).limit(limit).all()
+            return {
+                "items": [
+                    {
+                        "id": str(e.id),
+                        "entity_type": e.entity_type,
+                        "value": e.value,
+                        "confidence": e.confidence,
+                        "investigation_id": str(e.investigation_id) if e.investigation_id else None,
+                        "created_at": e.created_at.isoformat() if e.created_at else None,
+                    }
+                    for e in entities
+                ],
+                "total": total,
+                "offset": offset,
+                "limit": limit,
+            }
+    except Exception as exc:
+        logger.warning("search_entities failed: %s", exc)
+        return {"items": [], "total": 0, "offset": 0, "limit": 50}