PyPI - financebench-rag-agent - Versions diffs - 0.1.0__tar.gz - Mend

financebench-rag-agent 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (237) hide show

financebench_rag_agent-0.1.0/.env.example ADDED Viewed

@@ -0,0 +1,202 @@
+# LLM Providers
+OPENAI_API_KEY=sk-your-openai-key-here
+GROQ_API_KEY=gsk_your-groq-key-here
+ANTHROPIC_API_KEY=sk-ant-your-anthropic-key-here
+# Voyage AI — finance-tuned embeddings (Sprint 7.8 Week 1). Free tier is 50M
+# tokens / account. Sign up at https://www.voyageai.com → API keys.
+VOYAGE_API_KEY=pa-your-voyage-key-here
+# --- Embedding provider ---
+# "openai" (default) or "voyage". When "voyage", set EMBEDDING_MODEL=voyage-finance-2
+# and EMBEDDING_DIMENSIONS=1024.
+EMBEDDING_PROVIDER=openai
+EMBEDDING_MODEL=text-embedding-3-large
+EMBEDDING_DIMENSIONS=3072
+# Patronus AI (optional) — hosted fuzzy-match judge for FinanceBench eval.
+# Sign up free at https://app.patronus.ai and paste your key here. Free tier
+# is sufficient for the 150-question FinanceBench run.
+PATRONUS_API_KEY=
+# Set to true to route all LLM calls through OpenAI (bypass Groq free-tier rate
+# limits during eval runs; also bypasses Anthropic for eval cost control).
+# Keep false in production.
+FORCE_OPENAI_ONLY=false
+# Use Groq for the high-volume fast-path nodes (router, grader, query_rewriter).
+# Set false during long evals (e.g. FinanceBench) so router/grader/query_rewriter
+# go to OpenAI even when GROQ_API_KEY is set — avoids the Groq 100k tokens-per-day
+# free-tier cap. Default true preserves the production latency profile.
+USE_GROQ_FAST_PATH=true
+ENABLE_DETERMINISTIC_VALIDATOR=true
+VALIDATOR_MIN_KEEP=3
+ENABLE_LTR_GATE=false
+LTR_GATE_MODEL_PATH=data/models/ltr_gate.json
+LTR_GATE_HIGH_CONFIDENCE=0.9
+LTR_GATE_LOW_CONFIDENCE=0.1
+ENABLE_SELECTIVE_RETRIEVAL_EVALUATOR=false
+RETRIEVAL_EVALUATOR_MIN_CONFIDENCE=0.55
+# Multi-HyDE (Sprint 7.10a) — N hypothetical 10-K-style passages generated per
+# query, each embedded + searched, RRF-fused across (orig + N) paths. Targets
+# the question-phrasing vs document-phrasing vocabulary gap.
+ENABLE_MULTI_HYDE=false
+MULTI_HYDE_N=3
+MULTI_HYDE_MODEL=gpt-4o-mini
+MULTI_HYDE_TEMPERATURE=0.3
+# LangSmith Observability
+LANGCHAIN_TRACING_V2=true
+LANGCHAIN_PROJECT=rag-agent-dev
+LANGCHAIN_API_KEY=lsv2_pt_your-langsmith-key-here
+# Authentication
+JWT_SECRET=change-this-to-a-random-secret-in-production
+# Qdrant
+QDRANT_HOST=localhost
+QDRANT_PORT=6333
+QDRANT_COLLECTION=financial_docs
+# Documents root — filesystem location from which `GET /documents/{filename}`
+# serves PDFs for the frontend citation clickthrough. Path traversal is
+# rejected (only basenames inside this directory are served).
+DOCUMENTS_ROOT=data/sample
+# PostgreSQL (for LangGraph checkpointer)
+POSTGRES_DB=rag_agent
+POSTGRES_USER=rag_user
+POSTGRES_PASSWORD=change-this-in-production
+POSTGRES_HOST=localhost
+POSTGRES_PORT=5432
+# LiteLLM gateway (Sprint 8 8a) — single proxy fronting every LLM call.
+# Default empty = direct-provider behavior (Sprint 7.x compat).
+# Set after `docker compose up -d litellm` to route through the proxy:
+#   - inside compose:  http://litellm:4000
+#   - host-based dev:  http://localhost:4000
+LITELLM_URL=
+# Redis (Sprint 8 8b) — backs the LiteLLM `redis-semantic` cache.
+# Read from the litellm container's environment. docker-compose.yml sets
+# REDIS_HOST=redis / REDIS_PORT=6379 / REDIS_PASSWORD= explicitly, so
+# nothing here is required for local-compose dev. The keys are listed
+# for production deployments where Redis lives on a separate host.
+REDIS_HOST=
+REDIS_PORT=
+REDIS_PASSWORD=
+# Langfuse (Sprint 8 8c) — self-hosted observability stack. The local
+# docker-compose seeds an org/project with dev defaults so LiteLLM
+# auto-attaches on boot with no manual UI step. For shared / multi-user /
+# production deployments override every variable in this section: the
+# defaults baked into docker-compose.yml only protect a local-machine
+# instance.
+#
+# LANGFUSE_HOST / PUBLIC_KEY / SECRET_KEY — point the rag-agent at any
+# Langfuse instance (hosted or self-hosted). Empty = use the in-compose
+# `langfuse-web` defaults from docker-compose.yml.
+LANGFUSE_HOST=
+LANGFUSE_PUBLIC_KEY=
+LANGFUSE_SECRET_KEY=
+# Langfuse data-plane secrets — only consumed by docker-compose.yml to
+# parameterize the self-hosted stack (langfuse-postgres / langfuse-redis /
+# langfuse-clickhouse / langfuse-minio / langfuse-worker / langfuse-web).
+# Each has a dev-only default in docker-compose.yml so a fresh `docker
+# compose up` works without setting any of these. Override BEFORE any
+# shared deployment.
+#
+# Cryptography — regenerate each with `openssl rand -hex 32`:
+LANGFUSE_SALT=
+LANGFUSE_ENCRYPTION_KEY=
+LANGFUSE_NEXTAUTH_SECRET=
+# Data-plane passwords (langfuse-postgres / langfuse-redis / langfuse-clickhouse):
+LANGFUSE_PG_PASSWORD=
+LANGFUSE_REDIS_PASSWORD=
+LANGFUSE_CLICKHOUSE_PASSWORD=
+# MinIO (S3-compatible object storage for Langfuse event blobs):
+LANGFUSE_MINIO_USER=
+LANGFUSE_MINIO_PASSWORD=
+# Initial UI admin password for the auto-bootstrapped Langfuse user
+# `dev@local.test`. Used only on first boot of a fresh langfuse-postgres
+# volume; subsequent boots ignore it. Change BEFORE first boot if exposing
+# the Langfuse UI on a shared network.
+LANGFUSE_INIT_USER_PASSWORD=
+# Sprint 8e — per-stage Redis result cache. Three caches share one Redis DB
+# (logical DB 1, separate from LiteLLM's semantic cache on DB 0):
+#   - Voyage query embedding by query text
+#   - BGE reranker score by (query, chunk_text)
+#   - Grader relevance verdict by (query, chunk_text)
+# All three fail-open (any Redis error logs a warning and falls through to
+# compute), so leaving these unset is safe.
+#
+# IMPORTANT — host-side port is 6380, NOT 6379. Many macOS dev boxes have a
+# brew/launchd Redis already on 6379; the loopback bind wins the race vs
+# docker's wildcard so host-side cache calls would silently land on the
+# wrong Redis. Sprint 8e diagnostic uncovered this. Inside the compose
+# network the container port is still 6379.
+# Result-cache switches. Comments are kept on their own lines on purpose: not
+# every env-file consumer strips trailing "# ..." text from a value (for
+# example `docker run --env-file` takes the value verbatim), which would turn
+# the port or TTL into a non-numeric string.
+#
+# Set RAG_RESULT_CACHE_ENABLED=0 to disable all three layers.
+RAG_RESULT_CACHE_ENABLED=1
+RESULT_CACHE_REDIS_HOST=localhost
+# docker-compose host-mapped port.
+RESULT_CACHE_REDIS_PORT=6380
+RESULT_CACHE_REDIS_DB=1
+# TTL in seconds: 604800 = 7 days.
+RAG_RESULT_CACHE_TTL_SECONDS=604800
+# Application
+ENVIRONMENT=dev
+LOG_LEVEL=INFO
+# CORS (JSON array of allowed origins; use ["*"] only in dev).
+# Sprint 9 frontend defaults: Next.js dev on :3000, Gradio on :7860
+# (keep until Gradio is removed in 9.5).
+CORS_ORIGINS=["http://localhost:3000","http://localhost:7860"]
+# --- Local model placement (Apple Silicon stability tuning) ---
+# These settings control where transformer models load when running on Apple
+# Silicon. Defaults are tuned for *long-running* eval workloads where MPS pool
+# pressure causes OOM after ~50 inferences. Production single-query traffic
+# can override to MPS for ~3x speedup on the BGE reranker if desired.
+#
+# RERANKER_DEVICE=cpu|mps|cuda  (default: cpu)
+#   BGE cross-encoder reranker placement. CPU is ~100-200ms slower per 8-chunk
+#   batch but eliminates Apple-Silicon unified-memory OOM. Override to mps for
+#   production speed.
+RERANKER_DEVICE=cpu
+#
+# RERANKER_ADAPTER_PATH=data/models/reranker_ft_v1  (default: unset → stock BGE)
+#   Load the Sprint 7.9 LoRA fine-tuned adapter on top of BAAI/bge-reranker-v2-m3.
+#   When unset, src/services/reranker_service.py:92 silently falls back to the
+#   stock CrossEncoder — same code path, but without the multi-hop slice gains
+#   measured in Sprint 7.9 Day 7 (4/13 → 11/13). The Sprint 7.19 audit caught
+#   this exact gap: the env var was added at Sprint 7.9 but never propagated
+#   into .env or .env.example, so the FT v1 adapter was silently inactive for
+#   ~5 sprints. Always set this for production / eval runs.
+RERANKER_ADAPTER_PATH=data/models/reranker_ft_v1
+#
+# RESULT_CACHE_REDIS_HOST=localhost (default: localhost)
+# RESULT_CACHE_REDIS_PORT=6380      (default: 6379 — WRONG for host-side scripts)
+#   docker-compose maps the rag-cache redis container's internal port 6379 to
+#   host port 6380 (line 'ports: ["6380:6379"]'). Host-side scripts (eval
+#   harness, smoke tests) must use port 6380; without it, every cache call
+#   raises ConnectionRefused, retrieval starts hitting the no-filter FALLBACK
+#   path, and the grader's entity_match rejects all chunks. The full pipeline
+#   stays "working" but regresses headline pass-rate to near zero. Sprint 7.18a
+#   spent 2 hours diagnosing this. From inside docker-compose (the FastAPI
+#   container on the rag-cache network), use the service hostname `redis:6379`
+#   instead.
+#
+# NOTE: the two assignments below repeat RESULT_CACHE_REDIS_HOST and
+# RESULT_CACHE_REDIS_PORT, which are already assigned earlier in this file
+# (per-stage Redis result cache section). Keep both occurrences in sync when
+# changing either value; which occurrence wins when they differ depends on
+# the env-file loader (presumably the last one — confirm for your tooling).
+RESULT_CACHE_REDIS_HOST=localhost
+RESULT_CACHE_REDIS_PORT=6380
+#
+#
+# LLM_GUARD_USE_ONNX=1|0  (default: 1)
+#   LLM Guard PromptInjection backend. ONNX runs on CPU with ~10x lower memory
+#   footprint than the PyTorch+MPS backend. Same model weights, same accuracy.
+LLM_GUARD_USE_ONNX=1
+#
+# RAG_DISABLE_LLM_GUARD=1|0  (default: 0)
+#   Hard-disable LLM Guard (Layer 2 of injection check) entirely. Layers 1
+#   (regex) and 3 (LLM classifier) still run. Useful for non-adversarial
+#   workloads (e.g. FinanceBench eval) where Layer 2 adds no signal but
+#   consumes memory.
+RAG_DISABLE_LLM_GUARD=0

financebench_rag_agent-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,164 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.egg-info/
+dist/
+build/
+*.egg
+# Virtual environments
+.venv/
+venv/
+env/
+# Environment variables
+.env
+.env.bak
+.env.local
+.env.production
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# macOS
+.DS_Store
+.AppleDouble
+.LSOverride
+# Data (raw PDFs can be large)
+data/raw/
+# Raw SEC HTMLs (downloaded; regenerable via scripts/download_sec_filings.py).
+# Redundant with the data/raw/ rule above, which already ignores this subdirectory.
+data/raw/sec/
+# Qdrant local storage
+qdrant_data/
+# Postgres local data
+pg_data/
+# Sprint 7.17 grader LoRA-FT v1 exploratory artifacts (Signal 12 failure — base
+# model 45× below the validated capacity floor). Kept locally for ablation
+# reproducibility; not tracked because the experiment is documented in
+# docs/engineering-log.md Sprint 7.17 and the adapter outputs aren't reusable.
+data/models/grader_ft_v1_hard_r8/
+data/models/grader_ft_v1_mixed_r8/
+data/models/grader_ft_v1_random_r8/
+data/training/grader_ft_v1/
+# Evaluation results (regenerated)
+tests/evaluation/eval_results/*.json
+# ...except committed baselines, sprint milestone snapshots, and final results
+!tests/evaluation/eval_results/baseline_*.json
+!tests/evaluation/eval_results/after_sprint*.json
+!tests/evaluation/eval_results/final_*.json
+# Canonical milestone results under the calibrated Sonnet 4.6 + v2 judge (κ=0.932)
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_v1_grader.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_v1_grader.correctness.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_v1_grader.rejudged_sonnet_v2.correctness.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_v1_grader.rejudged_sonnet_v2.diff.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_4fix_plus_fix2.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_4fix_plus_fix2.correctness.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_4fix_plus_fix2.rejudged_sonnet_v2.correctness.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_4fix_plus_fix2.rejudged_sonnet_v2.diff.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_4fix_plus_fix2.ragas.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_4fix_plus_fix2.deepeval.json
+# Sprint 7.16 milestone (gen-v2 = anti-refusal nudge + enumerate-fully clause 8)
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_gen_v2.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_gen_v2.correctness.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_gen_v2.rejudged_sonnet_v2.correctness.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_gen_v2.rejudged_sonnet_v2.diff.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_gen_v2.ragas.json
+!tests/evaluation/eval_results/financebench_pypdf_voyage_tiered_ft_litellm_gen_v2.deepeval.json
+# Sprint 7.16 validation artifacts
+!tests/evaluation/eval_results/validation_antirefusal_nudge.json
+!tests/evaluation/eval_results/validation_enumerate_fully.json
+!tests/evaluation/eval_results/validation_directional_clause9.json
+# Sprint 7.16 diagnostic artifacts (audit refresh + retrieval/reranker attribution + grader-on-gold)
+!tests/evaluation/eval_results/audit_failed_qs_gen_v2.json
+!tests/evaluation/eval_results/diag2_retrieval_reranker_attribution.json
+!tests/evaluation/eval_results/diag3_grader_on_gold_chunks.json
+# Sprint 7.17 grader-architecture experiment results
+!tests/evaluation/eval_results/grader_ft_v1_component_eval.json
+!tests/evaluation/eval_results/grader_models_compare.json
+!tests/evaluation/eval_results/grader_haiku_max2048.json
+# Sprint 7.17 follow-up: Caveat B falsification + Llama 3.3 70B via Fireworks
+!tests/evaluation/eval_results/grader_haiku_system_split.json
+!tests/evaluation/eval_results/grader_fireworks_llama33_70b.json
+# Sprint 7.17 follow-up #3: full 150-Q FinanceBench eval w/ Llama-3.3-70B grader (OpenRouter)
+!tests/evaluation/eval_results/financebench_openrouter_llama_grader_v1.json
+!tests/evaluation/eval_results/financebench_openrouter_llama_grader_v1.correctness.json
+!tests/evaluation/eval_results/financebench_openrouter_llama_grader_v1.rejudged_sonnet_v2.correctness.json
+!tests/evaluation/eval_results/financebench_openrouter_llama_grader_v1.rejudged_sonnet_v2.diff.json
+# Sprint 7.18a: RETRIEVAL_TOP_K=50→200 to recover RETRIEVAL_MISS bucket
+!tests/evaluation/eval_results/financebench_retrieval_topk_200_v1.json
+!tests/evaluation/eval_results/financebench_retrieval_topk_200_v1.correctness.json
+!tests/evaluation/eval_results/financebench_retrieval_topk_200_v1.rejudged_sonnet_v2.correctness.json
+!tests/evaluation/eval_results/financebench_retrieval_topk_200_v1.rejudged_sonnet_v2.diff.json
+# Sprint 7.19 Step 0: FT v1 reranker re-enabled (RERANKER_ADAPTER_PATH bug fix)
+!tests/evaluation/eval_results/financebench_ft_v1_reranker_active_v1.json
+!tests/evaluation/eval_results/financebench_ft_v1_reranker_active_v1.correctness.json
+!tests/evaluation/eval_results/financebench_ft_v1_reranker_active_v1.rejudged_sonnet_v2.correctness.json
+!tests/evaluation/eval_results/financebench_ft_v1_reranker_active_v1.rejudged_sonnet_v2.diff.json
+# Methodology artifacts referenced in docs/engineering-log.md
+!tests/evaluation/eval_results/audit_failed_qs_v1_grader.json
+!tests/evaluation/eval_results/audit_failed_qs_4fix.json
+!tests/evaluation/eval_results/hallu_model_ablation.json
+!tests/evaluation/eval_results/slice_analysis_4fix_plus_fix2.json
+!tests/evaluation/eval_results/pipeline_diagnostic_results.json
+# ...but re-ignore pipeline caches (they're intermediate, regenerated on each run)
+tests/evaluation/eval_results/*.pipeline.json
+# Jupyter
+.ipynb_checkpoints/
+# Log files (Docker and any other tooling that writes *.log)
+*.log
+# Coverage
+htmlcov/
+.coverage
+.coverage.*
+# Runtime logs (from restart.sh)
+logs/
+# Claude Code local tool state
+.claude/
+CLAUDE.md
+# Internal planning / session notes — kept locally for context, not in public repo.
+# The polished public-facing version of this material lives in docs/engineering-log.md.
+SESSION_HANDOFF.md
+IMPLEMENTATION_PLAN.md
+IMPROVEMENT_PLAN.md
+PROJECT_MASTER_DOCUMENT.md
+DEPLOYMENT_PLAN.md
+COMMANDS.txt
+# CampusX course materials (reference tutorials, not part of project)
+_course_materials/
+# Personal scratch
+Untitled.txt
+# Cost tracker — internal LLM spend audit trail, kept locally only
+cost_logs/
+# Sprint 7.19 Step 1: FT v2 reranker full FinanceBench eval (the actual gate)
+!tests/evaluation/eval_results/financebench_ft_v2_reranker_active_v1.json
+!tests/evaluation/eval_results/financebench_ft_v2_reranker_active_v1.correctness.json
+!tests/evaluation/eval_results/financebench_ft_v2_reranker_active_v1.rejudged_sonnet_v2.correctness.json
+!tests/evaluation/eval_results/financebench_ft_v2_reranker_active_v1.rejudged_sonnet_v2.diff.json
+!tests/evaluation/eval_results/eval_reranker_stratified_v2.json
+# Sprint 7.19 Step 1 partial-adapter backups (kept locally for ablation, not pushed)
+data/models/reranker_ft_v2_local_partial/
+data/models/reranker_ft_v2_partial_epoch1/
+# Publishing assets (mockups, reference images, drafts) — kept local
+publish-assets/

financebench_rag_agent-0.1.0/Dockerfile ADDED Viewed

@@ -0,0 +1,44 @@
+# === Build stage ===
+# Throwaway stage that has a compiler toolchain available and runs
+# `pip install .`. Nothing from this stage ships directly; a later stage
+# copies site-packages and /usr/local/bin out of it via `COPY --from=builder`.
+FROM python:3.12-slim AS builder
+WORKDIR /app
+# Install build-essential (no recommended extras) and drop the apt package
+# lists in the same layer so they do not persist in the image.
+# Presumably needed for dependencies that compile native extensions at
+# install time — confirm which ones before removing.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    && rm -rf /var/lib/apt/lists/*
+# NOTE(review): only pyproject.toml and README.md are copied in; src/ is not
+# present when `pip install .` runs. This looks like a dependency-install
+# step that relies on the build backend tolerating the missing package
+# directory — confirm against pyproject.toml's build configuration.
+COPY pyproject.toml README.md ./
+# --no-cache-dir keeps pip's download cache out of the layer.
+RUN pip install --no-cache-dir .
+# === Runtime stage ===
+# Final image: a fresh slim base that receives the builder's installed
+# packages plus the application code. No apt packages are installed here.
+FROM python:3.12-slim
+LABEL maintainer="Rishabh" \
+      description="Enterprise RAG Agent API" \
+      version="0.1.0"
+WORKDIR /app
+# Create non-root user (fixed UID/GID 1000) with a home directory
+RUN groupadd --gid 1000 appuser && \
+    useradd --uid 1000 --gid appuser --shell /bin/bash --create-home appuser
+# Copy installed packages from builder.
+# The python3.12 path segment must match the Python minor version of the base
+# image used in both stages. /usr/local/bin is copied as well, presumably to
+# carry console scripts such as the uvicorn launcher used by CMD below.
+COPY --from=builder /usr/local/lib/python3.12/site-packages /usr/local/lib/python3.12/site-packages
+COPY --from=builder /usr/local/bin /usr/local/bin
+# Copy application code (relative destinations resolve under WORKDIR /app)
+COPY src/ src/
+COPY scripts/ scripts/
+COPY data/sample/ data/sample/
+# Change ownership and switch to non-root.
+# The COPY instructions above ran as root, so /app is handed to appuser
+# recursively before dropping privileges.
+RUN chown -R appuser:appuser /app
+USER appuser
+EXPOSE 8000
+# Health probe: request /health on port 8000 with Python's urllib instead of
+# an external HTTP client. urlopen raises on connection failures and HTTP
+# error statuses, so the probe process exits non-zero; `|| exit 1` pins the
+# failure exit code to 1.
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1
+# Serve the ASGI app on all interfaces, on the same port as EXPOSE and the
+# health probe.
+CMD ["uvicorn", "src.api.main:app", "--host", "0.0.0.0", "--port", "8000"]

financebench_rag_agent-0.1.0/Makefile ADDED Viewed

@@ -0,0 +1,92 @@
+# Every target in this file names a command rather than a file it produces,
+# so each one is declared phony. Declarations are grouped to mirror the
+# sections below; multiple .PHONY lines accumulate.
+.PHONY: run dev frontend
+.PHONY: test test-unit test-integration eval lint format
+.PHONY: ingest seed-db jwt
+.PHONY: docker-up docker-down docker-all
+.PHONY: docker-build docker-prod docker-logs docker-ps docker-restart
+.PHONY: migrate migrate-down migrate-create migrate-current
+.PHONY: check clean
+# --- Development ---
+# Auto-reloading API server command shared by `run` and `dev`.
+dev_server := uvicorn src.api.main:app --reload --port 8000
+# Start the API server only.
+run:
+	$(dev_server)
+# Start the backing containers (docker-up) first, then the API server.
+dev: docker-up
+	$(dev_server)
+# Launch the Gradio frontend module.
+frontend:
+	python -m src.frontend.gradio_app
+# --- Testing ---
+# Full suite (unit + integration) in one pytest session. It carries the same
+# per-test --timeout=120 as `test-integration`, so a hung integration test
+# fails the run instead of blocking `make test` indefinitely.
+test:
+	pytest tests/unit/ tests/integration/ -v --timeout=120
+# Unit tests only.
+test-unit:
+	pytest tests/unit/ -v
+# Integration tests only, with a 120-second per-test timeout.
+test-integration:
+	pytest tests/integration/ -v --timeout=120
+# Run the evaluation script and write its output to eval_results/latest.json.
+eval:
+	python tests/evaluation/run_evaluation.py --output tests/evaluation/eval_results/latest.json
+# Read-only checks: report lint violations and formatting drift without
+# modifying files.
+lint:
+	ruff check src/ tests/
+	ruff format --check src/ tests/
+# Mutating counterpart of `lint`: apply auto-fixes, then reformat in place.
+format:
+	ruff check --fix src/ tests/
+	ruff format src/ tests/
+# --- Data ---
+# Overridable knobs; the defaults reproduce the previously hard-coded values.
+#   make ingest INGEST_INPUT=data/other/ INGEST_COLLECTION=my_docs
+#   make jwt JWT_ROLE=admin JWT_USER_ID=alice
+INGEST_INPUT ?= data/raw/
+INGEST_COLLECTION ?= financial_docs
+JWT_ROLE ?= finance
+JWT_USER_ID ?= test_user
+# Ingest documents from INGEST_INPUT into the INGEST_COLLECTION collection.
+ingest:
+	python scripts/ingest_documents.py --input "$(INGEST_INPUT)" --collection "$(INGEST_COLLECTION)"
+# Run the Qdrant seeding script with its --sample flag.
+seed-db:
+	python scripts/seed_qdrant.py --sample
+# Generate a JWT for local testing with the given role and user id.
+jwt:
+	python scripts/generate_jwt.py --role "$(JWT_ROLE)" --user-id "$(JWT_USER_ID)"
+# --- Docker ---
+# Common prefix for every compose invocation below.
+compose := docker compose
+# Start only the qdrant and postgres services, detached.
+docker-up:
+	$(compose) up -d qdrant postgres
+# Stop and remove the compose stack.
+docker-down:
+	$(compose) down
+# Build images and run the whole stack in the foreground.
+docker-all:
+	$(compose) up --build
+# --- Production ---
+# Build images without starting anything.
+docker-build:
+	$(compose) build
+# Build images and run the whole stack detached.
+docker-prod:
+	$(compose) up -d --build
+# Follow logs from the compose services.
+docker-logs:
+	$(compose) logs -f
+# List the compose services and their state.
+docker-ps:
+	$(compose) ps
+# Restart only the api and frontend services.
+docker-restart:
+	$(compose) restart api frontend
+# --- Migrations (Sprint 9.0: alembic for the roles table; will grow) ---
+# Apply all pending migrations.
+migrate:
+	alembic upgrade head
+# Roll back the most recent migration.
+migrate-down:
+	alembic downgrade -1
+# Show the revision the database is currently at.
+migrate-current:
+	alembic current
+# Usage: make migrate-create m="add foo table"
+# Aborts with a usage message when m is missing or blank, instead of invoking
+# alembic with an empty revision message. The guard expands to nothing when m
+# is set, so the recipe then consists of the alembic line only.
+migrate-create:
+	$(if $(strip $(m)),,$(error m is required. Usage: make migrate-create m="add foo table"))
+	alembic revision -m "$(m)"
+# --- Checks ---
+# Local gate: both `lint` and `test-unit` must succeed before the message prints.
+check: lint test-unit
+	@echo "All checks passed"
+# --- Cleanup ---
+# Best-effort removal of Python bytecode and test/coverage artifacts. Errors
+# from the two find passes are silenced and ignored so `clean` never fails on
+# them.
+clean:
+	find . -name "*.pyc" -type f -delete 2>/dev/null || true
+	find . -name "__pycache__" -type d -exec rm -rf {} + 2>/dev/null || true
+	rm -rf .coverage htmlcov/ .pytest_cache/