rnsr-0.1.0-py3-none-any.whl
- rnsr/__init__.py +118 -0
- rnsr/__main__.py +242 -0
- rnsr/agent/__init__.py +218 -0
- rnsr/agent/cross_doc_navigator.py +767 -0
- rnsr/agent/graph.py +1557 -0
- rnsr/agent/llm_cache.py +575 -0
- rnsr/agent/navigator_api.py +497 -0
- rnsr/agent/provenance.py +772 -0
- rnsr/agent/query_clarifier.py +617 -0
- rnsr/agent/reasoning_memory.py +736 -0
- rnsr/agent/repl_env.py +709 -0
- rnsr/agent/rlm_navigator.py +2108 -0
- rnsr/agent/self_reflection.py +602 -0
- rnsr/agent/variable_store.py +308 -0
- rnsr/benchmarks/__init__.py +118 -0
- rnsr/benchmarks/comprehensive_benchmark.py +733 -0
- rnsr/benchmarks/evaluation_suite.py +1210 -0
- rnsr/benchmarks/finance_bench.py +147 -0
- rnsr/benchmarks/pdf_merger.py +178 -0
- rnsr/benchmarks/performance.py +321 -0
- rnsr/benchmarks/quality.py +321 -0
- rnsr/benchmarks/runner.py +298 -0
- rnsr/benchmarks/standard_benchmarks.py +995 -0
- rnsr/client.py +560 -0
- rnsr/document_store.py +394 -0
- rnsr/exceptions.py +74 -0
- rnsr/extraction/__init__.py +172 -0
- rnsr/extraction/candidate_extractor.py +357 -0
- rnsr/extraction/entity_extractor.py +581 -0
- rnsr/extraction/entity_linker.py +825 -0
- rnsr/extraction/grounded_extractor.py +722 -0
- rnsr/extraction/learned_types.py +599 -0
- rnsr/extraction/models.py +232 -0
- rnsr/extraction/relationship_extractor.py +600 -0
- rnsr/extraction/relationship_patterns.py +511 -0
- rnsr/extraction/relationship_validator.py +392 -0
- rnsr/extraction/rlm_extractor.py +589 -0
- rnsr/extraction/rlm_unified_extractor.py +990 -0
- rnsr/extraction/tot_validator.py +610 -0
- rnsr/extraction/unified_extractor.py +342 -0
- rnsr/indexing/__init__.py +60 -0
- rnsr/indexing/knowledge_graph.py +1128 -0
- rnsr/indexing/kv_store.py +313 -0
- rnsr/indexing/persistence.py +323 -0
- rnsr/indexing/semantic_retriever.py +237 -0
- rnsr/indexing/semantic_search.py +320 -0
- rnsr/indexing/skeleton_index.py +395 -0
- rnsr/ingestion/__init__.py +161 -0
- rnsr/ingestion/chart_parser.py +569 -0
- rnsr/ingestion/document_boundary.py +662 -0
- rnsr/ingestion/font_histogram.py +334 -0
- rnsr/ingestion/header_classifier.py +595 -0
- rnsr/ingestion/hierarchical_cluster.py +515 -0
- rnsr/ingestion/layout_detector.py +356 -0
- rnsr/ingestion/layout_model.py +379 -0
- rnsr/ingestion/ocr_fallback.py +177 -0
- rnsr/ingestion/pipeline.py +936 -0
- rnsr/ingestion/semantic_fallback.py +417 -0
- rnsr/ingestion/table_parser.py +799 -0
- rnsr/ingestion/text_builder.py +460 -0
- rnsr/ingestion/tree_builder.py +402 -0
- rnsr/ingestion/vision_retrieval.py +965 -0
- rnsr/ingestion/xy_cut.py +555 -0
- rnsr/llm.py +733 -0
- rnsr/models.py +167 -0
- rnsr/py.typed +2 -0
- rnsr-0.1.0.dist-info/METADATA +592 -0
- rnsr-0.1.0.dist-info/RECORD +72 -0
- rnsr-0.1.0.dist-info/WHEEL +5 -0
- rnsr-0.1.0.dist-info/entry_points.txt +2 -0
- rnsr-0.1.0.dist-info/licenses/LICENSE +21 -0
- rnsr-0.1.0.dist-info/top_level.txt +1 -0
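The manifest above covers the full wheel contents. As a quick post-install sanity check (the `pip install rnsr` spelling is assumed from the distribution name; the target registry is not identified here), the package exposes a version string at the top level:

    # Hypothetical post-install check; the distribution name is taken from the manifest above.
    #   pip install rnsr
    import rnsr
    print(rnsr.__version__)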
rnsr/__init__.py
ADDED
@@ -0,0 +1,118 @@
"""
RNSR - Recursive Neural-Symbolic Retriever

State-of-the-art document retrieval system combining:
- PageIndex: Vectorless, reasoning-based tree search
- RLMs: REPL environment with recursive sub-LLM calls
- Vision: OCR-free image-based document analysis

This is the hybrid recursive visual-symbolic retriever that achieves
superior performance on complex document understanding tasks.

Key Features:
- Font Histogram Algorithm (NOT vision models for structure)
- Recursive XY-Cut (Visual-geometric segmentation)
- Hierarchical Clustering (Multi-resolution topics)
- Skeleton Index pattern (summaries + KV store)
- Pointer-based Variable Stitching (prevents context pollution)
- Pre-LLM Filtering (keyword/regex before expensive ToT)
- Deep Recursive Sub-LLM Calls (configurable depth)
- Answer Verification (sub-LLM validation)
- Vision-based Retrieval (OCR-free page image analysis)
- Hybrid Text+Vision Mode (best of both worlds)
- Multi-provider LLM support (OpenAI, Anthropic, Gemini)

Usage:
    from rnsr import RNSRClient

    # Simple one-line Q&A
    client = RNSRClient()
    answer = client.ask("contract.pdf", "What are the payment terms?")

    # Advanced RLM navigation with full features
    result = client.ask_advanced(
        "complex_report.pdf",
        "Compare liability clauses in sections 5 and 8",
        enable_verification=True,
        max_recursion_depth=3,
    )

    # Vision-based analysis (for scanned docs, charts)
    result = client.ask_vision(
        "scanned_document.pdf",
        "What does the revenue chart show?",
    )

    # Low-level API
    from rnsr import ingest_document, build_skeleton_index, run_rlm_navigator

    result = ingest_document("contract.pdf")
    skeleton, kv_store = build_skeleton_index(result.tree)
    answer = run_rlm_navigator("What are the terms?", skeleton, kv_store)

LLM Provider Configuration:
    Set one of these environment variables:
    - GOOGLE_API_KEY (Gemini)
    - OPENAI_API_KEY (OpenAI)
    - ANTHROPIC_API_KEY (Anthropic)
"""

__version__ = "0.2.0"  # Major update with RLM + Vision

# Re-export main entry points
from rnsr.ingestion import ingest_document, IngestionResult
from rnsr.ingestion.pipeline import ingest_document_enhanced
from rnsr.indexing import build_skeleton_index, SQLiteKVStore, InMemoryKVStore
from rnsr.indexing import save_index, load_index, get_index_info, list_indexes
from rnsr.agent import (
    run_navigator,
    VariableStore,
    # RLM Navigator (State-of-the-Art)
    RLMNavigator,
    RLMConfig,
    run_rlm_navigator,
    create_rlm_navigator,
    PreFilterEngine,
    RecursiveSubLLMEngine,
    AnswerVerificationEngine,
)
from rnsr.document_store import DocumentStore
from rnsr.client import RNSRClient
from rnsr.llm import get_llm, get_embed_model, LLMProvider

__all__ = [
    # Version
    "__version__",
    # High-Level Client (Simplest API)
    "RNSRClient",
    # Ingestion
    "ingest_document",
    "ingest_document_enhanced",
    "IngestionResult",
    # Indexing
    "build_skeleton_index",
    "SQLiteKVStore",
    "InMemoryKVStore",
    # Persistence
    "save_index",
    "load_index",
    "get_index_info",
    "list_indexes",
    # Document Store
    "DocumentStore",
    # Standard Navigator
    "run_navigator",
    "VariableStore",
    # RLM Navigator (State-of-the-Art)
    "RLMNavigator",
    "RLMConfig",
    "run_rlm_navigator",
    "create_rlm_navigator",
    "PreFilterEngine",
    "RecursiveSubLLMEngine",
    "AnswerVerificationEngine",
    # LLM
    "get_llm",
    "get_embed_model",
    "LLMProvider",
]
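For reference, the re-exports above give a quick-start that matches the module docstring. A minimal sketch, restating only the documented calls (no extra keyword arguments are assumed beyond those shown in the docstring):

    import os

    from rnsr import RNSRClient

    # Provider selection via environment variable, as documented above
    # (GOOGLE_API_KEY, OPENAI_API_KEY, or ANTHROPIC_API_KEY).
    os.environ.setdefault("OPENAI_API_KEY", "<your key>")

    client = RNSRClient()

    # Simple one-line Q&A.
    answer = client.ask("contract.pdf", "What are the payment terms?")

    # Advanced RLM navigation with verification and deeper recursion.
    result = client.ask_advanced(
        "complex_report.pdf",
        "Compare liability clauses in sections 5 and 8",
        enable_verification=True,
        max_recursion_depth=3,
    )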
rnsr/__main__.py
ADDED
@@ -0,0 +1,242 @@
"""
RNSR CLI - Command Line Interface

Usage:
    python -m rnsr ingest document.pdf
    python -m rnsr query document.pdf "What are the payment terms?"
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path

import structlog

structlog.configure(
    processors=[
        structlog.stdlib.add_log_level,
        structlog.dev.ConsoleRenderer(),
    ]
)

logger = structlog.get_logger(__name__)


def cmd_ingest(args):
    """Ingest a PDF document."""
    from rnsr.ingestion import ingest_document

    pdf_path = Path(args.file)
    if not pdf_path.exists():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)

    print(f"Ingesting: {pdf_path}")
    result = ingest_document(pdf_path)

    print(f"\n✓ Ingestion complete!")
    print(f"  Tier used: {result.tier_used} ({result.method})")
    print(f"  Total nodes: {result.tree.total_nodes}")

    if result.warnings:
        print(f"\nWarnings:")
        for w in result.warnings:
            print(f"  - {w}")

    if args.output:
        output_path = Path(args.output)
        with open(output_path, "w") as f:
            json.dump(result.tree.model_dump(), f, indent=2)
        print(f"\nTree saved to: {output_path}")

    return result


def cmd_index(args):
    """Build skeleton index from ingested document."""
    from rnsr.indexing import SQLiteKVStore, build_skeleton_index
    from rnsr.ingestion import ingest_document

    pdf_path = Path(args.file)
    if not pdf_path.exists():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)

    # Ingest first
    print(f"Ingesting: {pdf_path}")
    result = ingest_document(pdf_path)

    # Build index
    db_path = args.db or f"{pdf_path.stem}_index.db"
    kv_store = SQLiteKVStore(db_path)
    skeleton, _ = build_skeleton_index(result.tree, kv_store)

    print(f"\n✓ Index built!")
    print(f"  Skeleton nodes: {len(skeleton)}")
    print(f"  KV entries: {kv_store.count()}")
    print(f"  Database: {db_path}")

    return skeleton, kv_store


def cmd_query(args):
    """Query a document."""
    from rnsr.agent import run_navigator
    from rnsr.indexing import SQLiteKVStore, build_skeleton_index
    from rnsr.ingestion import ingest_document

    pdf_path = Path(args.file)
    if not pdf_path.exists():
        print(f"Error: File not found: {pdf_path}")
        sys.exit(1)

    # Ingest
    print(f"Ingesting: {pdf_path}")
    result = ingest_document(pdf_path)

    # Build index
    skeleton, kv_store = build_skeleton_index(result.tree)

    # Run query
    print(f"\nQuery: {args.query}")
    print("-" * 40)

    answer = run_navigator(
        question=args.query,
        skeleton=skeleton,
        kv_store=kv_store,
        max_iterations=args.max_iter,
    )

    print(f"\nAnswer:")
    print(answer["answer"])
    print(f"\nConfidence: {answer['confidence']:.2f}")
    print(f"Nodes visited: {len(answer['nodes_visited'])}")
    print(f"Variables used: {len(answer['variables_used'])}")

    if args.trace:
        print(f"\nTrace:")
        for entry in answer["trace"]:
            print(f"  [{entry['node_type']}] {entry['action']}")


def cmd_benchmark(args):
    """Run benchmarks on the RNSR system."""
    from .benchmarks import BenchmarkRunner, BenchmarkConfig

    # Check files are provided
    if not args.config and not args.files:
        print("❌ Error: Provide --files or --config for benchmarking")
        return

    # Load config if provided
    if args.config:
        config = BenchmarkConfig.from_json(args.config)
    else:
        config = BenchmarkConfig(
            pdf_paths=[Path(f) for f in (args.files or [])],
            iterations=args.iterations,
            compute_quality=args.quality or args.all,
        )

    print("=" * 60)
    print("RNSR Benchmark Suite")
    print("=" * 60)
    print(f"Files: {len(config.pdf_paths)}")
    print(f"Iterations: {config.iterations}")

    # Run benchmarks
    runner = BenchmarkRunner(config)
    report = runner.run()

    # Print summary
    report.print_summary()

    # Save results
    output_dir = args.output or "benchmark_results"
    output_path = Path(output_dir)
    report_file = output_path / f"benchmark_report_{report.timestamp.replace(':', '-')}.json"
    report.to_json(report_file)

    print(f"\n📄 Report saved to: {report_file}")


def main():
    parser = argparse.ArgumentParser(
        description="RNSR - Recursive Neural-Symbolic Retriever"
    )
    subparsers = parser.add_subparsers(dest="command", help="Commands")

    # Ingest command
    ingest_parser = subparsers.add_parser("ingest", help="Ingest a PDF document")
    ingest_parser.add_argument("file", help="Path to PDF file")
    ingest_parser.add_argument("-o", "--output", help="Output JSON file for tree")

    # Index command
    index_parser = subparsers.add_parser("index", help="Build skeleton index")
    index_parser.add_argument("file", help="Path to PDF file")
    index_parser.add_argument("--db", help="SQLite database path")

    # Query command
    query_parser = subparsers.add_parser("query", help="Query a document")
    query_parser.add_argument("file", help="Path to PDF file")
    query_parser.add_argument("query", help="Question to ask")
    query_parser.add_argument("--max-iter", type=int, default=20, help="Max iterations")
    query_parser.add_argument("--trace", action="store_true", help="Show trace")

    # Benchmark command
    bench_parser = subparsers.add_parser("benchmark", help="Run benchmarks")
    bench_parser.add_argument(
        "--config", "-c",
        help="Path to benchmark config JSON file"
    )
    bench_parser.add_argument(
        "--files", "-f",
        nargs="+",
        help="PDF files to benchmark"
    )
    bench_parser.add_argument(
        "--iterations", "-n",
        type=int,
        default=3,
        help="Number of iterations per benchmark (default: 3)"
    )
    bench_parser.add_argument(
        "--output", "-o",
        help="Output directory for results"
    )
    bench_parser.add_argument(
        "--performance", "-p",
        action="store_true",
        help="Run performance benchmarks"
    )
    bench_parser.add_argument(
        "--quality", "-q",
        action="store_true",
        help="Run quality benchmarks"
    )
    bench_parser.add_argument(
        "--all", "-a",
        action="store_true",
        help="Run all benchmarks"
    )

    args = parser.parse_args()

    if args.command == "ingest":
        cmd_ingest(args)
    elif args.command == "index":
        cmd_index(args)
    elif args.command == "query":
        cmd_query(args)
    elif args.command == "benchmark":
        cmd_benchmark(args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
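The benchmark subcommand above can also be driven programmatically. The sketch below simply mirrors what cmd_benchmark does with the --files path; BenchmarkConfig fields beyond the three used there are not shown in this diff, so none are assumed:

    from pathlib import Path

    from rnsr.benchmarks import BenchmarkRunner, BenchmarkConfig

    # Mirrors cmd_benchmark: build a config from explicit PDF paths and run the suite.
    config = BenchmarkConfig(
        pdf_paths=[Path("contract.pdf"), Path("complex_report.pdf")],
        iterations=3,
        compute_quality=True,
    )
    runner = BenchmarkRunner(config)
    report = runner.run()
    report.print_summary()

The equivalent CLI call, per the argparse definitions above, would be python -m rnsr benchmark --files contract.pdf complex_report.pdf --quality.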
rnsr/agent/__init__.py
ADDED
@@ -0,0 +1,218 @@
"""
Agent Module - Recursive Navigator with Full RLM Support

Implements the state-of-the-art hybrid retrieval system combining:
- PageIndex: Vectorless, reasoning-based tree search
- RLMs: REPL environment with recursive sub-LLM calls
- RNSR: Latent hierarchy reconstruction + variable stitching

Key Features:
1. RLM Navigator - Full recursive language model with pre-filtering
2. REPLEnvironment - Python REPL with DOC_VAR and code execution
3. Variable Store - Pointer-based stitching to prevent context pollution
4. Tree of Thoughts (ToT) - LLM-based navigation decisions
5. Pre-filtering - Keyword/regex filtering before LLM calls
6. Deep Recursion - Multi-level recursive sub-LLM calls
7. Answer Verification - Sub-LLM validation of answers
8. Async Processing - Parallel sub-LLM execution

Enhanced Features (New):
9. Provenance System - Traceable citations for every answer
10. LLM Cache - Semantic-aware caching for performance
11. Self-Reflection - Iterative self-correction loop
12. Reasoning Memory - Learn from successful query chains
13. Query Clarification - Handle ambiguous queries

Inspired by:
- PageIndex (VectifyAI): https://github.com/VectifyAI/PageIndex
- Recursive Language Models: https://arxiv.org/html/2512.24601v1
"""

from rnsr.agent.graph import (
    AgentState,
    build_navigator_graph,
    create_initial_state,
    create_navigator_tools,
    run_navigator,
    # Tree of Thoughts (Section 7.2)
    evaluate_children_with_tot,
    backtrack_to_parent,
    TOT_SYSTEM_PROMPT,
    # RLM Recursive Execution (Section 2.2)
    execute_sub_task_with_llm,
    batch_execute_sub_tasks,
    process_pending_questions,
    DECOMPOSITION_PROMPT,
)
from rnsr.agent.variable_store import VariableStore, generate_pointer_name
from rnsr.agent.navigator_api import (
    NavigatorAPI,
    create_navigator,
    execute_rap_query,
)
from rnsr.agent.repl_env import (
    REPLEnvironment,
    create_repl_environment,
    RLM_SYSTEM_PROMPT,
    batch_process_async,
)
from rnsr.agent.rlm_navigator import (
    RLMNavigator,
    RLMConfig,
    RLMAgentState,
    PreFilterEngine,
    RecursiveSubLLMEngine,
    AnswerVerificationEngine,
    EntityAwareDecomposer,
    create_rlm_navigator,
    run_rlm_navigator,
    # Adaptive Learning
    LearnedStopWords,
    LearnedQueryPatterns,
    get_learned_stop_words,
    get_learned_query_patterns,
)
from rnsr.agent.cross_doc_navigator import (
    CrossDocNavigator,
    CrossDocQuery,
    CrossDocAnswer,
    DocumentResult,
    create_cross_doc_navigator,
)

# New Enhancement Modules
from rnsr.agent.provenance import (
    ProvenanceTracker,
    ProvenanceRecord,
    Citation,
    Contradiction,
    CitationStrength,
    create_citation,
    format_citations_for_display,
)
from rnsr.agent.llm_cache import (
    LLMCache,
    CachedLLM,
    get_global_cache,
    wrap_llm_with_cache,
)
from rnsr.agent.self_reflection import (
    SelfReflectionEngine,
    ReflectionResult,
    CritiqueResult,
    reflect_on_answer,
)
from rnsr.agent.reasoning_memory import (
    ReasoningChainMemory,
    ReasoningChain,
    ReasoningStep,
    ChainMatch,
    get_reasoning_memory,
    store_reasoning_chain,
    find_similar_chains,
)
from rnsr.agent.query_clarifier import (
    QueryClarifier,
    AmbiguityAnalysis,
    ClarificationRequest,
    ClarificationResult,
    needs_clarification,
    clarify_query,
)

__all__ = [
    # RLM Navigator (State-of-the-Art)
    "RLMNavigator",
    "RLMConfig",
    "RLMAgentState",
    "PreFilterEngine",
    "RecursiveSubLLMEngine",
    "AnswerVerificationEngine",
    "EntityAwareDecomposer",
    "create_rlm_navigator",
    "run_rlm_navigator",

    # Cross-Document Navigator
    "CrossDocNavigator",
    "CrossDocQuery",
    "CrossDocAnswer",
    "DocumentResult",
    "create_cross_doc_navigator",

    # REPL Environment (Section 2.1 - Prompt-as-Environment)
    "REPLEnvironment",
    "create_repl_environment",
    "RLM_SYSTEM_PROMPT",
    "batch_process_async",

    # Navigator API (Section 5.1 Phase III)
    "NavigatorAPI",
    "create_navigator",
    "execute_rap_query",

    # Variable Store
    "VariableStore",
    "generate_pointer_name",

    # Agent Graph
    "AgentState",
    "build_navigator_graph",
    "create_initial_state",
    "create_navigator_tools",
    "run_navigator",

    # Tree of Thoughts (Section 7.2)
    "evaluate_children_with_tot",
    "backtrack_to_parent",
    "TOT_SYSTEM_PROMPT",

    # RLM Recursive Execution (Section 2.2)
    "execute_sub_task_with_llm",
    "batch_execute_sub_tasks",
    "process_pending_questions",
    "DECOMPOSITION_PROMPT",

    # Adaptive Learning
    "LearnedStopWords",
    "LearnedQueryPatterns",
    "get_learned_stop_words",
    "get_learned_query_patterns",

    # Provenance System (NEW)
    "ProvenanceTracker",
    "ProvenanceRecord",
    "Citation",
    "Contradiction",
    "CitationStrength",
    "create_citation",
    "format_citations_for_display",

    # LLM Cache (NEW)
    "LLMCache",
    "CachedLLM",
    "get_global_cache",
    "wrap_llm_with_cache",

    # Self-Reflection (NEW)
    "SelfReflectionEngine",
    "ReflectionResult",
    "CritiqueResult",
    "reflect_on_answer",

    # Reasoning Memory (NEW)
    "ReasoningChainMemory",
    "ReasoningChain",
    "ReasoningStep",
    "ChainMatch",
    "get_reasoning_memory",
    "store_reasoning_chain",
    "find_similar_chains",

    # Query Clarification (NEW)
    "QueryClarifier",
    "AmbiguityAnalysis",
    "ClarificationRequest",
    "ClarificationResult",
    "needs_clarification",
    "clarify_query",
]
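For orientation, the most commonly used exports above are the RLM navigator entry points. The sketch below sticks to the call shape documented in the top-level package docstring, run_rlm_navigator(question, skeleton, kv_store); the RLMConfig fields are not enumerated in this diff, so the example leaves the default configuration in place:

    from rnsr import ingest_document, build_skeleton_index
    from rnsr.agent import run_rlm_navigator

    # Build the document tree and skeleton index, then let the RLM navigator answer.
    result = ingest_document("complex_report.pdf")
    skeleton, kv_store = build_skeleton_index(result.tree)
    answer = run_rlm_navigator(
        "Compare liability clauses in sections 5 and 8",
        skeleton,
        kv_store,
    )
    print(answer)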