polyrag 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. polyrag-0.1.0/LICENSE +21 -0
  2. polyrag-0.1.0/PKG-INFO +380 -0
  3. polyrag-0.1.0/README.md +274 -0
  4. polyrag-0.1.0/app/__init__.py +0 -0
  5. polyrag-0.1.0/app/config/__init__.py +0 -0
  6. polyrag-0.1.0/app/config/llm.py +243 -0
  7. polyrag-0.1.0/app/config/settings.py +39 -0
  8. polyrag-0.1.0/app/core/__init__.py +0 -0
  9. polyrag-0.1.0/app/core/parsers/__init__.py +0 -0
  10. polyrag-0.1.0/app/core/parsers/unified_parser.py +25 -0
  11. polyrag-0.1.0/app/core/services/__init__.py +0 -0
  12. polyrag-0.1.0/app/core/services/structured_data/__init__.py +3 -0
  13. polyrag-0.1.0/app/core/services/structured_data/evaluation_assertions.py +3 -0
  14. polyrag-0.1.0/app/core/services/structured_data/query_service.py +3 -0
  15. polyrag-0.1.0/app/core/services/structured_data/run.py +3 -0
  16. polyrag-0.1.0/app/core/services/structured_data/run_llm_rate_limit_stress.py +3 -0
  17. polyrag-0.1.0/app/core/services/structured_data/run_query.py +2 -0
  18. polyrag-0.1.0/app/utils/__init__.py +0 -0
  19. polyrag-0.1.0/app/utils/ocr.py +7 -0
  20. polyrag-0.1.0/connectors/__init__.py +0 -0
  21. polyrag-0.1.0/connectors/mongodb_connector.py +411 -0
  22. polyrag-0.1.0/connectors/neo4j_connector.py +313 -0
  23. polyrag-0.1.0/connectors/sqs_connector.py +259 -0
  24. polyrag-0.1.0/knowledge_graph/__init__.py +79 -0
  25. polyrag-0.1.0/knowledge_graph/evaluate_queries.py +207 -0
  26. polyrag-0.1.0/knowledge_graph/evaluation_assertions.py +387 -0
  27. polyrag-0.1.0/knowledge_graph/event_resolver.py +475 -0
  28. polyrag-0.1.0/knowledge_graph/extraction_normalizer.py +275 -0
  29. polyrag-0.1.0/knowledge_graph/extractor.py +326 -0
  30. polyrag-0.1.0/knowledge_graph/feature_flags.py +176 -0
  31. polyrag-0.1.0/knowledge_graph/graph_builder.py +421 -0
  32. polyrag-0.1.0/knowledge_graph/neo4j_storage.py +709 -0
  33. polyrag-0.1.0/knowledge_graph/ontology_builder.py +168 -0
  34. polyrag-0.1.0/knowledge_graph/ontology_extractor.py +117 -0
  35. polyrag-0.1.0/knowledge_graph/ontology_linker.py +234 -0
  36. polyrag-0.1.0/knowledge_graph/ontology_prompts.py +87 -0
  37. polyrag-0.1.0/knowledge_graph/ontology_schemas.py +190 -0
  38. polyrag-0.1.0/knowledge_graph/ontology_storage.py +287 -0
  39. polyrag-0.1.0/knowledge_graph/pipeline.py +1028 -0
  40. polyrag-0.1.0/knowledge_graph/prompts.py +123 -0
  41. polyrag-0.1.0/knowledge_graph/query_date_parser.py +144 -0
  42. polyrag-0.1.0/knowledge_graph/query_schemas.py +170 -0
  43. polyrag-0.1.0/knowledge_graph/query_service.py +633 -0
  44. polyrag-0.1.0/knowledge_graph/run.py +268 -0
  45. polyrag-0.1.0/knowledge_graph/run_query.py +322 -0
  46. polyrag-0.1.0/knowledge_graph/schema_parser.py +288 -0
  47. polyrag-0.1.0/knowledge_graph/schemas.py +392 -0
  48. polyrag-0.1.0/llm/__init__.py +0 -0
  49. polyrag-0.1.0/llm/base.py +202 -0
  50. polyrag-0.1.0/llm/manager.py +313 -0
  51. polyrag-0.1.0/llm/providers/google_provider.py +229 -0
  52. polyrag-0.1.0/llm/providers/huggingface_provider.py +164 -0
  53. polyrag-0.1.0/llm/providers/ollama_provider.py +113 -0
  54. polyrag-0.1.0/llm/providers/openai_provider.py +223 -0
  55. polyrag-0.1.0/llm/rate_limit.py +306 -0
  56. polyrag-0.1.0/llm/rate_limit_backends.py +625 -0
  57. polyrag-0.1.0/llm/rate_limit_schemas.py +64 -0
  58. polyrag-0.1.0/llm/schemas.py +893 -0
  59. polyrag-0.1.0/openrag/__init__.py +27 -0
  60. polyrag-0.1.0/openrag/config.py +175 -0
  61. polyrag-0.1.0/openrag/embedding/__init__.py +6 -0
  62. polyrag-0.1.0/openrag/embedding/base.py +52 -0
  63. polyrag-0.1.0/openrag/embedding/manager.py +100 -0
  64. polyrag-0.1.0/openrag/embedding/providers/__init__.py +5 -0
  65. polyrag-0.1.0/openrag/embedding/providers/sentence_transformers.py +89 -0
  66. polyrag-0.1.0/openrag/fusion.py +40 -0
  67. polyrag-0.1.0/openrag/indexer.py +229 -0
  68. polyrag-0.1.0/openrag/retriever.py +162 -0
  69. polyrag-0.1.0/openrag/run_index.py +145 -0
  70. polyrag-0.1.0/openrag/run_query.py +126 -0
  71. polyrag-0.1.0/openrag/schemas.py +67 -0
  72. polyrag-0.1.0/openrag/stores/__init__.py +7 -0
  73. polyrag-0.1.0/openrag/stores/base.py +118 -0
  74. polyrag-0.1.0/openrag/stores/factory.py +74 -0
  75. polyrag-0.1.0/openrag/stores/text_mongodb.py +273 -0
  76. polyrag-0.1.0/openrag/stores/text_mysql.py +273 -0
  77. polyrag-0.1.0/openrag/stores/text_postgres.py +270 -0
  78. polyrag-0.1.0/openrag/stores/text_sqlite.py +272 -0
  79. polyrag-0.1.0/openrag/stores/vector_store.py +277 -0
  80. polyrag-0.1.0/openrag/text/__init__.py +0 -0
  81. polyrag-0.1.0/openrag/text/tokenize.py +17 -0
  82. polyrag-0.1.0/parsers/__init__.py +22 -0
  83. polyrag-0.1.0/parsers/csv_parser.py +320 -0
  84. polyrag-0.1.0/parsers/docling_utils.py +514 -0
  85. polyrag-0.1.0/parsers/docx_parser.py +509 -0
  86. polyrag-0.1.0/parsers/llama_parse_extraction.py +353 -0
  87. polyrag-0.1.0/parsers/pdf_parser.py +745 -0
  88. polyrag-0.1.0/parsers/pipeline_text_parser.py +291 -0
  89. polyrag-0.1.0/parsers/ppt_parser.py +254 -0
  90. polyrag-0.1.0/parsers/txt_parser.py +366 -0
  91. polyrag-0.1.0/parsers/unified_parser.py +721 -0
  92. polyrag-0.1.0/parsers/utils.py +1838 -0
  93. polyrag-0.1.0/parsers/xlsx_parser.py +377 -0
  94. polyrag-0.1.0/parsers/xml_parser.py +197 -0
  95. polyrag-0.1.0/polyrag.egg-info/PKG-INFO +380 -0
  96. polyrag-0.1.0/polyrag.egg-info/SOURCES.txt +139 -0
  97. polyrag-0.1.0/polyrag.egg-info/dependency_links.txt +1 -0
  98. polyrag-0.1.0/polyrag.egg-info/requires.txt +99 -0
  99. polyrag-0.1.0/polyrag.egg-info/top_level.txt +8 -0
  100. polyrag-0.1.0/pyproject.toml +166 -0
  101. polyrag-0.1.0/setup.cfg +4 -0
  102. polyrag-0.1.0/structured_data/__init__.py +26 -0
  103. polyrag-0.1.0/structured_data/athena_storage.py +12 -0
  104. polyrag-0.1.0/structured_data/chunking.py +322 -0
  105. polyrag-0.1.0/structured_data/entity_consolidator.py +231 -0
  106. polyrag-0.1.0/structured_data/entity_resolver.py +680 -0
  107. polyrag-0.1.0/structured_data/evaluate_queries.py +222 -0
  108. polyrag-0.1.0/structured_data/evaluate_salary_queries.py +274 -0
  109. polyrag-0.1.0/structured_data/evaluation_assertions.py +110 -0
  110. polyrag-0.1.0/structured_data/extraction_normalizer.py +542 -0
  111. polyrag-0.1.0/structured_data/extractor.py +628 -0
  112. polyrag-0.1.0/structured_data/feature_flags.py +90 -0
  113. polyrag-0.1.0/structured_data/pipeline.py +1001 -0
  114. polyrag-0.1.0/structured_data/prompts.py +169 -0
  115. polyrag-0.1.0/structured_data/query_entity_resolver.py +235 -0
  116. polyrag-0.1.0/structured_data/query_executor.py +29 -0
  117. polyrag-0.1.0/structured_data/query_intent.py +176 -0
  118. polyrag-0.1.0/structured_data/query_prompts.py +397 -0
  119. polyrag-0.1.0/structured_data/query_result_processor.py +203 -0
  120. polyrag-0.1.0/structured_data/query_schemas.py +38 -0
  121. polyrag-0.1.0/structured_data/query_service.py +532 -0
  122. polyrag-0.1.0/structured_data/query_sql_generator.py +325 -0
  123. polyrag-0.1.0/structured_data/query_sql_guardrails.py +594 -0
  124. polyrag-0.1.0/structured_data/run.py +217 -0
  125. polyrag-0.1.0/structured_data/run_llm_rate_limit_stress.py +451 -0
  126. polyrag-0.1.0/structured_data/run_query.py +374 -0
  127. polyrag-0.1.0/structured_data/schemas.py +247 -0
  128. polyrag-0.1.0/structured_data/storage/__init__.py +11 -0
  129. polyrag-0.1.0/structured_data/storage/athena_query_executor.py +107 -0
  130. polyrag-0.1.0/structured_data/storage/athena_storage.py +504 -0
  131. polyrag-0.1.0/structured_data/storage/base.py +174 -0
  132. polyrag-0.1.0/structured_data/storage/factory.py +60 -0
  133. polyrag-0.1.0/structured_data/storage/mysql_storage.py +202 -0
  134. polyrag-0.1.0/structured_data/storage/postgres_storage.py +188 -0
  135. polyrag-0.1.0/structured_data/storage/sql_query_executor.py +179 -0
  136. polyrag-0.1.0/structured_data/storage/sqlite_storage.py +196 -0
  137. polyrag-0.1.0/structured_data/type_utils.py +307 -0
  138. polyrag-0.1.0/unified_pipeline/__init__.py +34 -0
  139. polyrag-0.1.0/unified_pipeline/config.py +117 -0
  140. polyrag-0.1.0/unified_pipeline/pipeline.py +249 -0
  141. polyrag-0.1.0/unified_pipeline/run_process.py +232 -0
polyrag-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Fermi-Dev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
polyrag-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,380 @@
1
+ Metadata-Version: 2.4
2
+ Name: polyrag
3
+ Version: 0.1.0
4
+ Summary: Multi-modal document intelligence: BM25 + FAISS hybrid search, knowledge graph extraction, and structured SQL querying
5
+ License-Expression: MIT
6
+ Classifier: Development Status :: 4 - Beta
7
+ Classifier: Intended Audience :: Developers
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Topic :: Text Processing :: Indexing
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: numpy>=1.24.0
19
+ Requires-Dist: faiss-cpu>=1.7.4
20
+ Requires-Dist: sentence-transformers>=3.0.0
21
+ Requires-Dist: python-dotenv>=1.0.0
22
+ Requires-Dist: tqdm>=4.67.1
23
+ Provides-Extra: llm
24
+ Requires-Dist: openai>=1.0.0; extra == "llm"
25
+ Requires-Dist: langchain>=0.3.0; extra == "llm"
26
+ Requires-Dist: langchain-core>=0.3.0; extra == "llm"
27
+ Requires-Dist: langchain-openai>=0.3.0; extra == "llm"
28
+ Requires-Dist: langchain-google-genai>=2.0.0; extra == "llm"
29
+ Requires-Dist: langchain-huggingface>=0.1.0; extra == "llm"
30
+ Requires-Dist: langchain-ollama>=0.2.0; extra == "llm"
31
+ Requires-Dist: accelerate>=0.27.0; extra == "llm"
32
+ Provides-Extra: structured
33
+ Requires-Dist: langchain>=0.3.0; extra == "structured"
34
+ Requires-Dist: langchain-core>=0.3.0; extra == "structured"
35
+ Requires-Dist: langchain-community>=0.3.0; extra == "structured"
36
+ Requires-Dist: langchain-experimental>=0.3.0; extra == "structured"
37
+ Requires-Dist: langchain-text-splitters>=0.3.0; extra == "structured"
38
+ Requires-Dist: pandas>=2.0.0; extra == "structured"
39
+ Requires-Dist: sqlparse>=0.5.0; extra == "structured"
40
+ Requires-Dist: tabulate>=0.9.0; extra == "structured"
41
+ Requires-Dist: scipy>=1.10.0; extra == "structured"
42
+ Requires-Dist: scikit-learn>=1.3.0; extra == "structured"
43
+ Requires-Dist: nest-asyncio>=1.6.0; extra == "structured"
44
+ Provides-Extra: kg
45
+ Requires-Dist: networkx>=3.0; extra == "kg"
46
+ Requires-Dist: rdflib>=7.0.0; extra == "kg"
47
+ Requires-Dist: neo4j>=5.15.0; extra == "kg"
48
+ Requires-Dist: langchain-neo4j>=0.4.0; extra == "kg"
49
+ Requires-Dist: langgraph>=0.5.0; extra == "kg"
50
+ Requires-Dist: graphiti-core>=0.24.0; extra == "kg"
51
+ Requires-Dist: pyvis>=0.3.2; extra == "kg"
52
+ Requires-Dist: plotly>=6.0.0; extra == "kg"
53
+ Requires-Dist: matplotlib>=3.7.0; extra == "kg"
54
+ Provides-Extra: parsers
55
+ Requires-Dist: pypdf2>=3.0.0; extra == "parsers"
56
+ Requires-Dist: pdfplumber>=0.11.0; extra == "parsers"
57
+ Requires-Dist: pdf2image>=1.17.0; extra == "parsers"
58
+ Requires-Dist: python-docx>=1.1.0; extra == "parsers"
59
+ Requires-Dist: python-pptx>=0.6.21; extra == "parsers"
60
+ Requires-Dist: openpyxl>=3.1.0; extra == "parsers"
61
+ Requires-Dist: docx2txt>=0.9; extra == "parsers"
62
+ Requires-Dist: reportlab>=4.0.0; extra == "parsers"
63
+ Requires-Dist: docling>=2.0.0; extra == "parsers"
64
+ Requires-Dist: llama-cloud-services>=0.6.0; extra == "parsers"
65
+ Provides-Extra: db-postgres
66
+ Requires-Dist: psycopg>=3.2.0; extra == "db-postgres"
67
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "db-postgres"
68
+ Requires-Dist: psycopg-binary>=3.2.0; extra == "db-postgres"
69
+ Provides-Extra: db-mysql
70
+ Requires-Dist: mysql-connector-python>=8.3.0; extra == "db-mysql"
71
+ Requires-Dist: pymysql>=1.1.0; extra == "db-mysql"
72
+ Provides-Extra: db-mongodb
73
+ Requires-Dist: pymongo>=4.14.0; extra == "db-mongodb"
74
+ Requires-Dist: langchain-mongodb>=0.7.0; extra == "db-mongodb"
75
+ Provides-Extra: db-elastic
76
+ Requires-Dist: elasticsearch>=9.0.0; extra == "db-elastic"
77
+ Requires-Dist: opensearch-py>=3.0.0; extra == "db-elastic"
78
+ Provides-Extra: aws
79
+ Requires-Dist: boto3>=1.34.0; extra == "aws"
80
+ Requires-Dist: awswrangler>=3.0.0; extra == "aws"
81
+ Requires-Dist: pyathena>=3.22.0; extra == "aws"
82
+ Requires-Dist: certifi>=2025.0.0; extra == "aws"
83
+ Requires-Dist: aws-sqs-consumer>=0.0.15; extra == "aws"
84
+ Requires-Dist: requests-aws4auth>=1.3.0; extra == "aws"
85
+ Provides-Extra: api
86
+ Requires-Dist: fastapi[standard]>=0.116.0; extra == "api"
87
+ Requires-Dist: uvicorn>=0.35.0; extra == "api"
88
+ Requires-Dist: fastapi-cli>=0.0.13; extra == "api"
89
+ Requires-Dist: python-multipart>=0.0.20; extra == "api"
90
+ Requires-Dist: httpx>=0.28.0; extra == "api"
91
+ Requires-Dist: fastmcp>=2.13.0; extra == "api"
92
+ Requires-Dist: streamlit>=1.46.0; extra == "api"
93
+ Requires-Dist: requests>=2.32.0; extra == "api"
94
+ Requires-Dist: cryptography>=45.0.0; extra == "api"
95
+ Requires-Dist: redis>=5.0.0; extra == "api"
96
+ Requires-Dist: agentlightning>=0.1.0; extra == "api"
97
+ Requires-Dist: clear>=2.0.0; extra == "api"
98
+ Provides-Extra: dev
99
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
100
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
101
+ Requires-Dist: twine>=5.0.0; extra == "dev"
102
+ Requires-Dist: build>=1.0.0; extra == "dev"
103
+ Provides-Extra: all
104
+ Requires-Dist: polyrag[api,aws,db-elastic,db-mongodb,db-mysql,db-postgres,kg,llm,parsers,structured]; extra == "all"
105
+ Dynamic: license-file
106
+
107
+ # polyrag — Document Intelligence Platform
108
+
109
+ ## Goal
110
+
111
+ Transform unstructured documents (PDFs, Word files, spreadsheets, etc.) into a **queryable structured database** — then let users ask natural language questions and get precise, SQL-computed answers instead of LLM-hallucinated guesses.
112
+
113
+ Optionally pair structured extraction with **OpenRAG** — a fully open-source BM25 + FAISS retrieval layer — for semantic and keyword-based document search without any paid APIs.
114
+
115
+ ## Why This Exists (vs. Basic RAG)
116
+
117
+ Traditional RAG (e.g., OpenSearch + embeddings) retrieves text chunks and asks an LLM to synthesize answers. This fails for:
118
+
119
+ - **Numerical questions** — "What's the total salary expenditure?" (LLM can't reliably add numbers from chunks)
120
+ - **Aggregations** — SUM, AVG, COUNT, MAX, ranking across documents
121
+ - **Cross-document comparison** — same entity appearing in 8 yearly reports
122
+ - **Consistency** — same question returns different answers depending on which chunks are retrieved
123
+
124
+ This platform **extracts structured data first**, stores it in typed SQL tables, then uses the LLM only to generate the SQL query — the actual computation is done by the database engine, eliminating hallucination for factual/numerical answers.
125
+
126
+ For free-text semantic search, **OpenRAG** adds BM25 keyword search (inverted index) and FAISS vector search (HNSW embeddings) as a parallel retrieval layer — both fully open-source.
127
+
128
+ ## Architecture
129
+
130
+ ```
131
+ Raw Documents
132
+ |
133
+ v
134
+ [parsers/] ─────────────> Unified Parser (Docling / LlamaParse / pdfplumber)
135
+ |
136
+ |─────────────────────────────────────────────────────────────────┐
137
+ v v
138
+ [structured_data/] [openrag/]
139
+ |── extractor.py ────> LLM-based 3-tier extraction |── stores/
140
+ |── entity_resolver.py > 5-stage entity resolution | |── text_sqlite.py BM25 (SQLite)
141
+ |── entity_consolidator> Merge into master records | |── text_postgres.py BM25 (Postgres)
142
+ |── storage/ ──> Pluggable SQL backend | |── text_mysql.py BM25 (MySQL)
143
+ | |── athena (AWS Athena / Iceberg) | |── text_mongodb.py BM25 (MongoDB)
144
+ | |── postgres (PostgreSQL) | └── vector_store.py FAISS HNSW
145
+ | |── mysql (MySQL) |── embedding/
146
+ | └── sqlite (local) | └── sentence_transformers
147
+ |── query_service.py ──> NL → SQL → results |── indexer.py index_file / index_text
148
+ | |── retriever.py retrieve (hybrid/text/vector)
149
+ v |── run_index.py CLI
150
+ [connectors/] ─────────> MongoDB, Neo4j, AWS SQS └── run_query.py CLI
151
+ |
152
+ v
153
+ [llm/] ─────────────────> OpenAI, Google Gemini providers
154
+ |
155
+ v
156
+ [unified_pipeline/] ────> Orchestrate both pipelines (parse once, share text)
157
+ ```
158
+
159
+ ## Modules
160
+
161
+ | Module | Purpose |
162
+ |--------|---------|
163
+ | [parsers/](parsers/) | Multi-format document parsing (PDF, DOCX, XLSX, CSV, PPTX, TXT, XML) |
164
+ | [llm/](llm/) | LLM provider abstraction, model management, Pydantic schemas |
165
+ | [connectors/](connectors/) | Database connectors (MongoDB, Neo4j, AWS SQS) |
166
+ | [structured_data/](structured_data/) | Core pipeline: LLM extraction, entity resolution, pluggable SQL storage, NL querying |
167
+ | [openrag/](openrag/) | Open-source BM25 + FAISS retrieval: index and search document chunks |
168
+ | [unified_pipeline/](unified_pipeline/) | Flexible orchestration: run structured, RAG, or both with one call |
169
+ | [search_algorithms/](search_algorithms/) | Reference BM25, HNSW, TF-IDF implementations |
170
+
171
+ ## Data Flow
172
+
173
+ **Structured extraction (ingestion):**
174
+ ```
175
+ Document file -> Parse to text (pdfplumber / Docling)
176
+ -> LLM extracts structured rows (entity, metric, date, attribute...)
177
+ -> Entity resolver links mentions across documents
178
+ -> Consolidator merges into master records (MongoDB)
179
+ -> Store typed rows in pluggable SQL backend (Athena / Postgres / MySQL / SQLite)
180
+ ```
181
+
182
+ **RAG indexing (OpenRAG):**
183
+ ```
184
+ Document file -> Parse to text
185
+ -> Chunk into overlapping windows (~10k chars, 500 overlap)
186
+ -> Embed with sentence-transformers (any HuggingFace model)
187
+ -> Store BM25 inverted index in chosen text backend
188
+ -> Store embeddings in FAISS IndexHNSWFlat + JSON sidecar
189
+ ```
190
+
191
+ **Unified (both together):**
192
+ ```
193
+ Document file -> Parse ONCE (shared text)
194
+ -> Structured extraction → entity metadata (names, types, doc_type)
195
+ -> RAG indexing with entity metadata attached to every chunk
196
+ -> Query via SQL (precise aggregations) OR vector/BM25 (semantic search)
197
+ ```
198
+
199
+ **Querying:**
200
+ ```
201
+ NL question -> Extract entity candidates from question
202
+ -> Resolve against MongoDB entities_master
203
+ -> LLM generates SQL with entity context
204
+ -> Execute SQL on chosen backend
205
+ -> Return precise, computed results
206
+
207
+ OR
208
+
209
+ NL question -> BM25 keyword search + FAISS vector search
210
+ -> Reciprocal Rank Fusion (RRF) for hybrid results
211
+ -> Return ranked chunks with full text + metadata
212
+ ```
213
+
214
+ ## Tech Stack
215
+
216
+ - **Python 3.10+** (tested through 3.13) with **uv** package manager
217
+ - **LLM Providers**: OpenAI (GPT-4o), Google Gemini
218
+ - **Document Parsing**: Docling (local), LlamaParse (cloud), pdfplumber
219
+ - **Structured Storage**: AWS Athena/Iceberg, PostgreSQL, MySQL, SQLite (pluggable)
220
+ - **Entity Master**: MongoDB
221
+ - **Graph**: Neo4j (optional)
222
+ - **Queue**: AWS SQS for async processing
223
+ - **Embeddings**: sentence-transformers (any HuggingFace model, fully open-source)
224
+ - **Vector Index**: FAISS IndexHNSWFlat
225
+ - **Text Search**: BM25 inverted index (SQLite / PostgreSQL / MySQL / MongoDB)
226
+ - **Frameworks**: FastAPI, LangChain, LangGraph, Pydantic
227
+
228
+ ## Setup
229
+
230
+ ```bash
231
+ # Install dependencies
232
+ uv sync
233
+
234
+ # Configure environment
235
+ cp .env.example .env
236
+ # Fill in: OPENAI_API_KEY, mongo_connection_string, AWS credentials, etc.
237
+ ```
238
+
239
+ ## Quick Start
240
+
241
+ ### Structured Extraction
242
+
243
+ ```python
244
+ from structured_data import StructuredDataPipeline
245
+
246
+ pipeline = StructuredDataPipeline()
247
+ result = pipeline.process_file(
248
+ file_path="document.pdf",
249
+ tenant_id="org_123",
250
+ user_id="user_456",
251
+ resource_id="res_789",
252
+ )
253
+ print(result["extraction_count"]) # number of entities extracted
254
+ print(result["document_id"]) # stable doc identifier
255
+
256
+ # Query extracted data with natural language
257
+ from structured_data import StructuredDataQueryService
258
+
259
+ svc = StructuredDataQueryService()
260
+ answer = svc.query("What is the average salary in Engineering?", tenant_id="org_123")
261
+ print(answer["rows"]) # precise SQL-computed result
262
+ ```
263
+
264
+ Choose your storage backend via `OPENRAG_STORAGE_BACKEND` env var:
265
+
266
+ ```bash
267
+ OPENRAG_STORAGE_BACKEND=sqlite # local SQLite (default for dev)
268
+ OPENRAG_STORAGE_BACKEND=postgres # PostgreSQL
269
+ OPENRAG_STORAGE_BACKEND=mysql # MySQL
270
+ OPENRAG_STORAGE_BACKEND=athena # AWS Athena / Iceberg (production)
271
+ ```
272
+
273
+ ### OpenRAG — Semantic + Keyword Search
274
+
275
+ ```bash
276
+ # Index a document (BM25 + FAISS)
277
+ python -m openrag.run_index document.pdf \
278
+ --text-backend sqlite \
279
+ --embedding-model all-MiniLM-L6-v2 \
280
+ --verbose
281
+
282
+ # Hybrid search (BM25 + vector, fused with RRF)
283
+ python -m openrag.run_query "engineering salary 2024" \
284
+ --mode hybrid --top-k 5 --show-text
285
+
286
+ # Text-only (BM25)
287
+ python -m openrag.run_query "invoice total amount" --mode text --top-k 5
288
+
289
+ # Vector-only (FAISS HNSW)
290
+ python -m openrag.run_query "compensation benefits" --mode vector --top-k 5
291
+ ```
292
+
293
+ Text backends for BM25: `sqlite` (default), `postgres`, `mysql`, `mongodb`.
294
+ Embedding models: any model on HuggingFace via sentence-transformers (e.g. `all-MiniLM-L6-v2`, `BAAI/bge-small-en-v1.5`).
295
+
296
+ ```python
297
+ from openrag import OpenRAGConfig, OpenRAGIndexer, OpenRAGRetriever
298
+
299
+ cfg = OpenRAGConfig(
300
+ text_backend="sqlite",
301
+ embedding_model="all-MiniLM-L6-v2",
302
+ retrieval_mode="hybrid",
303
+ )
304
+ OpenRAGIndexer(cfg).index_file("document.pdf")
305
+
306
+ retriever = OpenRAGRetriever(cfg)
307
+ results = retriever.retrieve("engineering salary", top_k=5)
308
+ for r in results:
309
+ print(r["retrieval_score"], r["text"][:120])
310
+ print(r.get("metadata", {})) # entity_names, entity_types, doc_type (if indexed via unified pipeline)
311
+ ```
312
+
313
+ ### Unified Pipeline — Structured + RAG Together
314
+
315
+ Parse once, run both pipelines, share entity metadata as chunk enrichment:
316
+
317
+ ```bash
318
+ # Both pipelines (parse once, entity metadata enriches RAG chunks)
319
+ python -m unified_pipeline.run_process document.pdf \
320
+ --mode both \
321
+ --tenant-id org_123 --user-id user_456 --resource-id res_789 \
322
+ --text-backend sqlite \
323
+ --embedding-model all-MiniLM-L6-v2 \
324
+ --verbose
325
+
326
+ # Structured extraction only
327
+ python -m unified_pipeline.run_process document.pdf \
328
+ --mode structured-only \
329
+ --tenant-id org_123 --user-id user_456 --resource-id res_789
330
+
331
+ # RAG indexing only
332
+ python -m unified_pipeline.run_process document.pdf \
333
+ --mode rag-only --rag-retrieval hybrid \
334
+ --text-backend sqlite --embedding-model all-MiniLM-L6-v2
335
+ ```
336
+
337
+ ```python
338
+ from unified_pipeline import UnifiedDocumentPipeline, UnifiedPipelineConfig
339
+ from openrag import OpenRAGConfig
340
+
341
+ cfg = UnifiedPipelineConfig(
342
+ enable_structured=True,
343
+ enable_rag=True,
344
+ rag_retrieval_mode="hybrid",
345
+ tenant_id="org_123",
346
+ user_id="user_456",
347
+ resource_id="res_789",
348
+ openrag=OpenRAGConfig(text_backend="sqlite"),
349
+ )
350
+ pipeline = UnifiedDocumentPipeline(cfg)
351
+ result = pipeline.process_file("document.pdf")
352
+
353
+ print(result["structured"]["extraction_count"]) # entities extracted
354
+ print(result["rag"]["chunks_indexed"]) # chunks in BM25 + FAISS
355
+ # In "both" mode, both systems share the same document_id
356
+ print(result["structured"]["document_id"] == result["rag"]["doc_id"]) # True
357
+ ```
358
+
359
+ When running in `both` mode:
360
+ - The document is parsed **once** and the text is shared between both pipelines
361
+ - Extracted entity names, types, and document type are stored as metadata on every RAG chunk
362
+ - Both systems use the same `document_id` so you can filter/query by it in either system
363
+
364
+ ## Environment Variables
365
+
366
+ | Variable | Default | Description |
367
+ |----------|---------|-------------|
368
+ | `OPENRAG_STORAGE_BACKEND` | `athena` | Structured extraction backend: `athena`, `postgres`, `mysql`, `sqlite` |
369
+ | `OPENRAG_TEXT_BACKEND` | `sqlite` | RAG BM25 backend: `sqlite`, `postgres`, `mysql`, `mongodb` |
370
+ | `OPENRAG_EMBEDDING_MODEL` | `all-MiniLM-L6-v2` | sentence-transformers model for embeddings |
371
+ | `OPENRAG_EMBEDDING_DEVICE` | `cpu` | Compute device: `cpu`, `cuda`, `mps` |
372
+ | `OPENRAG_FAISS_INDEX_PATH` | `./openrag.faiss` | FAISS binary index file path |
373
+ | `OPENRAG_FAISS_METADATA_PATH` | `./openrag_meta.json` | FAISS JSON sidecar path |
374
+ | `OPENRAG_RETRIEVAL_MODE` | `hybrid` | Default retrieval mode: `hybrid`, `text`, `vector` |
375
+ | `PIPELINE_ENABLE_STRUCTURED` | `true` | Enable structured extraction in unified pipeline |
376
+ | `PIPELINE_ENABLE_RAG` | `true` | Enable RAG indexing in unified pipeline |
377
+ | `PIPELINE_RAG_RETRIEVAL_MODE` | `hybrid` | Default RAG mode in unified pipeline |
378
+ | `PIPELINE_TENANT_ID` | `` | Tenant context for structured pipeline |
379
+ | `PIPELINE_USER_ID` | `` | User context for structured pipeline |
380
+ | `PIPELINE_RESOURCE_ID` | `` | Resource context for structured pipeline |
@@ -0,0 +1,274 @@
1
+ # doc_pros — Document Intelligence Platform
2
+
3
+ ## Goal
4
+
5
+ Transform unstructured documents (PDFs, Word files, spreadsheets, etc.) into a **queryable structured database** — then let users ask natural language questions and get precise, SQL-computed answers instead of LLM-hallucinated guesses.
6
+
7
+ Optionally pair structured extraction with **OpenRAG** — a fully open-source BM25 + FAISS retrieval layer — for semantic and keyword-based document search without any paid APIs.
8
+
9
+ ## Why This Exists (vs. Basic RAG)
10
+
11
+ Traditional RAG (e.g., OpenSearch + embeddings) retrieves text chunks and asks an LLM to synthesize answers. This fails for:
12
+
13
+ - **Numerical questions** — "What's the total salary expenditure?" (LLM can't reliably add numbers from chunks)
14
+ - **Aggregations** — SUM, AVG, COUNT, MAX, ranking across documents
15
+ - **Cross-document comparison** — same entity appearing in 8 yearly reports
16
+ - **Consistency** — same question returns different answers depending on which chunks are retrieved
17
+
18
+ This platform **extracts structured data first**, stores it in typed SQL tables, then uses the LLM only to generate the SQL query — the actual computation is done by the database engine, eliminating hallucination for factual/numerical answers.
19
+
20
+ For free-text semantic search, **OpenRAG** adds BM25 keyword search (inverted index) and FAISS vector search (HNSW embeddings) as a parallel retrieval layer — both fully open-source.
21
+
22
+ ## Architecture
23
+
24
+ ```
25
+ Raw Documents
26
+ |
27
+ v
28
+ [parsers/] ─────────────> Unified Parser (Docling / LlamaParse / pdfplumber)
29
+ |
30
+ |─────────────────────────────────────────────────────────────────┐
31
+ v v
32
+ [structured_data/] [openrag/]
33
+ |── extractor.py ────> LLM-based 3-tier extraction |── stores/
34
+ |── entity_resolver.py > 5-stage entity resolution | |── text_sqlite.py BM25 (SQLite)
35
+ |── entity_consolidator> Merge into master records | |── text_postgres.py BM25 (Postgres)
36
+ |── storage/ ──> Pluggable SQL backend | |── text_mysql.py BM25 (MySQL)
37
+ | |── athena (AWS Athena / Iceberg) | |── text_mongodb.py BM25 (MongoDB)
38
+ | |── postgres (PostgreSQL) | └── vector_store.py FAISS HNSW
39
+ | |── mysql (MySQL) |── embedding/
40
+ | └── sqlite (local) | └── sentence_transformers
41
+ |── query_service.py ──> NL → SQL → results |── indexer.py index_file / index_text
42
+ | |── retriever.py retrieve (hybrid/text/vector)
43
+ v |── run_index.py CLI
44
+ [connectors/] ─────────> MongoDB, Neo4j, AWS SQS └── run_query.py CLI
45
+ |
46
+ v
47
+ [llm/] ─────────────────> OpenAI, Google Gemini providers
48
+ |
49
+ v
50
+ [unified_pipeline/] ────> Orchestrate both pipelines (parse once, share text)
51
+ ```
52
+
53
+ ## Modules
54
+
55
+ | Module | Purpose |
56
+ |--------|---------|
57
+ | [parsers/](parsers/) | Multi-format document parsing (PDF, DOCX, XLSX, CSV, PPTX, TXT, XML) |
58
+ | [llm/](llm/) | LLM provider abstraction, model management, Pydantic schemas |
59
+ | [connectors/](connectors/) | Database connectors (MongoDB, Neo4j, AWS SQS) |
60
+ | [structured_data/](structured_data/) | Core pipeline: LLM extraction, entity resolution, pluggable SQL storage, NL querying |
61
+ | [openrag/](openrag/) | Open-source BM25 + FAISS retrieval: index and search document chunks |
62
+ | [unified_pipeline/](unified_pipeline/) | Flexible orchestration: run structured, RAG, or both with one call |
63
+ | [search_algorithms/](search_algorithms/) | Reference BM25, HNSW, TF-IDF implementations |
64
+
65
+ ## Data Flow
66
+
67
+ **Structured extraction (ingestion):**
68
+ ```
69
+ Document file -> Parse to text (pdfplumber / Docling)
70
+ -> LLM extracts structured rows (entity, metric, date, attribute...)
71
+ -> Entity resolver links mentions across documents
72
+ -> Consolidator merges into master records (MongoDB)
73
+ -> Store typed rows in pluggable SQL backend (Athena / Postgres / MySQL / SQLite)
74
+ ```
75
+
76
+ **RAG indexing (OpenRAG):**
77
+ ```
78
+ Document file -> Parse to text
79
+ -> Chunk into overlapping windows (~10k chars, 500 overlap)
80
+ -> Embed with sentence-transformers (any HuggingFace model)
81
+ -> Store BM25 inverted index in chosen text backend
82
+ -> Store embeddings in FAISS IndexHNSWFlat + JSON sidecar
83
+ ```
84
+
85
+ **Unified (both together):**
86
+ ```
87
+ Document file -> Parse ONCE (shared text)
88
+ -> Structured extraction yields entity metadata (names, types, doc_type)
89
+ -> RAG indexing with entity metadata attached to every chunk
90
+ -> Query via SQL (precise aggregations) OR vector/BM25 (semantic search)
91
+ ```
92
+
93
+ **Querying:**
94
+ ```
95
+ NL question -> Extract entity candidates from question
96
+ -> Resolve against MongoDB entities_master
97
+ -> LLM generates SQL with entity context
98
+ -> Execute SQL on chosen backend
99
+ -> Return precise, computed results
100
+
101
+ OR
102
+
103
+ NL question -> BM25 keyword search + FAISS vector search
104
+ -> Reciprocal Rank Fusion (RRF) for hybrid results
105
+ -> Return ranked chunks with full text + metadata
106
+ ```
107
+
108
+ ## Tech Stack
109
+
110
+ - **Python 3.13+** with **uv** package manager
111
+ - **LLM Providers**: OpenAI (GPT-4o), Google Gemini
112
+ - **Document Parsing**: Docling (local), LlamaParse (cloud), pdfplumber
113
+ - **Structured Storage**: AWS Athena/Iceberg, PostgreSQL, MySQL, SQLite (pluggable)
114
+ - **Entity Master**: MongoDB
115
+ - **Graph**: Neo4j (optional)
116
+ - **Queue**: AWS SQS for async processing
117
+ - **Embeddings**: sentence-transformers (any HuggingFace model, fully open-source)
118
+ - **Vector Index**: FAISS IndexHNSWFlat
119
+ - **Text Search**: BM25 inverted index (SQLite / PostgreSQL / MySQL / MongoDB)
120
+ - **Frameworks**: FastAPI, LangChain, LangGraph, Pydantic
121
+
122
+ ## Setup
123
+
124
+ ```bash
125
+ # Install dependencies
126
+ uv sync
127
+
128
+ # Configure environment
129
+ cp .env.example .env
130
+ # Fill in: OPENAI_API_KEY, mongo_connection_string, AWS credentials, etc.
131
+ ```
132
+
133
+ ## Quick Start
134
+
135
+ ### Structured Extraction
136
+
137
+ ```python
138
+ from structured_data import StructuredDataPipeline
139
+
140
+ pipeline = StructuredDataPipeline()
141
+ result = pipeline.process_file(
142
+ file_path="document.pdf",
143
+ tenant_id="org_123",
144
+ user_id="user_456",
145
+ resource_id="res_789",
146
+ )
147
+ print(result["extraction_count"]) # number of entities extracted
148
+ print(result["document_id"]) # stable doc identifier
149
+
150
+ # Query extracted data with natural language
151
+ from structured_data import StructuredDataQueryService
152
+
153
+ svc = StructuredDataQueryService()
154
+ answer = svc.query("What is the average salary in Engineering?", tenant_id="org_123")
155
+ print(answer["rows"]) # precise SQL-computed result
156
+ ```
157
+
158
+ Choose your storage backend via `OPENRAG_STORAGE_BACKEND` env var:
159
+
160
+ ```bash
161
+ OPENRAG_STORAGE_BACKEND=sqlite # local SQLite (recommended for dev; overall default is athena)
162
+ OPENRAG_STORAGE_BACKEND=postgres # PostgreSQL
163
+ OPENRAG_STORAGE_BACKEND=mysql # MySQL
164
+ OPENRAG_STORAGE_BACKEND=athena # AWS Athena / Iceberg (production)
165
+ ```
166
+
167
+ ### OpenRAG — Semantic + Keyword Search
168
+
169
+ ```bash
170
+ # Index a document (BM25 + FAISS)
171
+ python -m openrag.run_index document.pdf \
172
+ --text-backend sqlite \
173
+ --embedding-model all-MiniLM-L6-v2 \
174
+ --verbose
175
+
176
+ # Hybrid search (BM25 + vector, fused with RRF)
177
+ python -m openrag.run_query "engineering salary 2024" \
178
+ --mode hybrid --top-k 5 --show-text
179
+
180
+ # Text-only (BM25)
181
+ python -m openrag.run_query "invoice total amount" --mode text --top-k 5
182
+
183
+ # Vector-only (FAISS HNSW)
184
+ python -m openrag.run_query "compensation benefits" --mode vector --top-k 5
185
+ ```
186
+
187
+ Text backends for BM25: `sqlite` (default), `postgres`, `mysql`, `mongodb`.
188
+ Embedding models: any model on HuggingFace via sentence-transformers (e.g. `all-MiniLM-L6-v2`, `BAAI/bge-small-en-v1.5`).
189
+
190
+ ```python
191
+ from openrag import OpenRAGConfig, OpenRAGIndexer, OpenRAGRetriever
192
+
193
+ cfg = OpenRAGConfig(
194
+ text_backend="sqlite",
195
+ embedding_model="all-MiniLM-L6-v2",
196
+ retrieval_mode="hybrid",
197
+ )
198
+ OpenRAGIndexer(cfg).index_file("document.pdf")
199
+
200
+ retriever = OpenRAGRetriever(cfg)
201
+ results = retriever.retrieve("engineering salary", top_k=5)
202
+ for r in results:
203
+ print(r["retrieval_score"], r["text"][:120])
204
+ print(r.get("metadata", {})) # entity_names, entity_types, doc_type (if indexed via unified pipeline)
205
+ ```
206
+
207
+ ### Unified Pipeline — Structured + RAG Together
208
+
209
+ Parse once, run both pipelines, share entity metadata as chunk enrichment:
210
+
211
+ ```bash
212
+ # Both pipelines (parse once, entity metadata enriches RAG chunks)
213
+ python -m unified_pipeline.run_process document.pdf \
214
+ --mode both \
215
+ --tenant-id org_123 --user-id user_456 --resource-id res_789 \
216
+ --text-backend sqlite \
217
+ --embedding-model all-MiniLM-L6-v2 \
218
+ --verbose
219
+
220
+ # Structured extraction only
221
+ python -m unified_pipeline.run_process document.pdf \
222
+ --mode structured-only \
223
+ --tenant-id org_123 --user-id user_456 --resource-id res_789
224
+
225
+ # RAG indexing only
226
+ python -m unified_pipeline.run_process document.pdf \
227
+ --mode rag-only --rag-retrieval hybrid \
228
+ --text-backend sqlite --embedding-model all-MiniLM-L6-v2
229
+ ```
230
+
231
+ ```python
232
+ from unified_pipeline import UnifiedDocumentPipeline, UnifiedPipelineConfig
233
+ from openrag import OpenRAGConfig
234
+
235
+ cfg = UnifiedPipelineConfig(
236
+ enable_structured=True,
237
+ enable_rag=True,
238
+ rag_retrieval_mode="hybrid",
239
+ tenant_id="org_123",
240
+ user_id="user_456",
241
+ resource_id="res_789",
242
+ openrag=OpenRAGConfig(text_backend="sqlite"),
243
+ )
244
+ pipeline = UnifiedDocumentPipeline(cfg)
245
+ result = pipeline.process_file("document.pdf")
246
+
247
+ print(result["structured"]["extraction_count"]) # entities extracted
248
+ print(result["rag"]["chunks_indexed"]) # chunks in BM25 + FAISS
249
+ # In "both" mode, both systems share the same document_id
250
+ print(result["structured"]["document_id"] == result["rag"]["doc_id"]) # True
251
+ ```
252
+
253
+ When running in `both` mode:
254
+ - The document is parsed **once** and the text is shared between both pipelines
255
+ - Extracted entity names, types, and document type are stored as metadata on every RAG chunk
256
+ - Both systems use the same `document_id` so you can filter/query by it in either system
257
+
258
+ ## Environment Variables
259
+
260
+ | Variable | Default | Description |
261
+ |----------|---------|-------------|
262
+ | `OPENRAG_STORAGE_BACKEND` | `athena` | Structured extraction backend: `athena`, `postgres`, `mysql`, `sqlite` |
263
+ | `OPENRAG_TEXT_BACKEND` | `sqlite` | RAG BM25 backend: `sqlite`, `postgres`, `mysql`, `mongodb` |
264
+ | `OPENRAG_EMBEDDING_MODEL` | `all-MiniLM-L6-v2` | sentence-transformers model for embeddings |
265
+ | `OPENRAG_EMBEDDING_DEVICE` | `cpu` | Compute device: `cpu`, `cuda`, `mps` |
266
+ | `OPENRAG_FAISS_INDEX_PATH` | `./openrag.faiss` | FAISS binary index file path |
267
+ | `OPENRAG_FAISS_METADATA_PATH` | `./openrag_meta.json` | FAISS JSON sidecar path |
268
+ | `OPENRAG_RETRIEVAL_MODE` | `hybrid` | Default retrieval mode: `hybrid`, `text`, `vector` |
269
+ | `PIPELINE_ENABLE_STRUCTURED` | `true` | Enable structured extraction in unified pipeline |
270
+ | `PIPELINE_ENABLE_RAG` | `true` | Enable RAG indexing in unified pipeline |
271
+ | `PIPELINE_RAG_RETRIEVAL_MODE` | `hybrid` | Default RAG mode in unified pipeline |
272
+ | `PIPELINE_TENANT_ID` | *(unset)* | Tenant context for structured pipeline |
273
+ | `PIPELINE_USER_ID` | *(unset)* | User context for structured pipeline |
274
+ | `PIPELINE_RESOURCE_ID` | *(unset)* | Resource context for structured pipeline |
File without changes
File without changes