polyrag 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. polyrag-0.1.0/LICENSE +21 -0
  2. polyrag-0.1.0/PKG-INFO +380 -0
  3. polyrag-0.1.0/README.md +274 -0
  4. polyrag-0.1.0/app/__init__.py +0 -0
  5. polyrag-0.1.0/app/config/__init__.py +0 -0
  6. polyrag-0.1.0/app/config/llm.py +243 -0
  7. polyrag-0.1.0/app/config/settings.py +39 -0
  8. polyrag-0.1.0/app/core/__init__.py +0 -0
  9. polyrag-0.1.0/app/core/parsers/__init__.py +0 -0
  10. polyrag-0.1.0/app/core/parsers/unified_parser.py +25 -0
  11. polyrag-0.1.0/app/core/services/__init__.py +0 -0
  12. polyrag-0.1.0/app/core/services/structured_data/__init__.py +3 -0
  13. polyrag-0.1.0/app/core/services/structured_data/evaluation_assertions.py +3 -0
  14. polyrag-0.1.0/app/core/services/structured_data/query_service.py +3 -0
  15. polyrag-0.1.0/app/core/services/structured_data/run.py +3 -0
  16. polyrag-0.1.0/app/core/services/structured_data/run_llm_rate_limit_stress.py +3 -0
  17. polyrag-0.1.0/app/core/services/structured_data/run_query.py +2 -0
  18. polyrag-0.1.0/app/utils/__init__.py +0 -0
  19. polyrag-0.1.0/app/utils/ocr.py +7 -0
  20. polyrag-0.1.0/connectors/__init__.py +0 -0
  21. polyrag-0.1.0/connectors/mongodb_connector.py +411 -0
  22. polyrag-0.1.0/connectors/neo4j_connector.py +313 -0
  23. polyrag-0.1.0/connectors/sqs_connector.py +259 -0
  24. polyrag-0.1.0/knowledge_graph/__init__.py +79 -0
  25. polyrag-0.1.0/knowledge_graph/evaluate_queries.py +207 -0
  26. polyrag-0.1.0/knowledge_graph/evaluation_assertions.py +387 -0
  27. polyrag-0.1.0/knowledge_graph/event_resolver.py +475 -0
  28. polyrag-0.1.0/knowledge_graph/extraction_normalizer.py +275 -0
  29. polyrag-0.1.0/knowledge_graph/extractor.py +326 -0
  30. polyrag-0.1.0/knowledge_graph/feature_flags.py +176 -0
  31. polyrag-0.1.0/knowledge_graph/graph_builder.py +421 -0
  32. polyrag-0.1.0/knowledge_graph/neo4j_storage.py +709 -0
  33. polyrag-0.1.0/knowledge_graph/ontology_builder.py +168 -0
  34. polyrag-0.1.0/knowledge_graph/ontology_extractor.py +117 -0
  35. polyrag-0.1.0/knowledge_graph/ontology_linker.py +234 -0
  36. polyrag-0.1.0/knowledge_graph/ontology_prompts.py +87 -0
  37. polyrag-0.1.0/knowledge_graph/ontology_schemas.py +190 -0
  38. polyrag-0.1.0/knowledge_graph/ontology_storage.py +287 -0
  39. polyrag-0.1.0/knowledge_graph/pipeline.py +1028 -0
  40. polyrag-0.1.0/knowledge_graph/prompts.py +123 -0
  41. polyrag-0.1.0/knowledge_graph/query_date_parser.py +144 -0
  42. polyrag-0.1.0/knowledge_graph/query_schemas.py +170 -0
  43. polyrag-0.1.0/knowledge_graph/query_service.py +633 -0
  44. polyrag-0.1.0/knowledge_graph/run.py +268 -0
  45. polyrag-0.1.0/knowledge_graph/run_query.py +322 -0
  46. polyrag-0.1.0/knowledge_graph/schema_parser.py +288 -0
  47. polyrag-0.1.0/knowledge_graph/schemas.py +392 -0
  48. polyrag-0.1.0/llm/__init__.py +0 -0
  49. polyrag-0.1.0/llm/base.py +202 -0
  50. polyrag-0.1.0/llm/manager.py +313 -0
  51. polyrag-0.1.0/llm/providers/google_provider.py +229 -0
  52. polyrag-0.1.0/llm/providers/huggingface_provider.py +164 -0
  53. polyrag-0.1.0/llm/providers/ollama_provider.py +113 -0
  54. polyrag-0.1.0/llm/providers/openai_provider.py +223 -0
  55. polyrag-0.1.0/llm/rate_limit.py +306 -0
  56. polyrag-0.1.0/llm/rate_limit_backends.py +625 -0
  57. polyrag-0.1.0/llm/rate_limit_schemas.py +64 -0
  58. polyrag-0.1.0/llm/schemas.py +893 -0
  59. polyrag-0.1.0/openrag/__init__.py +27 -0
  60. polyrag-0.1.0/openrag/config.py +175 -0
  61. polyrag-0.1.0/openrag/embedding/__init__.py +6 -0
  62. polyrag-0.1.0/openrag/embedding/base.py +52 -0
  63. polyrag-0.1.0/openrag/embedding/manager.py +100 -0
  64. polyrag-0.1.0/openrag/embedding/providers/__init__.py +5 -0
  65. polyrag-0.1.0/openrag/embedding/providers/sentence_transformers.py +89 -0
  66. polyrag-0.1.0/openrag/fusion.py +40 -0
  67. polyrag-0.1.0/openrag/indexer.py +229 -0
  68. polyrag-0.1.0/openrag/retriever.py +162 -0
  69. polyrag-0.1.0/openrag/run_index.py +145 -0
  70. polyrag-0.1.0/openrag/run_query.py +126 -0
  71. polyrag-0.1.0/openrag/schemas.py +67 -0
  72. polyrag-0.1.0/openrag/stores/__init__.py +7 -0
  73. polyrag-0.1.0/openrag/stores/base.py +118 -0
  74. polyrag-0.1.0/openrag/stores/factory.py +74 -0
  75. polyrag-0.1.0/openrag/stores/text_mongodb.py +273 -0
  76. polyrag-0.1.0/openrag/stores/text_mysql.py +273 -0
  77. polyrag-0.1.0/openrag/stores/text_postgres.py +270 -0
  78. polyrag-0.1.0/openrag/stores/text_sqlite.py +272 -0
  79. polyrag-0.1.0/openrag/stores/vector_store.py +277 -0
  80. polyrag-0.1.0/openrag/text/__init__.py +0 -0
  81. polyrag-0.1.0/openrag/text/tokenize.py +17 -0
  82. polyrag-0.1.0/parsers/__init__.py +22 -0
  83. polyrag-0.1.0/parsers/csv_parser.py +320 -0
  84. polyrag-0.1.0/parsers/docling_utils.py +514 -0
  85. polyrag-0.1.0/parsers/docx_parser.py +509 -0
  86. polyrag-0.1.0/parsers/llama_parse_extraction.py +353 -0
  87. polyrag-0.1.0/parsers/pdf_parser.py +745 -0
  88. polyrag-0.1.0/parsers/pipeline_text_parser.py +291 -0
  89. polyrag-0.1.0/parsers/ppt_parser.py +254 -0
  90. polyrag-0.1.0/parsers/txt_parser.py +366 -0
  91. polyrag-0.1.0/parsers/unified_parser.py +721 -0
  92. polyrag-0.1.0/parsers/utils.py +1838 -0
  93. polyrag-0.1.0/parsers/xlsx_parser.py +377 -0
  94. polyrag-0.1.0/parsers/xml_parser.py +197 -0
  95. polyrag-0.1.0/polyrag.egg-info/PKG-INFO +380 -0
  96. polyrag-0.1.0/polyrag.egg-info/SOURCES.txt +139 -0
  97. polyrag-0.1.0/polyrag.egg-info/dependency_links.txt +1 -0
  98. polyrag-0.1.0/polyrag.egg-info/requires.txt +99 -0
  99. polyrag-0.1.0/polyrag.egg-info/top_level.txt +8 -0
  100. polyrag-0.1.0/pyproject.toml +166 -0
  101. polyrag-0.1.0/setup.cfg +4 -0
  102. polyrag-0.1.0/structured_data/__init__.py +26 -0
  103. polyrag-0.1.0/structured_data/athena_storage.py +12 -0
  104. polyrag-0.1.0/structured_data/chunking.py +322 -0
  105. polyrag-0.1.0/structured_data/entity_consolidator.py +231 -0
  106. polyrag-0.1.0/structured_data/entity_resolver.py +680 -0
  107. polyrag-0.1.0/structured_data/evaluate_queries.py +222 -0
  108. polyrag-0.1.0/structured_data/evaluate_salary_queries.py +274 -0
  109. polyrag-0.1.0/structured_data/evaluation_assertions.py +110 -0
  110. polyrag-0.1.0/structured_data/extraction_normalizer.py +542 -0
  111. polyrag-0.1.0/structured_data/extractor.py +628 -0
  112. polyrag-0.1.0/structured_data/feature_flags.py +90 -0
  113. polyrag-0.1.0/structured_data/pipeline.py +1001 -0
  114. polyrag-0.1.0/structured_data/prompts.py +169 -0
  115. polyrag-0.1.0/structured_data/query_entity_resolver.py +235 -0
  116. polyrag-0.1.0/structured_data/query_executor.py +29 -0
  117. polyrag-0.1.0/structured_data/query_intent.py +176 -0
  118. polyrag-0.1.0/structured_data/query_prompts.py +397 -0
  119. polyrag-0.1.0/structured_data/query_result_processor.py +203 -0
  120. polyrag-0.1.0/structured_data/query_schemas.py +38 -0
  121. polyrag-0.1.0/structured_data/query_service.py +532 -0
  122. polyrag-0.1.0/structured_data/query_sql_generator.py +325 -0
  123. polyrag-0.1.0/structured_data/query_sql_guardrails.py +594 -0
  124. polyrag-0.1.0/structured_data/run.py +217 -0
  125. polyrag-0.1.0/structured_data/run_llm_rate_limit_stress.py +451 -0
  126. polyrag-0.1.0/structured_data/run_query.py +374 -0
  127. polyrag-0.1.0/structured_data/schemas.py +247 -0
  128. polyrag-0.1.0/structured_data/storage/__init__.py +11 -0
  129. polyrag-0.1.0/structured_data/storage/athena_query_executor.py +107 -0
  130. polyrag-0.1.0/structured_data/storage/athena_storage.py +504 -0
  131. polyrag-0.1.0/structured_data/storage/base.py +174 -0
  132. polyrag-0.1.0/structured_data/storage/factory.py +60 -0
  133. polyrag-0.1.0/structured_data/storage/mysql_storage.py +202 -0
  134. polyrag-0.1.0/structured_data/storage/postgres_storage.py +188 -0
  135. polyrag-0.1.0/structured_data/storage/sql_query_executor.py +179 -0
  136. polyrag-0.1.0/structured_data/storage/sqlite_storage.py +196 -0
  137. polyrag-0.1.0/structured_data/type_utils.py +307 -0
  138. polyrag-0.1.0/unified_pipeline/__init__.py +34 -0
  139. polyrag-0.1.0/unified_pipeline/config.py +117 -0
  140. polyrag-0.1.0/unified_pipeline/pipeline.py +249 -0
  141. polyrag-0.1.0/unified_pipeline/run_process.py +232 -0
polyrag-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Fermi-Dev
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
polyrag-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,380 @@
1
+ Metadata-Version: 2.4
2
+ Name: polyrag
3
+ Version: 0.1.0
4
+ Summary: Multi-modal document intelligence: BM25 + FAISS hybrid search, knowledge graph extraction, and structured SQL querying
5
+ License-Expression: MIT
6
+ Classifier: Development Status :: 4 - Beta
7
+ Classifier: Intended Audience :: Developers
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.10
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
14
+ Classifier: Topic :: Text Processing :: Indexing
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: numpy>=1.24.0
19
+ Requires-Dist: faiss-cpu>=1.7.4
20
+ Requires-Dist: sentence-transformers>=3.0.0
21
+ Requires-Dist: python-dotenv>=1.0.0
22
+ Requires-Dist: tqdm>=4.67.1
23
+ Provides-Extra: llm
24
+ Requires-Dist: openai>=1.0.0; extra == "llm"
25
+ Requires-Dist: langchain>=0.3.0; extra == "llm"
26
+ Requires-Dist: langchain-core>=0.3.0; extra == "llm"
27
+ Requires-Dist: langchain-openai>=0.3.0; extra == "llm"
28
+ Requires-Dist: langchain-google-genai>=2.0.0; extra == "llm"
29
+ Requires-Dist: langchain-huggingface>=0.1.0; extra == "llm"
30
+ Requires-Dist: langchain-ollama>=0.2.0; extra == "llm"
31
+ Requires-Dist: accelerate>=0.27.0; extra == "llm"
32
+ Provides-Extra: structured
33
+ Requires-Dist: langchain>=0.3.0; extra == "structured"
34
+ Requires-Dist: langchain-core>=0.3.0; extra == "structured"
35
+ Requires-Dist: langchain-community>=0.3.0; extra == "structured"
36
+ Requires-Dist: langchain-experimental>=0.3.0; extra == "structured"
37
+ Requires-Dist: langchain-text-splitters>=0.3.0; extra == "structured"
38
+ Requires-Dist: pandas>=2.0.0; extra == "structured"
39
+ Requires-Dist: sqlparse>=0.5.0; extra == "structured"
40
+ Requires-Dist: tabulate>=0.9.0; extra == "structured"
41
+ Requires-Dist: scipy>=1.10.0; extra == "structured"
42
+ Requires-Dist: scikit-learn>=1.3.0; extra == "structured"
43
+ Requires-Dist: nest-asyncio>=1.6.0; extra == "structured"
44
+ Provides-Extra: kg
45
+ Requires-Dist: networkx>=3.0; extra == "kg"
46
+ Requires-Dist: rdflib>=7.0.0; extra == "kg"
47
+ Requires-Dist: neo4j>=5.15.0; extra == "kg"
48
+ Requires-Dist: langchain-neo4j>=0.4.0; extra == "kg"
49
+ Requires-Dist: langgraph>=0.5.0; extra == "kg"
50
+ Requires-Dist: graphiti-core>=0.24.0; extra == "kg"
51
+ Requires-Dist: pyvis>=0.3.2; extra == "kg"
52
+ Requires-Dist: plotly>=6.0.0; extra == "kg"
53
+ Requires-Dist: matplotlib>=3.7.0; extra == "kg"
54
+ Provides-Extra: parsers
55
+ Requires-Dist: pypdf2>=3.0.0; extra == "parsers"
56
+ Requires-Dist: pdfplumber>=0.11.0; extra == "parsers"
57
+ Requires-Dist: pdf2image>=1.17.0; extra == "parsers"
58
+ Requires-Dist: python-docx>=1.1.0; extra == "parsers"
59
+ Requires-Dist: python-pptx>=0.6.21; extra == "parsers"
60
+ Requires-Dist: openpyxl>=3.1.0; extra == "parsers"
61
+ Requires-Dist: docx2txt>=0.9; extra == "parsers"
62
+ Requires-Dist: reportlab>=4.0.0; extra == "parsers"
63
+ Requires-Dist: docling>=2.0.0; extra == "parsers"
64
+ Requires-Dist: llama-cloud-services>=0.6.0; extra == "parsers"
65
+ Provides-Extra: db-postgres
66
+ Requires-Dist: psycopg>=3.2.0; extra == "db-postgres"
67
+ Requires-Dist: psycopg2-binary>=2.9.0; extra == "db-postgres"
68
+ Requires-Dist: psycopg-binary>=3.2.0; extra == "db-postgres"
69
+ Provides-Extra: db-mysql
70
+ Requires-Dist: mysql-connector-python>=8.3.0; extra == "db-mysql"
71
+ Requires-Dist: pymysql>=1.1.0; extra == "db-mysql"
72
+ Provides-Extra: db-mongodb
73
+ Requires-Dist: pymongo>=4.14.0; extra == "db-mongodb"
74
+ Requires-Dist: langchain-mongodb>=0.7.0; extra == "db-mongodb"
75
+ Provides-Extra: db-elastic
76
+ Requires-Dist: elasticsearch>=9.0.0; extra == "db-elastic"
77
+ Requires-Dist: opensearch-py>=3.0.0; extra == "db-elastic"
78
+ Provides-Extra: aws
79
+ Requires-Dist: boto3>=1.34.0; extra == "aws"
80
+ Requires-Dist: awswrangler>=3.0.0; extra == "aws"
81
+ Requires-Dist: pyathena>=3.22.0; extra == "aws"
82
+ Requires-Dist: certifi>=2025.0.0; extra == "aws"
83
+ Requires-Dist: aws-sqs-consumer>=0.0.15; extra == "aws"
84
+ Requires-Dist: requests-aws4auth>=1.3.0; extra == "aws"
85
+ Provides-Extra: api
86
+ Requires-Dist: fastapi[standard]>=0.116.0; extra == "api"
87
+ Requires-Dist: uvicorn>=0.35.0; extra == "api"
88
+ Requires-Dist: fastapi-cli>=0.0.13; extra == "api"
89
+ Requires-Dist: python-multipart>=0.0.20; extra == "api"
90
+ Requires-Dist: httpx>=0.28.0; extra == "api"
91
+ Requires-Dist: fastmcp>=2.13.0; extra == "api"
92
+ Requires-Dist: streamlit>=1.46.0; extra == "api"
93
+ Requires-Dist: requests>=2.32.0; extra == "api"
94
+ Requires-Dist: cryptography>=45.0.0; extra == "api"
95
+ Requires-Dist: redis>=5.0.0; extra == "api"
96
+ Requires-Dist: agentlightning>=0.1.0; extra == "api"
97
+ Requires-Dist: clear>=2.0.0; extra == "api"
98
+ Provides-Extra: dev
99
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
100
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
101
+ Requires-Dist: twine>=5.0.0; extra == "dev"
102
+ Requires-Dist: build>=1.0.0; extra == "dev"
103
+ Provides-Extra: all
104
+ Requires-Dist: polyrag[api,aws,db-elastic,db-mongodb,db-mysql,db-postgres,kg,llm,parsers,structured]; extra == "all"
105
+ Dynamic: license-file
106
+
107
+ # polyrag — Document Intelligence Platform
108
+
109
+ ## Goal
110
+
111
+ Transform unstructured documents (PDFs, Word files, spreadsheets, etc.) into a **queryable structured database** — then let users ask natural language questions and get precise, SQL-computed answers instead of LLM-hallucinated guesses.
112
+
113
+ Optionally pair structured extraction with **OpenRAG** — a fully open-source BM25 + FAISS retrieval layer — for semantic and keyword-based document search without any paid APIs.
114
+
115
+ ## Why This Exists (vs. Basic RAG)
116
+
117
+ Traditional RAG (e.g., OpenSearch + embeddings) retrieves text chunks and asks an LLM to synthesize answers. This fails for:
118
+
119
+ - **Numerical questions** — "What's the total salary expenditure?" (LLM can't reliably add numbers from chunks)
120
+ - **Aggregations** — SUM, AVG, COUNT, MAX, ranking across documents
121
+ - **Cross-document comparison** — same entity appearing in 8 yearly reports
122
+ - **Consistency** — same question returns different answers depending on which chunks are retrieved
123
+
124
+ This platform **extracts structured data first**, stores it in typed SQL tables, then uses the LLM only to generate the SQL query — the actual computation is done by the database engine, eliminating hallucination for factual/numerical answers.
125
+
126
+ For free-text semantic search, **OpenRAG** adds BM25 keyword search (inverted index) and FAISS vector search (HNSW embeddings) as a parallel retrieval layer — both fully open-source.
127
+
128
+ ## Architecture
129
+
130
+ ```
131
+ Raw Documents
132
+ |
133
+ v
134
+ [parsers/] ─────────────> Unified Parser (Docling / LlamaParse / pdfplumber)
135
+ |
136
+ |─────────────────────────────────────────────────────────────────┐
137
+ v v
138
+ [structured_data/] [openrag/]
139
+ |── extractor.py ────> LLM-based 3-tier extraction |── stores/
140
+ |── entity_resolver.py > 5-stage entity resolution | |── text_sqlite.py BM25 (SQLite)
141
+ |── entity_consolidator> Merge into master records | |── text_postgres.py BM25 (Postgres)
142
+ |── storage/ ──> Pluggable SQL backend | |── text_mysql.py BM25 (MySQL)
143
+ | |── athena (AWS Athena / Iceberg) | |── text_mongodb.py BM25 (MongoDB)
144
+ | |── postgres (PostgreSQL) | └── vector_store.py FAISS HNSW
145
+ | |── mysql (MySQL) |── embedding/
146
+ | └── sqlite (local) | └── sentence_transformers
147
+ |── query_service.py ──> NL → SQL → results |── indexer.py index_file / index_text
148
+ | |── retriever.py retrieve (hybrid/text/vector)
149
+ v |── run_index.py CLI
150
+ [connectors/] ─────────> MongoDB, Neo4j, AWS SQS └── run_query.py CLI
151
+ |
152
+ v
153
+ [llm/] ─────────────────> OpenAI, Google Gemini providers
154
+ |
155
+ v
156
+ [unified_pipeline/] ────> Orchestrate both pipelines (parse once, share text)
157
+ ```
158
+
159
+ ## Modules
160
+
161
+ | Module | Purpose |
162
+ |--------|---------|
163
+ | [parsers/](parsers/) | Multi-format document parsing (PDF, DOCX, XLSX, CSV, PPTX, TXT, XML) |
164
+ | [llm/](llm/) | LLM provider abstraction, model management, Pydantic schemas |
165
+ | [connectors/](connectors/) | Database connectors (MongoDB, Neo4j, AWS SQS) |
166
+ | [structured_data/](structured_data/) | Core pipeline: LLM extraction, entity resolution, pluggable SQL storage, NL querying |
167
+ | [openrag/](openrag/) | Open-source BM25 + FAISS retrieval: index and search document chunks |
168
+ | [unified_pipeline/](unified_pipeline/) | Flexible orchestration: run structured, RAG, or both with one call |
169
+ | [search_algorithms/](search_algorithms/) | Reference BM25, HNSW, TF-IDF implementations |
170
+
171
+ ## Data Flow
172
+
173
+ **Structured extraction (ingestion):**
174
+ ```
175
+ Document file -> Parse to text (pdfplumber / Docling)
176
+ -> LLM extracts structured rows (entity, metric, date, attribute...)
177
+ -> Entity resolver links mentions across documents
178
+ -> Consolidator merges into master records (MongoDB)
179
+ -> Store typed rows in pluggable SQL backend (Athena / Postgres / MySQL / SQLite)
180
+ ```
181
+
182
+ **RAG indexing (OpenRAG):**
183
+ ```
184
+ Document file -> Parse to text
185
+ -> Chunk into overlapping windows (~10k chars, 500 overlap)
186
+ -> Embed with sentence-transformers (any HuggingFace model)
187
+ -> Store BM25 inverted index in chosen text backend
188
+ -> Store embeddings in FAISS IndexHNSWFlat + JSON sidecar
189
+ ```
190
+
191
+ **Unified (both together):**
192
+ ```
193
+ Document file -> Parse ONCE (shared text)
194
+ -> Structured extraction → entity metadata (names, types, doc_type)
195
+ -> RAG indexing with entity metadata attached to every chunk
196
+ -> Query via SQL (precise aggregations) OR vector/BM25 (semantic search)
197
+ ```
198
+
199
+ **Querying:**
200
+ ```
201
+ NL question -> Extract entity candidates from question
202
+ -> Resolve against MongoDB entities_master
203
+ -> LLM generates SQL with entity context
204
+ -> Execute SQL on chosen backend
205
+ -> Return precise, computed results
206
+
207
+ OR
208
+
209
+ NL question -> BM25 keyword search + FAISS vector search
210
+ -> Reciprocal Rank Fusion (RRF) for hybrid results
211
+ -> Return ranked chunks with full text + metadata
212
+ ```
213
+
214
+ ## Tech Stack
215
+
216
+ - **Python 3.10+** (tested through 3.13) with **uv** package manager
217
+ - **LLM Providers**: OpenAI (GPT-4o), Google Gemini
218
+ - **Document Parsing**: Docling (local), LlamaParse (cloud), pdfplumber
219
+ - **Structured Storage**: AWS Athena/Iceberg, PostgreSQL, MySQL, SQLite (pluggable)
220
+ - **Entity Master**: MongoDB
221
+ - **Graph**: Neo4j (optional)
222
+ - **Queue**: AWS SQS for async processing
223
+ - **Embeddings**: sentence-transformers (any HuggingFace model, fully open-source)
224
+ - **Vector Index**: FAISS IndexHNSWFlat
225
+ - **Text Search**: BM25 inverted index (SQLite / PostgreSQL / MySQL / MongoDB)
226
+ - **Frameworks**: FastAPI, LangChain, LangGraph, Pydantic
227
+
228
+ ## Setup
229
+
230
+ ```bash
231
+ # Install dependencies
232
+ uv sync
233
+
234
+ # Configure environment
235
+ cp .env.example .env
236
+ # Fill in: OPENAI_API_KEY, mongo_connection_string, AWS credentials, etc.
237
+ ```
238
+
239
+ ## Quick Start
240
+
241
+ ### Structured Extraction
242
+
243
+ ```python
244
+ from structured_data import StructuredDataPipeline
245
+
246
+ pipeline = StructuredDataPipeline()
247
+ result = pipeline.process_file(
248
+ file_path="document.pdf",
249
+ tenant_id="org_123",
250
+ user_id="user_456",
251
+ resource_id="res_789",
252
+ )
253
+ print(result["extraction_count"]) # number of entities extracted
254
+ print(result["document_id"]) # stable doc identifier
255
+
256
+ # Query extracted data with natural language
257
+ from structured_data import StructuredDataQueryService
258
+
259
+ svc = StructuredDataQueryService()
260
+ answer = svc.query("What is the average salary in Engineering?", tenant_id="org_123")
261
+ print(answer["rows"]) # precise SQL-computed result
262
+ ```
263
+
264
+ Choose your storage backend via `OPENRAG_STORAGE_BACKEND` env var:
265
+
266
+ ```bash
267
+ OPENRAG_STORAGE_BACKEND=sqlite # local SQLite (default for dev)
268
+ OPENRAG_STORAGE_BACKEND=postgres # PostgreSQL
269
+ OPENRAG_STORAGE_BACKEND=mysql # MySQL
270
+ OPENRAG_STORAGE_BACKEND=athena # AWS Athena / Iceberg (production)
271
+ ```
272
+
273
+ ### OpenRAG — Semantic + Keyword Search
274
+
275
+ ```bash
276
+ # Index a document (BM25 + FAISS)
277
+ python -m openrag.run_index document.pdf \
278
+ --text-backend sqlite \
279
+ --embedding-model all-MiniLM-L6-v2 \
280
+ --verbose
281
+
282
+ # Hybrid search (BM25 + vector, fused with RRF)
283
+ python -m openrag.run_query "engineering salary 2024" \
284
+ --mode hybrid --top-k 5 --show-text
285
+
286
+ # Text-only (BM25)
287
+ python -m openrag.run_query "invoice total amount" --mode text --top-k 5
288
+
289
+ # Vector-only (FAISS HNSW)
290
+ python -m openrag.run_query "compensation benefits" --mode vector --top-k 5
291
+ ```
292
+
293
+ Text backends for BM25: `sqlite` (default), `postgres`, `mysql`, `mongodb`.
294
+ Embedding models: any model on HuggingFace via sentence-transformers (e.g. `all-MiniLM-L6-v2`, `BAAI/bge-small-en-v1.5`).
295
+
296
+ ```python
297
+ from openrag import OpenRAGConfig, OpenRAGIndexer, OpenRAGRetriever
298
+
299
+ cfg = OpenRAGConfig(
300
+ text_backend="sqlite",
301
+ embedding_model="all-MiniLM-L6-v2",
302
+ retrieval_mode="hybrid",
303
+ )
304
+ OpenRAGIndexer(cfg).index_file("document.pdf")
305
+
306
+ retriever = OpenRAGRetriever(cfg)
307
+ results = retriever.retrieve("engineering salary", top_k=5)
308
+ for r in results:
309
+ print(r["retrieval_score"], r["text"][:120])
310
+ print(r.get("metadata", {})) # entity_names, entity_types, doc_type (if indexed via unified pipeline)
311
+ ```
312
+
313
+ ### Unified Pipeline — Structured + RAG Together
314
+
315
+ Parse once, run both pipelines, share entity metadata as chunk enrichment:
316
+
317
+ ```bash
318
+ # Both pipelines (parse once, entity metadata enriches RAG chunks)
319
+ python -m unified_pipeline.run_process document.pdf \
320
+ --mode both \
321
+ --tenant-id org_123 --user-id user_456 --resource-id res_789 \
322
+ --text-backend sqlite \
323
+ --embedding-model all-MiniLM-L6-v2 \
324
+ --verbose
325
+
326
+ # Structured extraction only
327
+ python -m unified_pipeline.run_process document.pdf \
328
+ --mode structured-only \
329
+ --tenant-id org_123 --user-id user_456 --resource-id res_789
330
+
331
+ # RAG indexing only
332
+ python -m unified_pipeline.run_process document.pdf \
333
+ --mode rag-only --rag-retrieval hybrid \
334
+ --text-backend sqlite --embedding-model all-MiniLM-L6-v2
335
+ ```
336
+
337
+ ```python
338
+ from unified_pipeline import UnifiedDocumentPipeline, UnifiedPipelineConfig
339
+ from openrag import OpenRAGConfig
340
+
341
+ cfg = UnifiedPipelineConfig(
342
+ enable_structured=True,
343
+ enable_rag=True,
344
+ rag_retrieval_mode="hybrid",
345
+ tenant_id="org_123",
346
+ user_id="user_456",
347
+ resource_id="res_789",
348
+ openrag=OpenRAGConfig(text_backend="sqlite"),
349
+ )
350
+ pipeline = UnifiedDocumentPipeline(cfg)
351
+ result = pipeline.process_file("document.pdf")
352
+
353
+ print(result["structured"]["extraction_count"]) # entities extracted
354
+ print(result["rag"]["chunks_indexed"]) # chunks in BM25 + FAISS
355
+ # In "both" mode, both systems share the same document_id
356
+ print(result["structured"]["document_id"] == result["rag"]["doc_id"]) # True
357
+ ```
358
+
359
+ When running in `both` mode:
360
+ - The document is parsed **once** and the text is shared between both pipelines
361
+ - Extracted entity names, types, and document type are stored as metadata on every RAG chunk
362
+ - Both systems use the same `document_id` so you can filter/query by it in either system
363
+
364
+ ## Environment Variables
365
+
366
+ | Variable | Default | Description |
367
+ |----------|---------|-------------|
368
+ | `OPENRAG_STORAGE_BACKEND` | `athena` | Structured extraction backend: `athena`, `postgres`, `mysql`, `sqlite` |
369
+ | `OPENRAG_TEXT_BACKEND` | `sqlite` | RAG BM25 backend: `sqlite`, `postgres`, `mysql`, `mongodb` |
370
+ | `OPENRAG_EMBEDDING_MODEL` | `all-MiniLM-L6-v2` | sentence-transformers model for embeddings |
371
+ | `OPENRAG_EMBEDDING_DEVICE` | `cpu` | Compute device: `cpu`, `cuda`, `mps` |
372
+ | `OPENRAG_FAISS_INDEX_PATH` | `./openrag.faiss` | FAISS binary index file path |
373
+ | `OPENRAG_FAISS_METADATA_PATH` | `./openrag_meta.json` | FAISS JSON sidecar path |
374
+ | `OPENRAG_RETRIEVAL_MODE` | `hybrid` | Default retrieval mode: `hybrid`, `text`, `vector` |
375
+ | `PIPELINE_ENABLE_STRUCTURED` | `true` | Enable structured extraction in unified pipeline |
376
+ | `PIPELINE_ENABLE_RAG` | `true` | Enable RAG indexing in unified pipeline |
377
+ | `PIPELINE_RAG_RETRIEVAL_MODE` | `hybrid` | Default RAG mode in unified pipeline |
378
+ | `PIPELINE_TENANT_ID` | `` | Tenant context for structured pipeline |
379
+ | `PIPELINE_USER_ID` | `` | User context for structured pipeline |
380
+ | `PIPELINE_RESOURCE_ID` | `` | Resource context for structured pipeline |
@@ -0,0 +1,274 @@
1
+ # doc_pros — Document Intelligence Platform
2
+
3
+ ## Goal
4
+
5
+ Transform unstructured documents (PDFs, Word files, spreadsheets, etc.) into a **queryable structured database** — then let users ask natural language questions and get precise, SQL-computed answers instead of LLM-hallucinated guesses.
6
+
7
+ Optionally pair structured extraction with **OpenRAG** — a fully open-source BM25 + FAISS retrieval layer — for semantic and keyword-based document search without any paid APIs.
8
+
9
+ ## Why This Exists (vs. Basic RAG)
10
+
11
+ Traditional RAG (e.g., OpenSearch + embeddings) retrieves text chunks and asks an LLM to synthesize answers. This fails for:
12
+
13
+ - **Numerical questions** — "What's the total salary expenditure?" (LLM can't reliably add numbers from chunks)
14
+ - **Aggregations** — SUM, AVG, COUNT, MAX, ranking across documents
15
+ - **Cross-document comparison** — same entity appearing in 8 yearly reports
16
+ - **Consistency** — same question returns different answers depending on which chunks are retrieved
17
+
18
+ This platform **extracts structured data first**, stores it in typed SQL tables, then uses the LLM only to generate the SQL query — the actual computation is done by the database engine, eliminating hallucination for factual/numerical answers.
19
+
20
+ For free-text semantic search, **OpenRAG** adds BM25 keyword search (inverted index) and FAISS vector search (HNSW embeddings) as a parallel retrieval layer — both fully open-source.
21
+
22
+ ## Architecture
23
+
24
+ ```
25
+ Raw Documents
26
+ |
27
+ v
28
+ [parsers/] ─────────────> Unified Parser (Docling / LlamaParse / pdfplumber)
29
+ |
30
+ |─────────────────────────────────────────────────────────────────┐
31
+ v v
32
+ [structured_data/] [openrag/]
33
+ |── extractor.py ────> LLM-based 3-tier extraction |── stores/
34
+ |── entity_resolver.py > 5-stage entity resolution | |── text_sqlite.py BM25 (SQLite)
35
+ |── entity_consolidator> Merge into master records | |── text_postgres.py BM25 (Postgres)
36
+ |── storage/ ──> Pluggable SQL backend | |── text_mysql.py BM25 (MySQL)
37
+ | |── athena (AWS Athena / Iceberg) | |── text_mongodb.py BM25 (MongoDB)
38
+ | |── postgres (PostgreSQL) | └── vector_store.py FAISS HNSW
39
+ | |── mysql (MySQL) |── embedding/
40
+ | └── sqlite (local) | └── sentence_transformers
41
+ |── query_service.py ──> NL → SQL → results |── indexer.py index_file / index_text
42
+ | |── retriever.py retrieve (hybrid/text/vector)
43
+ v |── run_index.py CLI
44
+ [connectors/] ─────────> MongoDB, Neo4j, AWS SQS └── run_query.py CLI
45
+ |
46
+ v
47
+ [llm/] ─────────────────> OpenAI, Google Gemini providers
48
+ |
49
+ v
50
+ [unified_pipeline/] ────> Orchestrate both pipelines (parse once, share text)
51
+ ```
52
+
53
+ ## Modules
54
+
55
+ | Module | Purpose |
56
+ |--------|---------|
57
+ | [parsers/](parsers/) | Multi-format document parsing (PDF, DOCX, XLSX, CSV, PPTX, TXT, XML) |
58
+ | [llm/](llm/) | LLM provider abstraction, model management, Pydantic schemas |
59
+ | [connectors/](connectors/) | Database connectors (MongoDB, Neo4j, AWS SQS) |
60
+ | [structured_data/](structured_data/) | Core pipeline: LLM extraction, entity resolution, pluggable SQL storage, NL querying |
61
+ | [openrag/](openrag/) | Open-source BM25 + FAISS retrieval: index and search document chunks |
62
+ | [unified_pipeline/](unified_pipeline/) | Flexible orchestration: run structured, RAG, or both with one call |
63
+ | [search_algorithms/](search_algorithms/) | Reference BM25, HNSW, TF-IDF implementations |
64
+
65
+ ## Data Flow
66
+
67
+ **Structured extraction (ingestion):**
68
+ ```
69
+ Document file -> Parse to text (pdfplumber / Docling)
70
+ -> LLM extracts structured rows (entity, metric, date, attribute...)
71
+ -> Entity resolver links mentions across documents
72
+ -> Consolidator merges into master records (MongoDB)
73
+ -> Store typed rows in pluggable SQL backend (Athena / Postgres / MySQL / SQLite)
74
+ ```
75
+
76
+ **RAG indexing (OpenRAG):**
77
+ ```
78
+ Document file -> Parse to text
79
+ -> Chunk into overlapping windows (~10k chars, 500 overlap)
80
+ -> Embed with sentence-transformers (any HuggingFace model)
81
+ -> Store BM25 inverted index in chosen text backend
82
+ -> Store embeddings in FAISS IndexHNSWFlat + JSON sidecar
83
+ ```
84
+
85
+ **Unified (both together):**
86
+ ```
87
+ Document file -> Parse ONCE (shared text)
88
+ -> Structured extraction yields entity metadata (names, types, doc_type)
89
+ -> RAG indexing with entity metadata attached to every chunk
90
+ -> Query via SQL (precise aggregations) OR vector/BM25 (semantic search)
91
+ ```
92
+
93
+ **Querying:**
94
+ ```
95
+ NL question -> Extract entity candidates from question
96
+ -> Resolve against MongoDB entities_master
97
+ -> LLM generates SQL with entity context
98
+ -> Execute SQL on chosen backend
99
+ -> Return precise, computed results
100
+
101
+ OR
102
+
103
+ NL question -> BM25 keyword search + FAISS vector search
104
+ -> Reciprocal Rank Fusion (RRF) for hybrid results
105
+ -> Return ranked chunks with full text + metadata
106
+ ```
107
+
108
+ ## Tech Stack
109
+
110
+ - **Python 3.13+** with **uv** package manager
111
+ - **LLM Providers**: OpenAI (GPT-4o), Google Gemini
112
+ - **Document Parsing**: Docling (local), LlamaParse (cloud), pdfplumber
113
+ - **Structured Storage**: AWS Athena/Iceberg, PostgreSQL, MySQL, SQLite (pluggable)
114
+ - **Entity Master**: MongoDB
115
+ - **Graph**: Neo4j (optional)
116
+ - **Queue**: AWS SQS for async processing
117
+ - **Embeddings**: sentence-transformers (any HuggingFace model, fully open-source)
118
+ - **Vector Index**: FAISS IndexHNSWFlat
119
+ - **Text Search**: BM25 inverted index (SQLite / PostgreSQL / MySQL / MongoDB)
120
+ - **Frameworks**: FastAPI, LangChain, LangGraph, Pydantic
121
+
122
+ ## Setup
123
+
124
+ ```bash
125
+ # Install dependencies
126
+ uv sync
127
+
128
+ # Configure environment
129
+ cp .env.example .env
130
+ # Fill in: OPENAI_API_KEY, mongo_connection_string, AWS credentials, etc.
131
+ ```
132
+
133
+ ## Quick Start
134
+
135
+ ### Structured Extraction
136
+
137
+ ```python
138
+ from structured_data import StructuredDataPipeline
139
+
140
+ pipeline = StructuredDataPipeline()
141
+ result = pipeline.process_file(
142
+ file_path="document.pdf",
143
+ tenant_id="org_123",
144
+ user_id="user_456",
145
+ resource_id="res_789",
146
+ )
147
+ print(result["extraction_count"]) # number of entities extracted
148
+ print(result["document_id"]) # stable doc identifier
149
+
150
+ # Query extracted data with natural language
151
+ from structured_data import StructuredDataQueryService
152
+
153
+ svc = StructuredDataQueryService()
154
+ answer = svc.query("What is the average salary in Engineering?", tenant_id="org_123")
155
+ print(answer["rows"]) # precise SQL-computed result
156
+ ```
157
+
158
+ Choose your storage backend via `OPENRAG_STORAGE_BACKEND` env var:
159
+
160
+ ```bash
161
+ OPENRAG_STORAGE_BACKEND=sqlite # local SQLite (recommended for dev; overall default is athena)
162
+ OPENRAG_STORAGE_BACKEND=postgres # PostgreSQL
163
+ OPENRAG_STORAGE_BACKEND=mysql # MySQL
164
+ OPENRAG_STORAGE_BACKEND=athena # AWS Athena / Iceberg (production)
165
+ ```
166
+
167
+ ### OpenRAG — Semantic + Keyword Search
168
+
169
+ ```bash
170
+ # Index a document (BM25 + FAISS)
171
+ python -m openrag.run_index document.pdf \
172
+ --text-backend sqlite \
173
+ --embedding-model all-MiniLM-L6-v2 \
174
+ --verbose
175
+
176
+ # Hybrid search (BM25 + vector, fused with RRF)
177
+ python -m openrag.run_query "engineering salary 2024" \
178
+ --mode hybrid --top-k 5 --show-text
179
+
180
+ # Text-only (BM25)
181
+ python -m openrag.run_query "invoice total amount" --mode text --top-k 5
182
+
183
+ # Vector-only (FAISS HNSW)
184
+ python -m openrag.run_query "compensation benefits" --mode vector --top-k 5
185
+ ```
186
+
187
+ Text backends for BM25: `sqlite` (default), `postgres`, `mysql`, `mongodb`.
188
+ Embedding models: any model on HuggingFace via sentence-transformers (e.g. `all-MiniLM-L6-v2`, `BAAI/bge-small-en-v1.5`).
189
+
190
+ ```python
191
+ from openrag import OpenRAGConfig, OpenRAGIndexer, OpenRAGRetriever
192
+
193
+ cfg = OpenRAGConfig(
194
+ text_backend="sqlite",
195
+ embedding_model="all-MiniLM-L6-v2",
196
+ retrieval_mode="hybrid",
197
+ )
198
+ OpenRAGIndexer(cfg).index_file("document.pdf")
199
+
200
+ retriever = OpenRAGRetriever(cfg)
201
+ results = retriever.retrieve("engineering salary", top_k=5)
202
+ for r in results:
203
+ print(r["retrieval_score"], r["text"][:120])
204
+ print(r.get("metadata", {})) # entity_names, entity_types, doc_type (if indexed via unified pipeline)
205
+ ```
206
+
207
+ ### Unified Pipeline — Structured + RAG Together
208
+
209
+ Parse once, run both pipelines, share entity metadata as chunk enrichment:
210
+
211
+ ```bash
212
+ # Both pipelines (parse once, entity metadata enriches RAG chunks)
213
+ python -m unified_pipeline.run_process document.pdf \
214
+ --mode both \
215
+ --tenant-id org_123 --user-id user_456 --resource-id res_789 \
216
+ --text-backend sqlite \
217
+ --embedding-model all-MiniLM-L6-v2 \
218
+ --verbose
219
+
220
+ # Structured extraction only
221
+ python -m unified_pipeline.run_process document.pdf \
222
+ --mode structured-only \
223
+ --tenant-id org_123 --user-id user_456 --resource-id res_789
224
+
225
+ # RAG indexing only
226
+ python -m unified_pipeline.run_process document.pdf \
227
+ --mode rag-only --rag-retrieval hybrid \
228
+ --text-backend sqlite --embedding-model all-MiniLM-L6-v2
229
+ ```
230
+
231
+ ```python
232
+ from unified_pipeline import UnifiedDocumentPipeline, UnifiedPipelineConfig
233
+ from openrag import OpenRAGConfig
234
+
235
+ cfg = UnifiedPipelineConfig(
236
+ enable_structured=True,
237
+ enable_rag=True,
238
+ rag_retrieval_mode="hybrid",
239
+ tenant_id="org_123",
240
+ user_id="user_456",
241
+ resource_id="res_789",
242
+ openrag=OpenRAGConfig(text_backend="sqlite"),
243
+ )
244
+ pipeline = UnifiedDocumentPipeline(cfg)
245
+ result = pipeline.process_file("document.pdf")
246
+
247
+ print(result["structured"]["extraction_count"]) # entities extracted
248
+ print(result["rag"]["chunks_indexed"]) # chunks in BM25 + FAISS
249
+ # In "both" mode, both systems share the same document_id
250
+ print(result["structured"]["document_id"] == result["rag"]["doc_id"]) # True
251
+ ```
252
+
253
+ When running in `both` mode:
254
+ - The document is parsed **once** and the text is shared between both pipelines
255
+ - Extracted entity names, types, and document type are stored as metadata on every RAG chunk
256
+ - Both systems use the same `document_id` so you can filter/query by it in either system
257
+
258
+ ## Environment Variables
259
+
260
+ | Variable | Default | Description |
261
+ |----------|---------|-------------|
262
+ | `OPENRAG_STORAGE_BACKEND` | `athena` | Structured extraction backend: `athena`, `postgres`, `mysql`, `sqlite` |
263
+ | `OPENRAG_TEXT_BACKEND` | `sqlite` | RAG BM25 backend: `sqlite`, `postgres`, `mysql`, `mongodb` |
264
+ | `OPENRAG_EMBEDDING_MODEL` | `all-MiniLM-L6-v2` | sentence-transformers model for embeddings |
265
+ | `OPENRAG_EMBEDDING_DEVICE` | `cpu` | Compute device: `cpu`, `cuda`, `mps` |
266
+ | `OPENRAG_FAISS_INDEX_PATH` | `./openrag.faiss` | FAISS binary index file path |
267
+ | `OPENRAG_FAISS_METADATA_PATH` | `./openrag_meta.json` | FAISS JSON sidecar path |
268
+ | `OPENRAG_RETRIEVAL_MODE` | `hybrid` | Default retrieval mode: `hybrid`, `text`, `vector` |
269
+ | `PIPELINE_ENABLE_STRUCTURED` | `true` | Enable structured extraction in unified pipeline |
270
+ | `PIPELINE_ENABLE_RAG` | `true` | Enable RAG indexing in unified pipeline |
271
+ | `PIPELINE_RAG_RETRIEVAL_MODE` | `hybrid` | Default RAG mode in unified pipeline |
272
+ | `PIPELINE_TENANT_ID` | *(unset)* | Tenant context for structured pipeline |
273
+ | `PIPELINE_USER_ID` | *(unset)* | User context for structured pipeline |
274
+ | `PIPELINE_RESOURCE_ID` | *(unset)* | Resource context for structured pipeline |
File without changes
File without changes