rakam-systems-vectorstore 0.1.1rc7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. rakam_systems_vectorstore-0.1.1rc7/.gitignore +46 -0
  2. rakam_systems_vectorstore-0.1.1rc7/.python-version +1 -0
  3. rakam_systems_vectorstore-0.1.1rc7/PKG-INFO +370 -0
  4. rakam_systems_vectorstore-0.1.1rc7/README.md +301 -0
  5. rakam_systems_vectorstore-0.1.1rc7/main.py +6 -0
  6. rakam_systems_vectorstore-0.1.1rc7/pyproject.toml +111 -0
  7. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/MANIFEST.in +26 -0
  8. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/README.md +1071 -0
  9. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/__init__.py +93 -0
  10. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/__init__.py +0 -0
  11. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/chunker/__init__.py +19 -0
  12. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/chunker/advanced_chunker.py +1019 -0
  13. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/chunker/text_chunker.py +154 -0
  14. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/embedding_model/__init__.py +0 -0
  15. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/embedding_model/configurable_embeddings.py +546 -0
  16. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/embedding_model/openai_embeddings.py +259 -0
  17. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/__init__.py +31 -0
  18. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/adaptive_loader.py +512 -0
  19. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/code_loader.py +699 -0
  20. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/doc_loader.py +812 -0
  21. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/eml_loader.py +556 -0
  22. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/html_loader.py +626 -0
  23. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/md_loader.py +622 -0
  24. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/odt_loader.py +750 -0
  25. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/pdf_loader.py +771 -0
  26. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/pdf_loader_light.py +723 -0
  27. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/loader/tabular_loader.py +597 -0
  28. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/__init__.py +0 -0
  29. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/apps.py +10 -0
  30. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/configurable_pg_vector_store.py +1661 -0
  31. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/faiss_vector_store.py +878 -0
  32. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/migrations/0001_initial.py +55 -0
  33. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/migrations/__init__.py +0 -0
  34. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/models.py +10 -0
  35. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/pg_models.py +97 -0
  36. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/components/vectorstore/pg_vector_store.py +827 -0
  37. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/config.py +266 -0
  38. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/core.py +8 -0
  39. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/pyproject.toml +113 -0
  40. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/server/README.md +290 -0
  41. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/server/__init__.py +20 -0
  42. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/server/mcp_server_vector.py +325 -0
  43. rakam_systems_vectorstore-0.1.1rc7/src/rakam_systems_vectorstore/setup.py +103 -0
@@ -0,0 +1,46 @@
1
+ # Python specific
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ __pycache__/
6
+ .pytest_cache/
7
+ *.so
8
+
9
+ # Environments
10
+ .env
11
+ *.env
12
+ *.venv*
13
+ venv/
14
+ *venv/
15
+ ENV/
16
+ env/
17
+ env.bak/
18
+ venv.bak/
19
+
20
+ # VS Code
21
+ .vscode/
22
+ .vscode/*
23
+
24
+ # PyCharm
25
+ .idea/
26
+ .idea/*
27
+
28
+ # OS specific
29
+ .DS_Store
30
+ Thumbs.db
31
+
32
+ # data
33
+ data/
34
+ dist/
35
+ logs/
36
+
37
+ # Build artifacts
38
+ *.egg-info/
39
+
40
+ # tracking data
41
+ agent_tracking/
42
+
43
+ # docs
44
+ docs/
45
+
46
+ temp_path/
@@ -0,0 +1,370 @@
1
+ Metadata-Version: 2.4
2
+ Name: rakam-systems-vectorstore
3
+ Version: 0.1.1rc7
4
+ Summary: Utility package for interacting with vectorstores
5
+ Project-URL: Homepage, https://github.com/Rakam-AI/rakam_systems-inhouse
6
+ Project-URL: Documentation, https://github.com/Rakam-AI/rakam_systems-inhouse
7
+ Project-URL: Repository, https://github.com/Rakam-AI/rakam_systems-inhouse
8
+ Project-URL: Issues, https://github.com/Rakam-AI/rakam_systems-inhouse/issues
9
+ Author-email: Mohamed Hilel <mohammedjassemhlel@gmail.com>, Peng Zheng <pengzheng990630@outlook.com>
10
+ Keywords: embeddings,faiss,pgvector,rag,semantic-search,vector-store
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: numpy>=1.24.0
21
+ Requires-Dist: pyyaml>=6.0
22
+ Requires-Dist: rakam-system-core
23
+ Requires-Dist: tqdm>=4.66.0
24
+ Provides-Extra: all
25
+ Requires-Dist: beautifulsoup4>=4.12.0; extra == 'all'
26
+ Requires-Dist: chonkie==1.4.2; extra == 'all'
27
+ Requires-Dist: cohere>=4.0.0; extra == 'all'
28
+ Requires-Dist: django>=4.0.0; extra == 'all'
29
+ Requires-Dist: docling==2.62.0; extra == 'all'
30
+ Requires-Dist: faiss-cpu>=1.12.0; extra == 'all'
31
+ Requires-Dist: odfpy==1.4.1; extra == 'all'
32
+ Requires-Dist: openai>=1.0.0; extra == 'all'
33
+ Requires-Dist: pgvector; extra == 'all'
34
+ Requires-Dist: psycopg2-binary>=2.9.9; extra == 'all'
35
+ Requires-Dist: pymupdf4llm>=0.0.17; extra == 'all'
36
+ Requires-Dist: pymupdf>=1.24.0; extra == 'all'
37
+ Requires-Dist: python-docx>=1.2.0; extra == 'all'
38
+ Requires-Dist: python-magic>=0.4.27; extra == 'all'
39
+ Requires-Dist: sentence-transformers>=5.1.0; extra == 'all'
40
+ Requires-Dist: torch>=2.0.0; extra == 'all'
41
+ Provides-Extra: cohere
42
+ Requires-Dist: cohere>=4.0.0; extra == 'cohere'
43
+ Provides-Extra: dev
44
+ Requires-Dist: black>=23.0.0; extra == 'dev'
45
+ Requires-Dist: pytest-django>=4.5.0; extra == 'dev'
46
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
47
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
48
+ Provides-Extra: faiss
49
+ Requires-Dist: faiss-cpu>=1.12.0; extra == 'faiss'
50
+ Provides-Extra: loaders
51
+ Requires-Dist: beautifulsoup4>=4.12.0; extra == 'loaders'
52
+ Requires-Dist: chonkie==1.4.2; extra == 'loaders'
53
+ Requires-Dist: docling==2.62.0; extra == 'loaders'
54
+ Requires-Dist: odfpy==1.4.1; extra == 'loaders'
55
+ Requires-Dist: pymupdf4llm>=0.0.17; extra == 'loaders'
56
+ Requires-Dist: pymupdf>=1.24.0; extra == 'loaders'
57
+ Requires-Dist: python-docx>=1.2.0; extra == 'loaders'
58
+ Requires-Dist: python-magic>=0.4.27; extra == 'loaders'
59
+ Provides-Extra: local-embeddings
60
+ Requires-Dist: sentence-transformers>=5.1.0; extra == 'local-embeddings'
61
+ Requires-Dist: torch>=2.0.0; extra == 'local-embeddings'
62
+ Provides-Extra: openai
63
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
64
+ Provides-Extra: postgres
65
+ Requires-Dist: django>=4.0.0; extra == 'postgres'
66
+ Requires-Dist: pgvector; extra == 'postgres'
67
+ Requires-Dist: psycopg2-binary>=2.9.9; extra == 'postgres'
68
+ Description-Content-Type: text/markdown
69
+
70
+ # Rakam Systems Vectorstore
71
+
72
+ The vectorstore package of Rakam Systems providing vector database solutions and document processing capabilities.
73
+
74
+ ## Overview
75
+
76
+ `rakam-systems-vectorstore` provides comprehensive vector storage, embedding models, and document loading capabilities. This package depends on `rakam-systems-core`.
77
+
78
+ ## Features
79
+
80
+ - **Configuration-First Design**: Change your entire vector store setup via YAML - no code changes
81
+ - **Multiple Backends**: PostgreSQL with pgvector and FAISS in-memory storage
82
+ - **Flexible Embeddings**: Support for SentenceTransformers, OpenAI, and Cohere
83
+ - **Document Loaders**: PDF, DOCX, HTML, Markdown, CSV, and more
84
+ - **Search Capabilities**: Vector search, keyword search (BM25), and hybrid search
85
+ - **Chunking**: Intelligent text chunking with context preservation
86
+ - **Configuration**: Comprehensive YAML/JSON configuration support
87
+
88
+ ### 🎯 Configuration Convenience
89
+
90
+ The vectorstore package's configurable design allows you to:
91
+
92
+ - **Switch embedding models** without code changes (local ↔ OpenAI ↔ Cohere)
93
+ - **Change search algorithms** instantly (BM25 ↔ ts_rank ↔ hybrid)
94
+ - **Adjust search parameters** (similarity metrics, top-k, hybrid weights)
95
+ - **Toggle features** (hybrid search, caching, reranking)
96
+ - **Tune performance** (batch sizes, chunk sizes, connection pools)
97
+ - **Swap backends** (FAISS ↔ PostgreSQL) by updating config
98
+
99
+ **Example**: Test different embedding models to find the best accuracy/cost balance - just update your YAML config file, no code changes needed!
100
+
101
+ ## Installation
102
+
103
+ ```bash
104
+ # Requires core package
105
+ pip install -e ./rakam-systems-core
106
+
107
+ # Install vectorstore package
108
+ pip install -e ./rakam-systems-vectorstore
109
+
110
+ # With specific backends
111
+ pip install -e "./rakam-systems-vectorstore[postgres]"
112
+ pip install -e "./rakam-systems-vectorstore[faiss]"
113
+ pip install -e "./rakam-systems-vectorstore[all]"
114
+ ```
115
+
116
+ ## Quick Start
117
+
118
+ ### FAISS Vector Store (In-Memory)
119
+
120
+ ```python
121
+ from rakam_systems_vectorstore.components.vectorstore.faiss_vector_store import FaissStore
122
+ from rakam_systems_vectorstore.core import Node, NodeMetadata
123
+
124
+ # Create store
125
+ store = FaissStore(
126
+ name="my_store",
127
+ base_index_path="./indexes",
128
+ embedding_model="Snowflake/snowflake-arctic-embed-m",
129
+ initialising=True
130
+ )
131
+
132
+ # Create nodes
133
+ nodes = [
134
+ Node(
135
+ content="Python is great for AI",
136
+ metadata=NodeMetadata(source_file_uuid="doc1", position=0)
137
+ )
138
+ ]
139
+
140
+ # Add and search
141
+ store.create_collection_from_nodes("my_collection", nodes)
142
+ results, _ = store.search("my_collection", "AI programming", number=5)
143
+ ```
144
+
145
+ ### PostgreSQL Vector Store
146
+
147
+ ```python
148
+ import os
149
+ import django
150
+ from django.conf import settings
151
+
152
+ # Configure Django (required)
153
+ if not settings.configured:
154
+ settings.configure(
155
+ INSTALLED_APPS=[
156
+ 'django.contrib.contenttypes',
157
+ 'rakam_systems_vectorstore.components.vectorstore',
158
+ ],
159
+ DATABASES={
160
+ 'default': {
161
+ 'ENGINE': 'django.db.backends.postgresql',
162
+ 'NAME': os.getenv('POSTGRES_DB', 'vectorstore_db'),
163
+ 'USER': os.getenv('POSTGRES_USER', 'postgres'),
164
+ 'PASSWORD': os.getenv('POSTGRES_PASSWORD', 'postgres'),
165
+ 'HOST': os.getenv('POSTGRES_HOST', 'localhost'),
166
+ 'PORT': os.getenv('POSTGRES_PORT', '5432'),
167
+ }
168
+ },
169
+ DEFAULT_AUTO_FIELD='django.db.models.BigAutoField',
170
+ )
171
+ django.setup()
172
+
173
+ from rakam_systems_vectorstore import ConfigurablePgVectorStore, VectorStoreConfig
174
+
175
+ # Create configuration
176
+ config = VectorStoreConfig(
177
+ embedding={
178
+ "model_type": "sentence_transformer",
179
+ "model_name": "Snowflake/snowflake-arctic-embed-m"
180
+ },
181
+ search={
182
+ "similarity_metric": "cosine",
183
+ "enable_hybrid_search": True
184
+ }
185
+ )
186
+
187
+ # Create and use store
188
+ store = ConfigurablePgVectorStore(config=config)
189
+ store.setup()
190
+ store.add_nodes(nodes)  # `nodes`: a list of Node objects, e.g. built as in the FAISS example above
191
+ results = store.search("What is AI?", top_k=5)
192
+ store.shutdown()
193
+ ```
194
+
195
+ ## Core Components
196
+
197
+ ### Vector Stores
198
+
199
+ - **ConfigurablePgVectorStore**: PostgreSQL with pgvector, supports hybrid search and keyword search
200
+ - **FaissStore**: In-memory FAISS-based vector search
201
+
202
+ ### Embeddings
203
+
204
+ - **ConfigurableEmbeddings**: Supports multiple backends
205
+ - SentenceTransformers (local)
206
+ - OpenAI embeddings
207
+ - Cohere embeddings
208
+
209
+ ### Document Loaders
210
+
211
+ - **AdaptiveLoader**: Automatically detects and loads various file types
212
+ - **PdfLoader**: Advanced PDF processing with Docling
213
+ - **PdfLoaderLight**: Lightweight PDF to markdown conversion
214
+ - **DocLoader**: Microsoft Word documents
215
+ - **OdtLoader**: OpenDocument Text files
216
+ - **MdLoader**: Markdown files
217
+ - **HtmlLoader**: HTML files
218
+ - **EmlLoader**: Email files
219
+ - **TabularLoader**: CSV and Excel files
220
+ - **CodeLoader**: Source code files
221
+
222
+ ### Chunking
223
+
224
+ - **TextChunker**: Sentence-based chunking with Chonkie
225
+ - **AdvancedChunker**: Context-aware chunking with heading preservation
226
+
227
+ ## Package Structure
228
+
229
+ ```
230
+ rakam-systems-vectorstore/
231
+ ├── src/rakam_systems_vectorstore/
232
+ │ ├── core.py # Node, VSFile, NodeMetadata
233
+ │ ├── config.py # VectorStoreConfig
234
+ │ ├── components/
235
+ │ │ ├── vectorstore/ # Store implementations
236
+ │ │ │ ├── configurable_pg_vector_store.py
237
+ │ │ │ └── faiss_vector_store.py
238
+ │ │ ├── embedding_model/ # Embedding models
239
+ │ │ │ └── configurable_embeddings.py
240
+ │ │ ├── loader/ # Document loaders
241
+ │ │ │ ├── adaptive_loader.py
242
+ │ │ │ ├── pdf_loader.py
243
+ │ │ │ ├── pdf_loader_light.py
244
+ │ │ │ └── ... (other loaders)
245
+ │ │ └── chunker/ # Text chunkers
246
+ │ │ ├── text_chunker.py
247
+ │ │ └── advanced_chunker.py
248
+ │ ├── docs/ # Package documentation
249
+ │ └── server/ # MCP server
250
+ └── pyproject.toml
251
+ ```
252
+
253
+ ## Search Capabilities
254
+
255
+ ### Vector Search
256
+
257
+ Semantic similarity search using embeddings:
258
+
259
+ ```python
260
+ results = store.search("machine learning algorithms", top_k=10)
261
+ ```
262
+
263
+ ### Keyword Search (BM25)
264
+
265
+ Full-text search with BM25 ranking:
266
+
267
+ ```python
268
+ results = store.keyword_search(
269
+ query="machine learning",
270
+ top_k=10,
271
+ ranking_algorithm="bm25"
272
+ )
273
+ ```
274
+
275
+ ### Hybrid Search
276
+
277
+ Combines vector and keyword search:
278
+
279
+ ```python
280
+ results = store.hybrid_search(
281
+ query="neural networks",
282
+ top_k=10,
283
+ alpha=0.7 # 70% vector, 30% keyword
284
+ )
285
+ ```
286
+
287
+ ## Configuration
288
+
289
+ ### From YAML
290
+
291
+ ```yaml
292
+ # vectorstore_config.yaml
293
+ name: my_vectorstore
294
+
295
+ embedding:
296
+ model_type: sentence_transformer
297
+ model_name: Snowflake/snowflake-arctic-embed-m
298
+ batch_size: 128
299
+ normalize: true
300
+
301
+ database:
302
+ host: localhost
303
+ port: 5432
304
+ database: vectorstore_db
305
+ user: postgres
306
+ password: postgres
307
+
308
+ search:
309
+ similarity_metric: cosine
310
+ default_top_k: 5
311
+ enable_hybrid_search: true
312
+ hybrid_alpha: 0.7
313
+
314
+ index:
315
+ chunk_size: 512
316
+ chunk_overlap: 50
317
+ ```
318
+
319
+ ```python
320
+ config = VectorStoreConfig.from_yaml("vectorstore_config.yaml")
321
+ store = ConfigurablePgVectorStore(config=config)
322
+ ```
323
+
324
+ ## Documentation
325
+
326
+ Detailed documentation is available in the `src/rakam_systems_vectorstore/docs/` directory:
327
+
328
+ - [Installation Guide](src/rakam_systems_vectorstore/docs/INSTALLATION.md)
329
+ - [Quick Install](src/rakam_systems_vectorstore/docs/QUICK_INSTALL.md)
330
+ - [Architecture](src/rakam_systems_vectorstore/docs/ARCHITECTURE.md)
331
+ - [Package Structure](src/rakam_systems_vectorstore/docs/PACKAGE_STRUCTURE.md)
332
+
333
+ Loader-specific documentation:
334
+
335
+ - [PDF Loader](src/rakam_systems_vectorstore/components/loader/docs/PDF_LOADER_ARCHITECTURE.md)
336
+ - [DOC Loader](src/rakam_systems_vectorstore/components/loader/docs/DOC_LOADER_README.md)
337
+ - [Tabular Loader](src/rakam_systems_vectorstore/components/loader/docs/TABULAR_LOADER_README.md)
338
+ - [EML Loader](src/rakam_systems_vectorstore/components/loader/docs/EML_LOADER_README.md)
339
+
340
+ ## Examples
341
+
342
+ See the `examples/ai_vectorstore_examples/` directory in the main repository for complete examples:
343
+
344
+ - Basic FAISS example
345
+ - PostgreSQL example
346
+ - Configurable vectorstore examples
347
+ - PDF loader examples
348
+ - Keyword search examples
349
+
350
+ ## Environment Variables
351
+
352
+ - `POSTGRES_HOST`: PostgreSQL host (default: localhost)
353
+ - `POSTGRES_PORT`: PostgreSQL port (default: 5432)
354
+ - `POSTGRES_DB`: Database name (default: vectorstore_db)
355
+ - `POSTGRES_USER`: Database user (default: postgres)
356
+ - `POSTGRES_PASSWORD`: Database password
357
+ - `OPENAI_API_KEY`: For OpenAI embeddings
358
+ - `COHERE_API_KEY`: For Cohere embeddings
359
+ - `HUGGINGFACE_TOKEN`: For private HuggingFace models
360
+
361
+ ## License
362
+
363
+ Apache 2.0
364
+
365
+ ## Links
366
+
367
+ - [Main Repository](https://github.com/Rakam-AI/rakam-systems)
368
+ - [Documentation](../docs/)
369
+ - [Core Package](../rakam-systems-core/)
370
+ - [Agent Package](../rakam-systems-agent/)