ragnar-cli 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE.txt +21 -0
- data/README.md +439 -0
- data/exe/ragnar +6 -0
- data/lib/ragnar/chunker.rb +97 -0
- data/lib/ragnar/cli.rb +542 -0
- data/lib/ragnar/context_repacker.rb +121 -0
- data/lib/ragnar/database.rb +267 -0
- data/lib/ragnar/embedder.rb +137 -0
- data/lib/ragnar/indexer.rb +234 -0
- data/lib/ragnar/llm_manager.rb +43 -0
- data/lib/ragnar/query_processor.rb +398 -0
- data/lib/ragnar/query_rewriter.rb +75 -0
- data/lib/ragnar/topic_modeling/engine.rb +221 -0
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +300 -0
- data/lib/ragnar/topic_modeling/llm_adapter.rb +131 -0
- data/lib/ragnar/topic_modeling/metrics.rb +186 -0
- data/lib/ragnar/topic_modeling/term_extractor.rb +170 -0
- data/lib/ragnar/topic_modeling/topic.rb +117 -0
- data/lib/ragnar/topic_modeling/topic_labeler.rb +61 -0
- data/lib/ragnar/topic_modeling.rb +24 -0
- data/lib/ragnar/umap_processor.rb +228 -0
- data/lib/ragnar/umap_transform_service.rb +124 -0
- data/lib/ragnar/version.rb +5 -0
- data/lib/ragnar.rb +36 -0
- data/lib/ragnar_cli.rb +2 -0
- metadata +234 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c0b9db7d48838621cadf2a90bff6fc4afca333ecd7fdc2364666196fce437474
|
4
|
+
data.tar.gz: '0468b4cdb2893fb80b7b52ad9c9dc4857bb6a90e0105c0e4356ba0612a5bcfae'
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: c398c02f5019e86476a59ebe64639ec42b009c4699d3704a39492950d00e3be0e252c0cdd84101b24121f359f12c8652c3129643376fc4b84b03c9ab99843f13
|
7
|
+
data.tar.gz: c295cfa1ec329d954d7d86e026b29536d83194ed47ed3a28c96527cfc3c8ae7c8d652a09d4b4e771599e1671a16523bcffa79abb01feddb5592edb48561b19ff
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2024 Your Name
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,439 @@
|
|
1
|
+
# Ragnar
|
2
|
+
|
3
|
+
A complete Ruby implementation of Retrieval-Augmented Generation (RAG) pipeline using native Ruby ML/NLP gems.
|
4
|
+
|
5
|
+
## Overview
|
6
|
+
|
7
|
+
Ragnar provides a production-ready RAG pipeline for Ruby applications, integrating:
|
8
|
+
- **red-candle**: LLM inference, embeddings, and reranking
|
9
|
+
- **lancelot**: Vector database with Lance columnar storage
|
10
|
+
- **clusterkit**: UMAP dimensionality reduction and clustering
|
11
|
+
- **baran**: Text chunking and splitting
|
12
|
+
|
13
|
+
## Architecture
|
14
|
+
|
15
|
+
### Complete RAG Pipeline
|
16
|
+
|
17
|
+
```mermaid
|
18
|
+
graph TB
|
19
|
+
subgraph "Indexing Pipeline"
|
20
|
+
A[Documents] --> B[Chunker<br/>baran]
|
21
|
+
B --> C[Embedder<br/>red-candle]
|
22
|
+
C --> D[Vector DB<br/>lancelot]
|
23
|
+
D --> E[UMAP Training<br/>annembed]
|
24
|
+
E --> F[Reduced Embeddings]
|
25
|
+
end
|
26
|
+
|
27
|
+
subgraph "Query Pipeline"
|
28
|
+
LLMCache[LLM Manager<br/>Cached Instance]
|
29
|
+
Q[User Query] --> QR[Query Rewriter<br/>red-candle LLM]
|
30
|
+
QR --> QE[Query Embedder<br/>red-candle]
|
31
|
+
QE --> VS[Vector Search<br/>lancelot]
|
32
|
+
VS --> RRF[RRF Fusion]
|
33
|
+
RRF --> RR[Reranker<br/>red-candle]
|
34
|
+
RR --> RP[Context Repacker<br/>Deduplication & Organization]
|
35
|
+
RP --> LLM[Response Generation<br/>red-candle LLM]
|
36
|
+
LLM --> R[Answer]
|
37
|
+
|
38
|
+
LLMCache -.-> QR
|
39
|
+
LLMCache -.-> LLM
|
40
|
+
end
|
41
|
+
|
42
|
+
D -.-> VS
|
43
|
+
F -.-> VS
|
44
|
+
```
|
45
|
+
|
46
|
+
### Indexing Process
|
47
|
+
|
48
|
+
```mermaid
|
49
|
+
sequenceDiagram
|
50
|
+
participant User
|
51
|
+
participant CLI
|
52
|
+
participant Indexer
|
53
|
+
participant Chunker
|
54
|
+
participant Embedder
|
55
|
+
participant Database
|
56
|
+
|
57
|
+
User->>CLI: ragnar index ./documents
|
58
|
+
CLI->>Indexer: index_path(path)
|
59
|
+
|
60
|
+
loop For each file
|
61
|
+
Indexer->>Indexer: Read file
|
62
|
+
Indexer->>Chunker: split_text(content)
|
63
|
+
Chunker-->>Indexer: chunks[]
|
64
|
+
|
65
|
+
loop For each chunk
|
66
|
+
Indexer->>Embedder: embed(text)
|
67
|
+
Embedder-->>Indexer: embedding[768]
|
68
|
+
Indexer->>Database: add_document(chunk, embedding)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
Database-->>CLI: stats
|
73
|
+
CLI-->>User: Indexed N documents
|
74
|
+
```
|
75
|
+
|
76
|
+
### UMAP Dimensionality Reduction
|
77
|
+
|
78
|
+
```mermaid
|
79
|
+
flowchart LR
|
80
|
+
A[High-Dim Embeddings<br/>768D] --> B[UMAP Training]
|
81
|
+
B --> C[Model]
|
82
|
+
C --> D[Low-Dim Embeddings<br/>2-50D]
|
83
|
+
|
84
|
+
B --> E[Parameters]
|
85
|
+
E --> F[n_neighbors]
|
86
|
+
E --> G[n_components]
|
87
|
+
E --> H[min_dist]
|
88
|
+
|
89
|
+
D --> I[Benefits]
|
90
|
+
I --> J[Faster Search]
|
91
|
+
I --> K[Less Memory]
|
92
|
+
I --> L[Visualization]
|
93
|
+
```
|
94
|
+
|
95
|
+
### Query Processing Pipeline
|
96
|
+
|
97
|
+
```mermaid
|
98
|
+
flowchart TB
|
99
|
+
Q[User Query] --> QA[Query Analysis<br/>w/ Cached LLM]
|
100
|
+
|
101
|
+
QA --> CI[Clarified Intent]
|
102
|
+
QA --> SQ[Sub-queries]
|
103
|
+
QA --> KT[Key Terms]
|
104
|
+
|
105
|
+
SQ --> EMB[Embed Each Query]
|
106
|
+
EMB --> VS[Vector Search]
|
107
|
+
|
108
|
+
VS --> RRF[RRF Fusion]
|
109
|
+
RRF --> RANK[Reranking]
|
110
|
+
|
111
|
+
RANK --> TOP[Top-K Documents]
|
112
|
+
TOP --> CTX[Context Preparation]
|
113
|
+
|
114
|
+
CTX --> REPACK[Context Repacking<br/>Deduplication<br/>Summarization<br/>Organization]
|
115
|
+
|
116
|
+
REPACK --> GEN[LLM Generation<br/>w/ Same Cached LLM]
|
117
|
+
CI --> GEN
|
118
|
+
|
119
|
+
GEN --> ANS[Final Answer]
|
120
|
+
```
|
121
|
+
|
122
|
+
## Installation
|
123
|
+
|
124
|
+
### As a Gem
|
125
|
+
|
126
|
+
```bash
|
127
|
+
gem install ragnar
|
128
|
+
```
|
129
|
+
|
130
|
+
### From Source
|
131
|
+
|
132
|
+
```bash
|
133
|
+
git clone https://github.com/yourusername/ragnar.git
|
134
|
+
cd ragnar
|
135
|
+
bundle install
|
136
|
+
gem build ragnar.gemspec
|
137
|
+
gem install ./ragnar-*.gem
|
138
|
+
```
|
139
|
+
|
140
|
+
## Quick Start
|
141
|
+
|
142
|
+
### 1. Index Documents
|
143
|
+
|
144
|
+
```bash
|
145
|
+
# Index a directory of text files
|
146
|
+
ragnar index ./documents
|
147
|
+
|
148
|
+
# Index with custom settings
|
149
|
+
ragnar index ./documents \
|
150
|
+
--chunk-size 1000 \
|
151
|
+
--chunk-overlap 100
|
152
|
+
```
|
153
|
+
|
154
|
+
### 2. Train UMAP (Optional)
|
155
|
+
|
156
|
+
Reduce embedding dimensions for faster search:
|
157
|
+
|
158
|
+
```bash
|
159
|
+
# Train UMAP model (auto-adjusts parameters based on data)
|
160
|
+
ragnar train-umap \
|
161
|
+
--n-components 50 \
|
162
|
+
--n-neighbors 15
|
163
|
+
|
164
|
+
# Apply to all embeddings
|
165
|
+
ragnar apply-umap
|
166
|
+
```
|
167
|
+
|
168
|
+
### 3. Query the System
|
169
|
+
|
170
|
+
```bash
|
171
|
+
# Basic query
|
172
|
+
ragnar query "What is the main purpose of this project?"
|
173
|
+
|
174
|
+
# Verbose mode shows all intermediate processing steps
|
175
|
+
ragnar query "How does the chunking process work?" --verbose
|
176
|
+
# Or use short form
|
177
|
+
ragnar query "How does the chunking process work?" -v
|
178
|
+
|
179
|
+
# JSON output for programmatic use
|
180
|
+
ragnar query "Explain the embedding model" --json
|
181
|
+
|
182
|
+
# Adjust number of retrieved documents
|
183
|
+
ragnar query "What are the key features?" --top-k 5
|
184
|
+
|
185
|
+
# Combine options for detailed analysis
|
186
|
+
ragnar query "Compare Ruby with Python" -v --top-k 5
|
187
|
+
```
|
188
|
+
|
189
|
+
#### Verbose Mode Output
|
190
|
+
|
191
|
+
When using `--verbose` or `-v`, you'll see:
|
192
|
+
1. **Query Analysis**: Original query, clarified intent, sub-queries, and key terms
|
193
|
+
2. **Document Retrieval**: Each sub-query's embedding and search results
|
194
|
+
3. **RRF Fusion**: How multiple search results are combined
|
195
|
+
4. **Reranking**: Top documents after relevance scoring
|
196
|
+
5. **Context Repacking**: How retrieved chunks are organized and compressed
|
197
|
+
6. **Response Generation**: The final LLM prompt and response
|
198
|
+
7. **Final Results**: Confidence score and source attribution
|
199
|
+
|
200
|
+
### 4. Check Statistics
|
201
|
+
|
202
|
+
```bash
|
203
|
+
ragnar stats
|
204
|
+
```
|
205
|
+
|
206
|
+
## Features
|
207
|
+
|
208
|
+
### Intelligent Query Processing
|
209
|
+
|
210
|
+
1. **Query Rewriting**: Clarifies intent and generates sub-queries
|
211
|
+
2. **Multi-Query Search**: Searches with multiple query variations
|
212
|
+
3. **RRF Fusion**: Combines results using Reciprocal Rank Fusion
|
213
|
+
4. **Reranking**: Uses cross-encoder for precise relevance scoring
|
214
|
+
5. **Context Repacking**: Deduplicates and organizes retrieved chunks for optimal LLM consumption
|
215
|
+
6. **LLM Caching**: Single LLM instance shared between query rewriting and response generation
|
216
|
+
7. **Contextual Response**: Generates answers with LLM based on repacked context
|
217
|
+
|
218
|
+
### Embedding Management
|
219
|
+
|
220
|
+
- **High-dimensional embeddings** (768D) for semantic accuracy
|
221
|
+
- **UMAP reduction** to lower dimensions (2-50D) for efficiency
|
222
|
+
- **Automatic parameter adjustment** based on dataset size
|
223
|
+
- **Batch processing** for large document collections
|
224
|
+
|
225
|
+
### Database Features
|
226
|
+
|
227
|
+
- **Lance columnar format** for efficient storage
|
228
|
+
- **Vector similarity search** with configurable metrics
|
229
|
+
- **Metadata tracking** for source attribution
|
230
|
+
- **Incremental indexing** support
|
231
|
+
|
232
|
+
## Configuration
|
233
|
+
|
234
|
+
### Default Settings
|
235
|
+
|
236
|
+
```ruby
|
237
|
+
DEFAULT_DB_PATH = "ragnar_database"
|
238
|
+
DEFAULT_CHUNK_SIZE = 512
|
239
|
+
DEFAULT_CHUNK_OVERLAP = 50
|
240
|
+
DEFAULT_EMBEDDING_MODEL = "jinaai/jina-embeddings-v2-base-en"
|
241
|
+
```
|
242
|
+
|
243
|
+
### Supported Models
|
244
|
+
|
245
|
+
**Embedding Models** (via red-candle):
|
246
|
+
- jinaai/jina-embeddings-v2-base-en
|
247
|
+
- BAAI/bge-base-en-v1.5
|
248
|
+
- sentence-transformers/all-MiniLM-L6-v2
|
249
|
+
|
250
|
+
**LLM Models** (via red-candle):
|
251
|
+
- Qwen/Qwen2.5-1.5B-Instruct
|
252
|
+
- microsoft/phi-2
|
253
|
+
- TinyLlama/TinyLlama-1.1B-Chat-v1.0
|
254
|
+
|
255
|
+
**Reranker Models** (via red-candle):
|
256
|
+
- BAAI/bge-reranker-base
|
257
|
+
- cross-encoder/ms-marco-MiniLM-L-6-v2
|
258
|
+
|
259
|
+
## Advanced Usage
|
260
|
+
|
261
|
+
### Programmatic API
|
262
|
+
|
263
|
+
```ruby
|
264
|
+
require 'ragnar'
|
265
|
+
|
266
|
+
# Initialize components
|
267
|
+
indexer = Ragnar::Indexer.new(
|
268
|
+
db_path: "my_database",
|
269
|
+
chunk_size: 1000
|
270
|
+
)
|
271
|
+
|
272
|
+
# Index documents
|
273
|
+
stats = indexer.index_path("./documents")
|
274
|
+
|
275
|
+
# Query the system
|
276
|
+
processor = Ragnar::QueryProcessor.new(db_path: "my_database")
|
277
|
+
result = processor.query(
|
278
|
+
"What is Ruby?",
|
279
|
+
top_k: 5,
|
280
|
+
verbose: true
|
281
|
+
)
|
282
|
+
|
283
|
+
puts result[:answer]
|
284
|
+
puts "Confidence: #{result[:confidence]}%"
|
285
|
+
```
|
286
|
+
|
287
|
+
### Custom Chunking Strategies
|
288
|
+
|
289
|
+
```ruby
|
290
|
+
chunker = Ragnar::Chunker.new(
|
291
|
+
chunk_size: 1000,
|
292
|
+
chunk_overlap: 200,
|
293
|
+
separators: ["\n\n", "\n", ". ", " "]
|
294
|
+
)
|
295
|
+
|
296
|
+
chunks = chunker.chunk_text(document_text)
|
297
|
+
```
|
298
|
+
|
299
|
+
### Embedding Optimization
|
300
|
+
|
301
|
+
```ruby
|
302
|
+
# For small datasets (<100 documents)
|
303
|
+
processor = Ragnar::UmapProcessor.new
|
304
|
+
processor.train(
|
305
|
+
n_components: 10, # Fewer components
|
306
|
+
n_neighbors: 5, # Fewer neighbors
|
307
|
+
min_dist: 0.05 # Tighter clusters
|
308
|
+
)
|
309
|
+
|
310
|
+
# For large datasets (>10,000 documents)
|
311
|
+
processor.train(
|
312
|
+
n_components: 50, # More components
|
313
|
+
n_neighbors: 30, # More neighbors
|
314
|
+
min_dist: 0.1 # Standard distance
|
315
|
+
)
|
316
|
+
```
|
317
|
+
|
318
|
+
## Performance Considerations
|
319
|
+
|
320
|
+
### Memory Usage
|
321
|
+
|
322
|
+
- **Indexing**: ~100MB per 1000 documents (768D embeddings)
|
323
|
+
- **UMAP Training**: ~80MB for 10,000 vectors
|
324
|
+
- **Query Processing**: ~50MB overhead for models (reduced with LLM caching)
|
325
|
+
- **LLM Caching**: Single model instance (~500MB-2GB depending on model size)
|
326
|
+
|
327
|
+
### Speed Benchmarks
|
328
|
+
|
329
|
+
- **Indexing**: ~10 documents/second (including embedding)
|
330
|
+
- **UMAP Training**: 30-60 seconds for 10,000 vectors
|
331
|
+
- **Query Processing**: 1-3 seconds per query (faster with cached LLM)
|
332
|
+
- **Vector Search**: <100ms for 100,000 vectors
|
333
|
+
- **Context Repacking**: <50ms for typical document sets
|
334
|
+
- **LLM Loading**: 2-5 seconds (only on first query with caching)
|
335
|
+
|
336
|
+
### Optimization Tips
|
337
|
+
|
338
|
+
1. **Use UMAP** for datasets >1000 documents
|
339
|
+
2. **Batch index** large document collections
|
340
|
+
3. **Cache embeddings** for repeated queries
|
341
|
+
4. **Adjust chunk size** based on document type:
|
342
|
+
- Technical docs: 500-1000 tokens
|
343
|
+
- Narrative text: 200-500 tokens
|
344
|
+
- Q&A content: 100-300 tokens
|
345
|
+
|
346
|
+
## Troubleshooting
|
347
|
+
|
348
|
+
### Common Issues
|
349
|
+
|
350
|
+
**UMAP fails with "index out of bounds"**
|
351
|
+
- Cause: Too few samples for the requested parameters
|
352
|
+
- Solution: System auto-adjusts, but you can manually set lower n_neighbors
|
353
|
+
|
354
|
+
**Slow indexing performance**
|
355
|
+
- Try smaller chunk sizes
|
356
|
+
- Use batch processing
|
357
|
+
- Consider using a faster embedding model
|
358
|
+
|
359
|
+
**Poor query results**
|
360
|
+
- Index more documents (RAG works best with 100+ documents)
|
361
|
+
- Adjust chunk size and overlap
|
362
|
+
- Try different embedding models
|
363
|
+
|
364
|
+
## Development
|
365
|
+
|
366
|
+
```bash
|
367
|
+
# Install dependencies
|
368
|
+
bundle install
|
369
|
+
|
370
|
+
# Run tests
|
371
|
+
bundle exec rspec
|
372
|
+
|
373
|
+
# Build gem
|
374
|
+
gem build ragnar.gemspec
|
375
|
+
```
|
376
|
+
|
377
|
+
## Architecture Details
|
378
|
+
|
379
|
+
### Component Responsibilities
|
380
|
+
|
381
|
+
| Component | Purpose | Key Methods |
|
382
|
+
|-----------|---------|-------------|
|
383
|
+
| Chunker | Split text into semantic chunks | `chunk_text()` |
|
384
|
+
| Embedder | Generate vector embeddings | `embed_text()`, `embed_batch()` |
|
385
|
+
| Database | Store and search vectors | `add_document()`, `search_similar()` |
|
386
|
+
| LLMManager | Cache and manage LLM instances | `get_llm()`, `default_llm()` |
|
387
|
+
| ContextRepacker | Optimize retrieved context | `repack()`, `repack_with_summary()` |
|
388
|
+
| QueryRewriter | Analyze and expand queries | `rewrite()` |
|
389
|
+
| QueryProcessor | Orchestrate query pipeline | `query()` |
|
390
|
+
| UmapProcessor | Reduce embedding dimensions | `train()`, `apply()` |
|
391
|
+
|
392
|
+
### Data Flow
|
393
|
+
|
394
|
+
1. **Documents** → Chunker → Text chunks
|
395
|
+
2. **Text chunks** → Embedder → Embeddings (768D)
|
396
|
+
3. **Embeddings** → Database → Stored vectors
|
397
|
+
4. **Stored vectors** → UMAP → Reduced vectors (2-50D)
|
398
|
+
5. **Query** → Rewriter (w/ cached LLM) → Sub-queries
|
399
|
+
6. **Sub-queries** → Embedder → Query vectors
|
400
|
+
7. **Query vectors** → Database → Similar documents
|
401
|
+
8. **Documents** → Reranker → Top results
|
402
|
+
9. **Top results** → Context Repacker → Optimized context
|
403
|
+
10. **Optimized context** → LLM (same cached instance) → Final answer
|
404
|
+
|
405
|
+
## Contributing
|
406
|
+
|
407
|
+
Contributions are welcome! Please:
|
408
|
+
|
409
|
+
1. Fork the repository
|
410
|
+
2. Create a feature branch
|
411
|
+
3. Add tests for new functionality
|
412
|
+
4. Ensure all tests pass
|
413
|
+
5. Submit a pull request
|
414
|
+
|
415
|
+
## License
|
416
|
+
|
417
|
+
MIT License - see LICENSE file for details
|
418
|
+
|
419
|
+
## Acknowledgments
|
420
|
+
|
421
|
+
This project integrates several excellent Ruby gems:
|
422
|
+
- [red-candle](https://github.com/red-candle) - Ruby ML/LLM toolkit
|
423
|
+
- [lancelot](https://github.com/lancelot) - Lance database bindings
|
424
|
+
- [clusterkit](https://github.com/cpetersen/clusterkit) - UMAP and clustering implementation
|
425
|
+
- [parsekit](https://github.com/cpetersen/parsekit) - Content extraction
|
426
|
+
- [baran](https://github.com/baran) - Text splitting utilities
|
427
|
+
|
428
|
+
## Roadmap
|
429
|
+
|
430
|
+
- [ ] Add support for PDF and HTML documents
|
431
|
+
- [ ] Implement incremental indexing
|
432
|
+
- [ ] Add conversation memory for multi-turn queries
|
433
|
+
- [ ] Support for hybrid search (vector + keyword)
|
434
|
+
- [ ] Web UI for interactive queries
|
435
|
+
- [ ] Docker containerization
|
436
|
+
- [ ] Performance benchmarking suite
|
437
|
+
- [ ] Support for multiple embedding models simultaneously
|
438
|
+
- [ ] Query result caching
|
439
|
+
- [ ] Automatic index optimization
|
data/lib/ragnar/chunker.rb
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
module Ragnar
  # Splits raw text into overlapping chunks suitable for embedding.
  #
  # Thin wrapper around Baran::RecursiveCharacterTextSplitter that attaches
  # per-chunk metadata (index, total count, size) to every piece produced.
  class Chunker
    attr_reader :chunk_size, :chunk_overlap

    # @param chunk_size [Integer] target chunk length in characters
    # @param chunk_overlap [Integer] characters shared between adjacent chunks
    def initialize(chunk_size: Ragnar::DEFAULT_CHUNK_SIZE, chunk_overlap: Ragnar::DEFAULT_CHUNK_OVERLAP)
      @chunk_size = chunk_size
      @chunk_overlap = chunk_overlap
      # Recursive splitting tries separators in order, falling back to
      # ever-finer boundaries: paragraph -> line -> sentence -> word -> char.
      @splitter = Baran::RecursiveCharacterTextSplitter.new(
        chunk_size: chunk_size,
        chunk_overlap: chunk_overlap,
        separators: ["\n\n", "\n", ". ", " ", ""]
      )
    end

    # Splits +text+ into chunk hashes, merging +metadata+ into each.
    # Returns [] for nil/blank input; on any splitter error it logs the
    # message and returns [] (best-effort, never raises).
    #
    # @return [Array<Hash>] hashes with :text, :index and :metadata keys
    def chunk_text(text, metadata = {})
      return [] if text.nil? || text.strip.empty?

      pieces = @splitter.chunks(text)
      total = pieces.size

      # Baran yields hashes with :text/:cursor keys; tolerate plain values.
      pieces.each_with_index.map do |raw, idx|
        body = extract_body(raw)
        {
          text: body,
          index: idx,
          metadata: metadata.merge(
            chunk_index: idx,
            total_chunks: total,
            chunk_size: body.size
          )
        }
      end
    rescue => e
      puts "Error chunking text: #{e.message}"
      []
    end

    # Reads +file_path+ as UTF-8 (replacing invalid/undefined bytes) and
    # chunks it, tagging every chunk with file-level metadata.
    #
    # @raise [RuntimeError] when the file does not exist
    def chunk_file(file_path)
      raise "File not found: #{file_path}" unless File.exist?(file_path)

      content = File.read(file_path, encoding: 'utf-8', invalid: :replace, undef: :replace)

      chunk_text(content, {
        file_path: File.absolute_path(file_path),
        file_name: File.basename(file_path),
        file_size: File.size(file_path),
        file_modified: File.mtime(file_path).to_s
      })
    end

    # Chunks a heterogeneous list of documents into one flat array.
    # A String naming an existing file is read from disk; any other String
    # is treated as raw text; a Hash supplies :text plus optional :metadata.
    # Entries of any other type are skipped.
    #
    # @return [Array<Hash>] all chunks from all documents, in input order
    def chunk_documents(documents)
      documents.flat_map do |doc|
        case doc
        when String
          File.exist?(doc) ? chunk_file(doc) : chunk_text(doc)
        when Hash
          chunk_text(doc[:text] || doc["text"], doc[:metadata] || doc["metadata"] || {})
        else
          []
        end
      end
    end

    # Placeholder for future semantic-boundary chunking (e.g. splitting on
    # embedding-detected topic shifts); +model+ is currently ignored and a
    # default recursive splitter is returned.
    def self.semantic_chunker(model: nil)
      Baran::RecursiveCharacterTextSplitter.new(
        chunk_size: Ragnar::DEFAULT_CHUNK_SIZE,
        chunk_overlap: Ragnar::DEFAULT_CHUNK_OVERLAP,
        separators: ["\n\n", "\n", ". ", " ", ""]
      )
    end

    private

    # Pulls the chunk body out of whatever Baran returned: the :text (or
    # "text") value of a Hash, otherwise the value stringified.
    def extract_body(raw)
      return raw[:text] || raw["text"] if raw.is_a?(Hash)

      raw.to_s
    end
  end
end
|