smart_rag 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +33 -0
- data/README.en.md +115 -0
- data/README.md +144 -0
- data/config/database.yml +42 -0
- data/config/fulltext_search.yml +111 -0
- data/config/llm_config.yml +15 -0
- data/config/smart_rag.yml +156 -0
- data/db/fix_search_issues.sql +81 -0
- data/db/migrations/001_create_source_documents.rb +26 -0
- data/db/migrations/002_create_source_sections.rb +20 -0
- data/db/migrations/003_create_tags.rb +17 -0
- data/db/migrations/004_create_research_topics.rb +16 -0
- data/db/migrations/005_create_relationship_tables.rb +42 -0
- data/db/migrations/006_create_text_search_configs.rb +28 -0
- data/db/migrations/007_create_section_fts.rb +109 -0
- data/db/migrations/008_create_embeddings.rb +28 -0
- data/db/migrations/009_create_search_logs.rb +30 -0
- data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
- data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
- data/db/rebuild_fts_complete.sql +51 -0
- data/db/seeds/text_search_configs.sql +28 -0
- data/examples/01_quick_start.rb +32 -0
- data/examples/02_document_management.rb +41 -0
- data/examples/03_search_operations.rb +46 -0
- data/examples/04_topics_and_tags.rb +38 -0
- data/examples/05_advanced_patterns.rb +154 -0
- data/examples/06_error_handling_and_retry.rb +64 -0
- data/examples/README.md +42 -0
- data/examples/common.rb +57 -0
- data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
- data/lib/smart_rag/config.rb +126 -0
- data/lib/smart_rag/core/document_processor.rb +537 -0
- data/lib/smart_rag/core/embedding.rb +340 -0
- data/lib/smart_rag/core/fulltext_manager.rb +483 -0
- data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
- data/lib/smart_rag/core/query_processor.rb +577 -0
- data/lib/smart_rag/errors.rb +88 -0
- data/lib/smart_rag/models/embedding.rb +140 -0
- data/lib/smart_rag/models/model_base.rb +106 -0
- data/lib/smart_rag/models/research_topic.rb +171 -0
- data/lib/smart_rag/models/research_topic_section.rb +86 -0
- data/lib/smart_rag/models/research_topic_tag.rb +89 -0
- data/lib/smart_rag/models/search_log.rb +198 -0
- data/lib/smart_rag/models/section_fts.rb +170 -0
- data/lib/smart_rag/models/section_tag.rb +81 -0
- data/lib/smart_rag/models/source_document.rb +204 -0
- data/lib/smart_rag/models/source_section.rb +201 -0
- data/lib/smart_rag/models/tag.rb +214 -0
- data/lib/smart_rag/models/text_search_config.rb +168 -0
- data/lib/smart_rag/models.rb +116 -0
- data/lib/smart_rag/parsers/query_parser.rb +291 -0
- data/lib/smart_rag/retrieve.rb +745 -0
- data/lib/smart_rag/services/embedding_service.rb +278 -0
- data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
- data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
- data/lib/smart_rag/services/summarization_service.rb +322 -0
- data/lib/smart_rag/services/tag_service.rb +614 -0
- data/lib/smart_rag/services/vector_search_service.rb +347 -0
- data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
- data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
- data/lib/smart_rag/smart_chunking/merger.rb +94 -0
- data/lib/smart_rag/smart_chunking/parser.rb +75 -0
- data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
- data/lib/smart_rag/smart_chunking/section.rb +11 -0
- data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
- data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
- data/lib/smart_rag/version.rb +3 -0
- data/lib/smart_rag.rb +986 -0
- data/workers/analyze_content.rb +6 -0
- data/workers/get_embedding.rb +7 -0
- metadata +311 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 19e9abcf3817e98fad15499879a25db6fbb563bc189f5a8f0c58fbea2145cbf8
|
|
4
|
+
data.tar.gz: 88f7e4a0f1929b83c9346e8261928bdd72690de4c9da3689a3ca7882a548a6b1
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: f3c7c945032e61c816f383ea6ae1ea87b0be9d4fb2a794175543489e91763825f73fbb925c2a52b37d786af2e1dd126180b4da3cb5c31ded4130e1fa5bc4ebda
|
|
7
|
+
data.tar.gz: 6de0148938c32c3842d3d3e2a8f31641c508be193d70e9fa11d01c93dd413e3e805f2fec3afb348981455566989654436cb5e7f70c0351b2b94e1dc21978a94a
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## Unreleased
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- 新增 `retrieve(plan:)` 结构化检索入口(`RetrievalPlan -> EvidencePack`)。
|
|
7
|
+
- 新增 `SmartRAG::Retrieve` 执行器:支持多 query、mode 映射、signals、provenance、stats、explain。
|
|
8
|
+
- 新增索引治理接口:
|
|
9
|
+
- `rebuild_fts(document_id=nil)`
|
|
10
|
+
- `rebuild_embeddings(document_id=nil)`
|
|
11
|
+
- `reindex(document_id=nil)`
|
|
12
|
+
- `dedupe_by_content_hash`
|
|
13
|
+
- 新增 `source_documents` 字段与索引:
|
|
14
|
+
- `source_type`
|
|
15
|
+
- `source_uri`
|
|
16
|
+
- `content_hash`
|
|
17
|
+
- 新增轻量单测入口 `spec/unit_spec_helper.rb`(不依赖数据库连接)。
|
|
18
|
+
- 新增回填任务:`rake db:backfill_source_fields`(历史数据回填新字段)。
|
|
19
|
+
- 新增一键发布任务:`rake db:prepare_release`(backfill -> dedupe -> reindex)。
|
|
20
|
+
- 新增 API:
|
|
21
|
+
- `backfill_source_fields(limit: nil, dry_run: false)`
|
|
22
|
+
- `prepare_release_indexes(document_id: nil, dry_run: false)`
|
|
23
|
+
|
|
24
|
+
### Changed
|
|
25
|
+
- `retrieve` 现支持 `global_filters.source_type` 与 `global_filters.source_uri_prefix` 的执行过滤。
|
|
26
|
+
- `retrieve` 新增 `global_filters.topic_ids` 的执行过滤(按 section-topic 关系过滤)。
|
|
27
|
+
- `retrieve` 新增 `budget.diversity.by_source` 执行约束。
|
|
28
|
+
- `dedupe_by_content_hash` 从“仅 content_hash”升级为“`source_uri + content_hash`”去重。
|
|
29
|
+
- 检索日志 (`search_logs.filters`) 新增保存 `plan/stats/explain/warnings`,用于回放与调试。
|
|
30
|
+
|
|
31
|
+
### Compatibility
|
|
32
|
+
- 保留 `search(...)` 旧接口,不破坏现有调用。
|
|
33
|
+
- 对未支持字段通过 `explain.ignored_fields` 明确返回,不做静默忽略。
|
data/README.en.md
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# SmartRAG
|
|
2
|
+
|
|
3
|
+
[中文 README](README.md)
|
|
4
|
+
|
|
5
|
+
SmartRAG is a Ruby-based hybrid RAG library that combines vector retrieval, full-text search, and topic/tag organization for document intelligence workflows.
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
- Hybrid retrieval: vector + full-text + weighted fusion
|
|
10
|
+
- Document ingestion from local files and URLs
|
|
11
|
+
- Topic and tag management APIs
|
|
12
|
+
- Search logs and system statistics
|
|
13
|
+
- Runnable example scripts for quick onboarding
|
|
14
|
+
|
|
15
|
+
## Default Model Setup
|
|
16
|
+
|
|
17
|
+
Current defaults use local Ollama-compatible endpoints:
|
|
18
|
+
|
|
19
|
+
- Embedding model: `qwen3-embedding`
|
|
20
|
+
- Text LLM model: `qwen3`
|
|
21
|
+
- Embedding endpoint: `http://localhost:11434/v1/embeddings`
|
|
22
|
+
- LLM endpoint: `http://localhost:11434/v1/chat/completions`
|
|
23
|
+
|
|
24
|
+
You can override these via `.env` or `config/smart_rag.yml`.
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
### 1) Install dependencies
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
bundle install
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### 2) Configure environment
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
cp .env.example .env
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
Required DB variables:
|
|
41
|
+
|
|
42
|
+
- `SMARTRAG_DB_HOST`
|
|
43
|
+
- `SMARTRAG_DB_PORT`
|
|
44
|
+
- `SMARTRAG_DB_NAME`
|
|
45
|
+
- `SMARTRAG_DB_USER`
|
|
46
|
+
- `SMARTRAG_DB_PASSWORD`
|
|
47
|
+
|
|
48
|
+
### 3) Setup database
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
bundle exec rake db:create
|
|
52
|
+
bundle exec rake db:migrate
|
|
53
|
+
bundle exec rake db:seed
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### 4) Import test docs (optional)
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
ruby test/import_doc.rb import
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 5) Run sample scripts
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
ruby examples/01_quick_start.rb
|
|
66
|
+
ruby examples/03_search_operations.rb
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Minimal Usage
|
|
70
|
+
|
|
71
|
+
```ruby
|
|
72
|
+
require "smart_rag"
|
|
73
|
+
|
|
74
|
+
config = SmartRAG::Config.load("config/smart_rag.yml")
|
|
75
|
+
client = SmartRAG::SmartRAG.new(config)
|
|
76
|
+
|
|
77
|
+
client.add_document("test/python_basics.md", generate_embeddings: true)
|
|
78
|
+
results = client.search("What is machine learning?", search_type: "hybrid", limit: 5)
|
|
79
|
+
|
|
80
|
+
puts results[:results].map { |r| r[:section_title] }
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Development Commands
|
|
84
|
+
|
|
85
|
+
- `bundle exec rspec`: run RSpec tests
|
|
86
|
+
- `ruby test/test_rag.rb`: run E2E script
|
|
87
|
+
- `bundle exec rake db:reset`: recreate database
|
|
88
|
+
- `gem build smart_rag.gemspec`: build gem package
|
|
89
|
+
|
|
90
|
+
## Project Structure
|
|
91
|
+
|
|
92
|
+
```text
|
|
93
|
+
lib/
|
|
94
|
+
smart_rag.rb # Main API entry
|
|
95
|
+
smart_rag/core/ # Core processing logic
|
|
96
|
+
smart_rag/services/ # Search/tag/embedding services
|
|
97
|
+
config/ # Runtime config files
|
|
98
|
+
db/ # Migrations and seed SQL
|
|
99
|
+
examples/ # Example programs
|
|
100
|
+
test/ # Manual/E2E scripts + sample docs
|
|
101
|
+
spec/ # RSpec tests
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Documentation Map
|
|
105
|
+
|
|
106
|
+
See `docs/DOCUMENTATION_INDEX.en.md` for a curated map of all docs, reading order, and maintenance notes.
|
|
107
|
+
Chinese version: `docs/DOCUMENTATION_INDEX.md`.
|
|
108
|
+
|
|
109
|
+
## Notes
|
|
110
|
+
|
|
111
|
+
- Some legacy docs still contain older defaults (for example, OpenAI references). The authoritative runtime configuration is `config/smart_rag.yml`.
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT
|
data/README.md
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
# SmartRAG
|
|
2
|
+
|
|
3
|
+
[English README](README.en.md)
|
|
4
|
+
|
|
5
|
+
SmartRAG 是一个 Ruby 混合检索增强生成(RAG)库,结合向量检索、全文检索与主题/标签管理,用于文档智能检索与问答场景。
|
|
6
|
+
|
|
7
|
+
## 项目概览
|
|
8
|
+
|
|
9
|
+
- 混合检索:向量检索 + 全文检索 + 权重融合
|
|
10
|
+
- 支持本地文件与 URL 文档导入
|
|
11
|
+
- 提供主题与标签管理 API
|
|
12
|
+
- 提供搜索日志与系统统计能力
|
|
13
|
+
- 内置示例脚本便于快速上手
|
|
14
|
+
|
|
15
|
+
## 默认模型配置
|
|
16
|
+
|
|
17
|
+
当前默认配置为本地 Ollama 兼容端点:
|
|
18
|
+
|
|
19
|
+
- Embedding 模型:`qwen3-embedding`
|
|
20
|
+
- 文本 LLM 模型:`qwen3`
|
|
21
|
+
- Embedding 端点:`http://localhost:11434/v1/embeddings`
|
|
22
|
+
- LLM 端点:`http://localhost:11434/v1/chat/completions`
|
|
23
|
+
|
|
24
|
+
可通过 `.env` 或 `config/smart_rag.yml` 覆盖以上默认值。
|
|
25
|
+
|
|
26
|
+
## 快速开始
|
|
27
|
+
|
|
28
|
+
### 1) 安装依赖
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
bundle install
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### 2) 配置环境变量
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
cp .env.example .env
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
必填数据库变量:
|
|
41
|
+
|
|
42
|
+
- `SMARTRAG_DB_HOST`
|
|
43
|
+
- `SMARTRAG_DB_PORT`
|
|
44
|
+
- `SMARTRAG_DB_NAME`
|
|
45
|
+
- `SMARTRAG_DB_USER`
|
|
46
|
+
- `SMARTRAG_DB_PASSWORD`
|
|
47
|
+
|
|
48
|
+
### 3) 初始化数据库
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
bundle exec rake db:create
|
|
52
|
+
bundle exec rake db:migrate
|
|
53
|
+
bundle exec rake db:seed
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### 4) 导入测试文档(可选)
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
ruby test/import_doc.rb import
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 5) 运行示例程序
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
ruby examples/01_quick_start.rb
|
|
66
|
+
ruby examples/03_search_operations.rb
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## 最小调用示例
|
|
70
|
+
|
|
71
|
+
```ruby
|
|
72
|
+
require "smart_rag"
|
|
73
|
+
|
|
74
|
+
config = SmartRAG::Config.load("config/smart_rag.yml")
|
|
75
|
+
client = SmartRAG::SmartRAG.new(config)
|
|
76
|
+
|
|
77
|
+
client.add_document("test/python_basics.md", generate_embeddings: true)
|
|
78
|
+
results = client.search("机器学习是什么?", search_type: "hybrid", limit: 5)
|
|
79
|
+
|
|
80
|
+
puts results[:results].map { |r| r[:section_title] }
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## 开发常用命令
|
|
84
|
+
|
|
85
|
+
- `bundle exec rspec`:运行 RSpec 测试
|
|
86
|
+
- `ruby test/test_rag.rb`:运行端到端测试脚本
|
|
87
|
+
- `bundle exec rake db:reset`:重建数据库
|
|
88
|
+
- `gem build smart_rag.gemspec`:构建 gem 包
|
|
89
|
+
|
|
90
|
+
## 运维命令(发布前)
|
|
91
|
+
|
|
92
|
+
标准发布前步骤(推荐):
|
|
93
|
+
|
|
94
|
+
1. 迁移数据库
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
bundle exec rake db:migrate
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
2. 回填历史数据字段(`source_type/source_uri/content_hash`)
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
bundle exec rake db:backfill_source_fields
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
3. 执行一键检索发布准备(backfill -> dedupe -> reindex)
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
bundle exec rake db:prepare_release
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
仅预演(不写入):
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
DRY_RUN=1 bundle exec rake db:backfill_source_fields
|
|
116
|
+
DRY_RUN=1 bundle exec rake db:prepare_release
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
## 目录结构
|
|
120
|
+
|
|
121
|
+
```text
|
|
122
|
+
lib/
|
|
123
|
+
smart_rag.rb # 主 API 入口
|
|
124
|
+
smart_rag/core/ # 核心处理逻辑
|
|
125
|
+
smart_rag/services/ # 搜索/标签/嵌入服务
|
|
126
|
+
config/ # 运行时配置
|
|
127
|
+
db/ # 迁移与种子 SQL
|
|
128
|
+
examples/ # 示例代码
|
|
129
|
+
test/ # 手工/E2E 脚本与样例文档
|
|
130
|
+
spec/ # RSpec 测试
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
## 文档导航
|
|
134
|
+
|
|
135
|
+
完整文档清单、阅读顺序和维护建议见 `docs/DOCUMENTATION_INDEX.md`。
|
|
136
|
+
英文版见 `docs/DOCUMENTATION_INDEX.en.md`。
|
|
137
|
+
|
|
138
|
+
## 说明
|
|
139
|
+
|
|
140
|
+
- 部分历史文档仍保留旧默认值(如 OpenAI 示例)。运行时配置以 `config/smart_rag.yml` 为准。
|
|
141
|
+
|
|
142
|
+
## 许可证
|
|
143
|
+
|
|
144
|
+
MIT
|
data/config/database.yml
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Database configuration for different environments
|
|
2
|
+
# This is a sample configuration file
|
|
3
|
+
# Copy and modify for your specific environment
|
|
4
|
+
|
|
5
|
+
default: &default
|
|
6
|
+
adapter: postgresql
|
|
7
|
+
encoding: unicode
|
|
8
|
+
pool: 5
|
|
9
|
+
timeout: 5000
|
|
10
|
+
|
|
11
|
+
development:
|
|
12
|
+
<<: *default
|
|
13
|
+
host: <%= ENV['SMARTRAG_DB_HOST'] || 'localhost' %>
|
|
14
|
+
port: <%= ENV['SMARTRAG_DB_PORT'] || 5432 %>
|
|
15
|
+
database: <%= ENV['SMARTRAG_DB_NAME'] || 'smart_rag_development' %>
|
|
16
|
+
username: <%= ENV['SMARTRAG_DB_USER'] || 'rag_user' %>
|
|
17
|
+
password: <%= ENV['SMARTRAG_DB_PASSWORD'] || 'rag_pwd' %>
|
|
18
|
+
|
|
19
|
+
test:
|
|
20
|
+
<<: *default
|
|
21
|
+
host: <%= ENV['SMARTRAG_TEST_DB_HOST'] || 'localhost' %>
|
|
22
|
+
port: <%= ENV['SMARTRAG_TEST_DB_PORT'] || 5432 %>
|
|
23
|
+
database: <%= ENV['SMARTRAG_TEST_DB_NAME'] || 'smart_rag_test' %>
|
|
24
|
+
username: <%= ENV['SMARTRAG_TEST_DB_USER'] || 'postgres' %>
|
|
25
|
+
password: <%= ENV['SMARTRAG_TEST_DB_PASSWORD'] %>
|
|
26
|
+
|
|
27
|
+
staging:
|
|
28
|
+
<<: *default
|
|
29
|
+
host: <%= ENV['SMARTRAG_DB_HOST'] %>
|
|
30
|
+
port: <%= ENV['SMARTRAG_DB_PORT'] || 5432 %>
|
|
31
|
+
database: <%= ENV['SMARTRAG_DB_NAME'] || 'smart_rag_staging' %>
|
|
32
|
+
username: <%= ENV['SMARTRAG_DB_USER'] %>
|
|
33
|
+
password: <%= ENV['SMARTRAG_DB_PASSWORD'] %>
|
|
34
|
+
|
|
35
|
+
production:
|
|
36
|
+
<<: *default
|
|
37
|
+
host: <%= ENV['SMARTRAG_DB_HOST'] %>
|
|
38
|
+
port: <%= ENV['SMARTRAG_DB_PORT'] || 5432 %>
|
|
39
|
+
database: <%= ENV['SMARTRAG_DB_NAME'] || 'smart_rag_production' %>
|
|
40
|
+
username: <%= ENV['SMARTRAG_DB_USER'] %>
|
|
41
|
+
password: <%= ENV['SMARTRAG_DB_PASSWORD'] %>
|
|
42
|
+
pool: 10
|
|
data/config/fulltext_search.yml
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# Full-text search configuration
|
|
2
|
+
# This file contains language-specific settings for full-text search
|
|
3
|
+
|
|
4
|
+
default: &default
|
|
5
|
+
# Default language for text search
|
|
6
|
+
default_language: en
|
|
7
|
+
|
|
8
|
+
# Maximum number of search results to return
|
|
9
|
+
max_results: 100
|
|
10
|
+
|
|
11
|
+
# Enable Chinese text segmentation with pg_jieba
|
|
12
|
+
enable_jieba: true
|
|
13
|
+
|
|
14
|
+
# Hybrid search weight configuration
|
|
15
|
+
# These weights determine how results are combined in hybrid searches
|
|
16
|
+
hybrid_weight:
|
|
17
|
+
fulltext: 0.4 # Weight for full-text search results
|
|
18
|
+
vector: 0.6 # Weight for vector search results
|
|
19
|
+
|
|
20
|
+
# RRF (Reciprocal Rank Fusion) algorithm parameter
|
|
21
|
+
# Higher k values give more weight to lower-ranked results
|
|
22
|
+
rrf_k: 60
|
|
23
|
+
|
|
24
|
+
# Search result caching
|
|
25
|
+
cache:
|
|
26
|
+
enabled: false # Disabled by default, can be enabled with Redis
|
|
27
|
+
ttl: 3600 # Cache TTL in seconds
|
|
28
|
+
|
|
29
|
+
# Highlighting configuration
|
|
30
|
+
highlighting:
|
|
31
|
+
enabled: true
|
|
32
|
+
max_length: 200 # Maximum length of highlighted snippets
|
|
33
|
+
pre_tags:
|
|
34
|
+
- '<mark>'
|
|
35
|
+
post_tags:
|
|
36
|
+
- '</mark>'
|
|
37
|
+
|
|
38
|
+
# Language-specific configurations
|
|
39
|
+
languages:
|
|
40
|
+
en:
|
|
41
|
+
config_name: pg_catalog.english
|
|
42
|
+
stop_words: true # Remove common words
|
|
43
|
+
stemming: true # Apply stemming
|
|
44
|
+
zh:
|
|
45
|
+
config_name: jieba
|
|
46
|
+
custom_dict: null # Path to custom dictionary
|
|
47
|
+
stop_words: true
|
|
48
|
+
stemming: false
|
|
49
|
+
ja:
|
|
50
|
+
config_name: pg_catalog.simple
|
|
51
|
+
stop_words: true
|
|
52
|
+
stemming: false
|
|
53
|
+
ko:
|
|
54
|
+
config_name: pg_catalog.simple
|
|
55
|
+
stop_words: true
|
|
56
|
+
stemming: false
|
|
57
|
+
|
|
58
|
+
# Query configuration
|
|
59
|
+
queries:
|
|
60
|
+
# Minimum word length to include in search
|
|
61
|
+
min_word_length: 2
|
|
62
|
+
# Maximum number of words in a query
|
|
63
|
+
max_words: 20
|
|
64
|
+
# Allow phrase queries with quotes
|
|
65
|
+
phrase_queries: true
|
|
66
|
+
# Allow boolean operators (AND, OR, NOT)
|
|
67
|
+
boolean_operators: true
|
|
68
|
+
|
|
69
|
+
# Index configuration
|
|
70
|
+
indexes:
|
|
71
|
+
# Use GIN indexes for tsvector columns (recommended)
|
|
72
|
+
use_gin: true
|
|
73
|
+
# Create language-specific partitioned indexes
|
|
74
|
+
partitioned_indexes: true
|
|
75
|
+
# Automatic index maintenance
|
|
76
|
+
auto_rebuild: false
|
|
77
|
+
rebuild_schedule: null # Cron format, e.g., '0 2 * * 0' for weekly on Sunday 2am
|
|
78
|
+
|
|
79
|
+
# Synonyms and thesaurus
|
|
80
|
+
synonyms:
|
|
81
|
+
enabled: false
|
|
82
|
+
# Path to synonym dictionary file
|
|
83
|
+
# Format: word1 word2 word3 (one group per line)
|
|
84
|
+
dictionary_path: null
|
|
85
|
+
|
|
86
|
+
# Performance settings
|
|
87
|
+
performance:
|
|
88
|
+
# Enable query planning with text search
|
|
89
|
+
enable_query_planning: true
|
|
90
|
+
# Work memory for sorting (increase for large result sets)
|
|
91
|
+
work_mem: '4MB'
|
|
92
|
+
|
|
93
|
+
# Monitoring and logging
|
|
94
|
+
monitoring:
|
|
95
|
+
# Log slow queries for analysis
|
|
96
|
+
log_slow_queries: true
|
|
97
|
+
# Threshold for slow query logging (ms)
|
|
98
|
+
slow_query_threshold: 100
|
|
99
|
+
# Track query performance over time
|
|
100
|
+
track_performance: true
|
|
101
|
+
# Retention period for query logs (days)
|
|
102
|
+
log_retention_days: 30
|
|
103
|
+
|
|
104
|
+
# Advanced settings
|
|
105
|
+
advanced:
|
|
106
|
+
# Consider word proximity in ranking
|
|
107
|
+
rank_by_proximity: true
|
|
108
|
+
# Weight for title matches vs content
|
|
109
|
+
title_weight: 1.5
|
|
110
|
+
# Normalize rankings
|
|
111
|
+
normalize_rank: true
|
|
data/config/llm_config.yml
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
adapters:
|
|
2
|
+
openai: OpenAIAdapter
|
|
3
|
+
logger_file: ./log/prompt.log
|
|
4
|
+
llms:
|
|
5
|
+
OllamaEmbedding:
|
|
6
|
+
adapter: openai
|
|
7
|
+
url: http://localhost:11434/v1/
|
|
8
|
+
# Ollama's OpenAI-compatible endpoint typically ignores api_key; keep a default placeholder.
|
|
9
|
+
api_key: ollama-local
|
|
10
|
+
SiliconFlow:
|
|
11
|
+
adapter: openai
|
|
12
|
+
url: https://api.siliconflow.cn/v1/
|
|
13
|
+
api_key: ""
|
|
14
|
+
worker_path: "./workers"
|
|
15
|
+
template_path: "./templates"
|
|
data/config/smart_rag.yml
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# SmartRAG Configuration File
|
|
2
|
+
# Copy this file and modify for your environment
|
|
3
|
+
#
|
|
4
|
+
# Production: config/smart_rag.production.yml
|
|
5
|
+
# Development: config/smart_rag.development.yml
|
|
6
|
+
# Test: config/smart_rag.test.yml
|
|
7
|
+
|
|
8
|
+
database:
|
|
9
|
+
adapter: postgresql
|
|
10
|
+
host: localhost
|
|
11
|
+
port: 5432
|
|
12
|
+
# Use environment variables for sensitive data
|
|
13
|
+
database: <%= ENV['SMARTRAG_DB_NAME'] || 'smart_rag_development' %>
|
|
14
|
+
username: <%= ENV['SMARTRAG_DB_USER'] || 'rag_user' %>
|
|
15
|
+
password: <%= ENV['SMARTRAG_DB_PASSWORD'] || 'rag_pwd' %>
|
|
16
|
+
pool: 5
|
|
17
|
+
encoding: unicode
|
|
18
|
+
# Connection timeout (NOTE(review): 5000 looks like milliseconds, not seconds — confirm the unit the consumer expects)
|
|
19
|
+
timeout: 5000
|
|
20
|
+
# Extensions to enable
|
|
21
|
+
extensions:
|
|
22
|
+
- pgvector
|
|
23
|
+
- pg_jieba # For Chinese text segmentation
|
|
24
|
+
|
|
25
|
+
# Embedding Configuration
|
|
26
|
+
embedding:
|
|
27
|
+
# Provider can be: openai, azure_openai, local, or custom
|
|
28
|
+
provider: <%= ENV['EMBEDDING_PROVIDER'] || 'local' %>
|
|
29
|
+
api_key: <%= ENV['EMBEDDING_API_KEY'] || 'ollama-local' %>
|
|
30
|
+
endpoint: <%= ENV['EMBEDDING_ENDPOINT'] || 'http://localhost:11434/v1/embeddings' %>
|
|
31
|
+
model: <%= ENV['EMBEDDING_MODEL'] || 'qwen3-embedding' %>
|
|
32
|
+
dimensions: <%= ENV['EMBEDDING_DIMENSIONS'] || 1024 %>
|
|
33
|
+
# Request timeout in seconds
|
|
34
|
+
timeout: 30
|
|
35
|
+
# Batch size for embedding generation
|
|
36
|
+
batch_size: 10
|
|
37
|
+
|
|
38
|
+
# Full-Text Search Configuration
|
|
39
|
+
fulltext_search:
|
|
40
|
+
# Default language for text search
|
|
41
|
+
default_language: <%= ENV['DEFAULT_LANGUAGE'] || 'en' %>
|
|
42
|
+
|
|
43
|
+
# Maximum number of search results to return
|
|
44
|
+
max_results: <%= ENV['MAX_SEARCH_RESULTS'] || 100 %>
|
|
45
|
+
|
|
46
|
+
# Enable Chinese text segmentation with pg_jieba
|
|
47
|
+
enable_jieba: <%= ENV['ENABLE_JIEBA'] != 'false' %>
|
|
48
|
+
|
|
49
|
+
# Path to custom dictionary for jieba (optional)
|
|
50
|
+
custom_dict_path: <%= ENV['JIEBA_DICT_PATH'] %>
|
|
51
|
+
|
|
52
|
+
# Hybrid search weight configuration (0.0 to 1.0)
|
|
53
|
+
hybrid_weight:
|
|
54
|
+
fulltext: <%= ENV['FULLTEXT_WEIGHT'] || 0.4 %>
|
|
55
|
+
vector: <%= ENV['VECTOR_WEIGHT'] || 0.6 %>
|
|
56
|
+
|
|
57
|
+
# RRF (Reciprocal Rank Fusion) algorithm parameter
|
|
58
|
+
# Higher k values give more weight to lower-ranked results
|
|
59
|
+
rrf_k: <%= ENV['RRF_K'] || 60 %>
|
|
60
|
+
|
|
61
|
+
# Search result caching configuration (enabled unless CACHE_ENABLED is set to 'false')
|
|
62
|
+
cache:
|
|
63
|
+
enabled: <%= ENV['CACHE_ENABLED'] != 'false' %>
|
|
64
|
+
ttl: <%= ENV['CACHE_TTL'] || 3600 %> # Cache TTL in seconds
|
|
65
|
+
# Optional Redis configuration
|
|
66
|
+
redis_url: <%= ENV['REDIS_URL'] %>
|
|
67
|
+
|
|
68
|
+
# Query logging and monitoring
|
|
69
|
+
monitoring:
|
|
70
|
+
log_slow_queries: <%= ENV['LOG_SLOW_QUERIES'] != 'false' %>
|
|
71
|
+
slow_query_threshold_ms: <%= ENV['SLOW_QUERY_THRESHOLD'] || 100 %>
|
|
72
|
+
|
|
73
|
+
# Index maintenance
|
|
74
|
+
index:
|
|
75
|
+
enable_partition: <%= ENV['ENABLE_PARTITION'] != 'false' %>
|
|
76
|
+
auto_vacuum: <%= ENV['AUTO_VACUUM'] != 'false' %>
|
|
77
|
+
# Schedule for automatic index rebuild (cron format)
|
|
78
|
+
rebuild_schedule: <%= ENV['INDEX_REBUILD_SCHEDULE'] %>
|
|
79
|
+
|
|
80
|
+
# Document Processing Configuration
|
|
81
|
+
chunking:
|
|
82
|
+
# Maximum characters per chunk
|
|
83
|
+
max_chars: <%= ENV['CHUNK_MAX_CHARS'] || 4000 %>
|
|
84
|
+
|
|
85
|
+
# Overlap between chunks to preserve context
|
|
86
|
+
overlap: <%= ENV['CHUNK_OVERLAP'] || 100 %>
|
|
87
|
+
|
|
88
|
+
# Enable splitting by Markdown headers first
|
|
89
|
+
split_by_headers: <%= ENV['SPLIT_BY_HEADERS'] != 'false' %>
|
|
90
|
+
|
|
91
|
+
# Minimum chunk size (discard chunks smaller than this)
|
|
92
|
+
min_chunk_size: <%= ENV['MIN_CHUNK_SIZE'] || 100 %>
|
|
93
|
+
|
|
94
|
+
# Search Configuration
|
|
95
|
+
search:
|
|
96
|
+
# Default number of results to return
|
|
97
|
+
default_limit: <%= ENV['DEFAULT_SEARCH_LIMIT'] || 5 %>
|
|
98
|
+
|
|
99
|
+
# Boost weight for tag matches in vector search (0.0 to 1.0)
|
|
100
|
+
tag_boost_weight: <%= ENV['TAG_BOOST_WEIGHT'] || 0.5 %>
|
|
101
|
+
|
|
102
|
+
# Maximum distance threshold for vector search results
|
|
103
|
+
max_distance_threshold: <%= ENV['MAX_DISTANCE_THRESHOLD'] || 0.3 %>
|
|
104
|
+
|
|
105
|
+
# LLM Configuration for tags and summarization
|
|
106
|
+
llm:
|
|
107
|
+
provider: <%= ENV['LLM_PROVIDER'] || 'openai' %>
|
|
108
|
+
api_key: <%= ENV['LLM_API_KEY'] || 'ollama-local' %>
|
|
109
|
+
endpoint: <%= ENV['LLM_ENDPOINT'] || 'http://localhost:11434/v1/chat/completions' %>
|
|
110
|
+
model: <%= ENV['LLM_MODEL'] || 'qwen3' %>
|
|
111
|
+
|
|
112
|
+
# Generation parameters
|
|
113
|
+
temperature: <%= ENV['LLM_TEMPERATURE'] || 0.3 %>
|
|
114
|
+
max_tokens: <%= ENV['LLM_MAX_TOKENS'] || 2000 %>
|
|
115
|
+
timeout: <%= ENV['LLM_TIMEOUT'] || 60 %>
|
|
116
|
+
|
|
117
|
+
# Document Processing
|
|
118
|
+
document:
|
|
119
|
+
# Supported file types for upload/processing
|
|
120
|
+
supported_types:
|
|
121
|
+
- .md
|
|
122
|
+
- .txt
|
|
123
|
+
- .pdf
|
|
124
|
+
- .docx
|
|
125
|
+
- .html
|
|
126
|
+
|
|
127
|
+
# Maximum file size in MB
|
|
128
|
+
max_file_size: <%= ENV['MAX_FILE_SIZE'] || 50 %>
|
|
129
|
+
|
|
130
|
+
# Download timeout for URLs
|
|
131
|
+
download_timeout: <%= ENV['DOWNLOAD_TIMEOUT'] || 30 %>
|
|
132
|
+
|
|
133
|
+
# Logging Configuration
|
|
134
|
+
logging:
|
|
135
|
+
# Log level: debug, info, warn, error, fatal
|
|
136
|
+
level: <%= ENV['LOG_LEVEL'] || 'info' %>
|
|
137
|
+
|
|
138
|
+
# Log format: json, plain
|
|
139
|
+
format: <%= ENV['LOG_FORMAT'] || 'json' %>
|
|
140
|
+
|
|
141
|
+
# Log file path (null for stdout)
|
|
142
|
+
file_path: <%= ENV['LOG_FILE_PATH'] %>
|
|
143
|
+
|
|
144
|
+
# Enable query logging
|
|
145
|
+
enable_query_log: <%= ENV['ENABLE_QUERY_LOG'] != 'false' %>
|
|
146
|
+
|
|
147
|
+
# Performance Settings
|
|
148
|
+
performance:
|
|
149
|
+
# Thread pool size for parallel processing
|
|
150
|
+
thread_pool_size: <%= ENV['THREAD_POOL_SIZE'] || 4 %>
|
|
151
|
+
|
|
152
|
+
# Batch size for database operations
|
|
153
|
+
db_batch_size: <%= ENV['DB_BATCH_SIZE'] || 100 %>
|
|
154
|
+
|
|
155
|
+
# Enable connection pooling
|
|
156
|
+
connection_pooling: <%= ENV['CONNECTION_POOLING'] != 'false' %>
|