smart_rag 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG.md +33 -0
  3. data/README.en.md +115 -0
  4. data/README.md +144 -0
  5. data/config/database.yml +42 -0
  6. data/config/fulltext_search.yml +111 -0
  7. data/config/llm_config.yml +15 -0
  8. data/config/smart_rag.yml +156 -0
  9. data/db/fix_search_issues.sql +81 -0
  10. data/db/migrations/001_create_source_documents.rb +26 -0
  11. data/db/migrations/002_create_source_sections.rb +20 -0
  12. data/db/migrations/003_create_tags.rb +17 -0
  13. data/db/migrations/004_create_research_topics.rb +16 -0
  14. data/db/migrations/005_create_relationship_tables.rb +42 -0
  15. data/db/migrations/006_create_text_search_configs.rb +28 -0
  16. data/db/migrations/007_create_section_fts.rb +109 -0
  17. data/db/migrations/008_create_embeddings.rb +28 -0
  18. data/db/migrations/009_create_search_logs.rb +30 -0
  19. data/db/migrations/010_add_metadata_to_source_documents.rb +10 -0
  20. data/db/migrations/011_add_source_fields_to_source_documents.rb +23 -0
  21. data/db/rebuild_fts_complete.sql +51 -0
  22. data/db/seeds/text_search_configs.sql +28 -0
  23. data/examples/01_quick_start.rb +32 -0
  24. data/examples/02_document_management.rb +41 -0
  25. data/examples/03_search_operations.rb +46 -0
  26. data/examples/04_topics_and_tags.rb +38 -0
  27. data/examples/05_advanced_patterns.rb +154 -0
  28. data/examples/06_error_handling_and_retry.rb +64 -0
  29. data/examples/README.md +42 -0
  30. data/examples/common.rb +57 -0
  31. data/lib/smart_rag/chunker/markdown_chunker.rb +315 -0
  32. data/lib/smart_rag/config.rb +126 -0
  33. data/lib/smart_rag/core/document_processor.rb +537 -0
  34. data/lib/smart_rag/core/embedding.rb +340 -0
  35. data/lib/smart_rag/core/fulltext_manager.rb +483 -0
  36. data/lib/smart_rag/core/markitdown_bridge.rb +85 -0
  37. data/lib/smart_rag/core/query_processor.rb +577 -0
  38. data/lib/smart_rag/errors.rb +88 -0
  39. data/lib/smart_rag/models/embedding.rb +140 -0
  40. data/lib/smart_rag/models/model_base.rb +106 -0
  41. data/lib/smart_rag/models/research_topic.rb +171 -0
  42. data/lib/smart_rag/models/research_topic_section.rb +86 -0
  43. data/lib/smart_rag/models/research_topic_tag.rb +89 -0
  44. data/lib/smart_rag/models/search_log.rb +198 -0
  45. data/lib/smart_rag/models/section_fts.rb +170 -0
  46. data/lib/smart_rag/models/section_tag.rb +81 -0
  47. data/lib/smart_rag/models/source_document.rb +204 -0
  48. data/lib/smart_rag/models/source_section.rb +201 -0
  49. data/lib/smart_rag/models/tag.rb +214 -0
  50. data/lib/smart_rag/models/text_search_config.rb +168 -0
  51. data/lib/smart_rag/models.rb +116 -0
  52. data/lib/smart_rag/parsers/query_parser.rb +291 -0
  53. data/lib/smart_rag/retrieve.rb +745 -0
  54. data/lib/smart_rag/services/embedding_service.rb +278 -0
  55. data/lib/smart_rag/services/fulltext_search_service.rb +456 -0
  56. data/lib/smart_rag/services/hybrid_search_service.rb +768 -0
  57. data/lib/smart_rag/services/summarization_service.rb +322 -0
  58. data/lib/smart_rag/services/tag_service.rb +614 -0
  59. data/lib/smart_rag/services/vector_search_service.rb +347 -0
  60. data/lib/smart_rag/smart_chunking/chunk.rb +10 -0
  61. data/lib/smart_rag/smart_chunking/media_context.rb +9 -0
  62. data/lib/smart_rag/smart_chunking/merger.rb +94 -0
  63. data/lib/smart_rag/smart_chunking/parser.rb +75 -0
  64. data/lib/smart_rag/smart_chunking/pipeline.rb +45 -0
  65. data/lib/smart_rag/smart_chunking/section.rb +11 -0
  66. data/lib/smart_rag/smart_chunking/structure_detector.rb +31 -0
  67. data/lib/smart_rag/smart_chunking/tokenizer.rb +24 -0
  68. data/lib/smart_rag/version.rb +3 -0
  69. data/lib/smart_rag.rb +986 -0
  70. data/workers/analyze_content.rb +6 -0
  71. data/workers/get_embedding.rb +7 -0
  72. metadata +311 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 19e9abcf3817e98fad15499879a25db6fbb563bc189f5a8f0c58fbea2145cbf8
4
+ data.tar.gz: 88f7e4a0f1929b83c9346e8261928bdd72690de4c9da3689a3ca7882a548a6b1
5
+ SHA512:
6
+ metadata.gz: f3c7c945032e61c816f383ea6ae1ea87b0be9d4fb2a794175543489e91763825f73fbb925c2a52b37d786af2e1dd126180b4da3cb5c31ded4130e1fa5bc4ebda
7
+ data.tar.gz: 6de0148938c32c3842d3d3e2a8f31641c508be193d70e9fa11d01c93dd413e3e805f2fec3afb348981455566989654436cb5e7f70c0351b2b94e1dc21978a94a
data/CHANGELOG.md ADDED
@@ -0,0 +1,33 @@
1
+ # Changelog
2
+
3
+ ## Unreleased
4
+
5
+ ### Added
6
+ - 新增 `retrieve(plan:)` 结构化检索入口(`RetrievalPlan -> EvidencePack`)。
7
+ - 新增 `SmartRAG::Retrieve` 执行器:支持多 query、mode 映射、signals、provenance、stats、explain。
8
+ - 新增索引治理接口:
9
+ - `rebuild_fts(document_id=nil)`
10
+ - `rebuild_embeddings(document_id=nil)`
11
+ - `reindex(document_id=nil)`
12
+ - `dedupe_by_content_hash`
13
+ - 新增 `source_documents` 字段与索引:
14
+ - `source_type`
15
+ - `source_uri`
16
+ - `content_hash`
17
+ - 新增轻量单测入口 `spec/unit_spec_helper.rb`(不依赖数据库连接)。
18
+ - 新增回填任务:`rake db:backfill_source_fields`(历史数据回填新字段)。
19
+ - 新增一键发布任务:`rake db:prepare_release`(backfill -> dedupe -> reindex)。
20
+ - 新增 API:
21
+ - `backfill_source_fields(limit: nil, dry_run: false)`
22
+ - `prepare_release_indexes(document_id: nil, dry_run: false)`
23
+
24
+ ### Changed
25
+ - `retrieve` 现支持 `global_filters.source_type` 与 `global_filters.source_uri_prefix` 的执行过滤。
26
+ - `retrieve` 新增 `global_filters.topic_ids` 的执行过滤(按 section-topic 关系过滤)。
27
+ - `retrieve` 新增 `budget.diversity.by_source` 执行约束。
28
+ - `dedupe_by_content_hash` 从“仅 content_hash”升级为“`source_uri + content_hash`”去重。
29
+ - 检索日志 (`search_logs.filters`) 新增保存 `plan/stats/explain/warnings`,用于回放与调试。
30
+
31
+ ### Compatibility
32
+ - 保留 `search(...)` 旧接口,不破坏现有调用。
33
+ - 对未支持字段通过 `explain.ignored_fields` 明确返回,不做静默忽略。
data/README.en.md ADDED
@@ -0,0 +1,115 @@
1
+ # SmartRAG
2
+
3
+ [中文 README](README.md)
4
+
5
+ SmartRAG is a Ruby-based hybrid RAG library that combines vector retrieval, full-text search, and topic/tag organization for document intelligence workflows.
6
+
7
+ ## Overview
8
+
9
+ - Hybrid retrieval: vector + full-text + weighted fusion
10
+ - Document ingestion from local files and URLs
11
+ - Topic and tag management APIs
12
+ - Search logs and system statistics
13
+ - Runnable example scripts for quick onboarding
14
+
15
+ ## Default Model Setup
16
+
17
+ Current defaults use local Ollama-compatible endpoints:
18
+
19
+ - Embedding model: `qwen3-embedding`
20
+ - Text LLM model: `qwen3`
21
+ - Embedding endpoint: `http://localhost:11434/v1/embeddings`
22
+ - LLM endpoint: `http://localhost:11434/v1/chat/completions`
23
+
24
+ You can override these via `.env` or `config/smart_rag.yml`.
25
+
26
+ ## Quick Start
27
+
28
+ ### 1) Install dependencies
29
+
30
+ ```bash
31
+ bundle install
32
+ ```
33
+
34
+ ### 2) Configure environment
35
+
36
+ ```bash
37
+ cp .env.example .env
38
+ ```
39
+
40
+ Required DB variables:
41
+
42
+ - `SMARTRAG_DB_HOST`
43
+ - `SMARTRAG_DB_PORT`
44
+ - `SMARTRAG_DB_NAME`
45
+ - `SMARTRAG_DB_USER`
46
+ - `SMARTRAG_DB_PASSWORD`
47
+
48
+ ### 3) Setup database
49
+
50
+ ```bash
51
+ bundle exec rake db:create
52
+ bundle exec rake db:migrate
53
+ bundle exec rake db:seed
54
+ ```
55
+
56
+ ### 4) Import test docs (optional)
57
+
58
+ ```bash
59
+ ruby test/import_doc.rb import
60
+ ```
61
+
62
+ ### 5) Run sample scripts
63
+
64
+ ```bash
65
+ ruby examples/01_quick_start.rb
66
+ ruby examples/03_search_operations.rb
67
+ ```
68
+
69
+ ## Minimal Usage
70
+
71
+ ```ruby
72
+ require "smart_rag"
73
+
74
+ config = SmartRAG::Config.load("config/smart_rag.yml")
75
+ client = SmartRAG::SmartRAG.new(config)
76
+
77
+ client.add_document("test/python_basics.md", generate_embeddings: true)
78
+ results = client.search("What is machine learning?", search_type: "hybrid", limit: 5)
79
+
80
+ puts results[:results].map { |r| r[:section_title] }
81
+ ```
82
+
83
+ ## Development Commands
84
+
85
+ - `bundle exec rspec`: run RSpec tests
86
+ - `ruby test/test_rag.rb`: run E2E script
87
+ - `bundle exec rake db:reset`: recreate database
88
+ - `gem build smart_rag.gemspec`: build gem package
89
+
90
+ ## Project Structure
91
+
92
+ ```text
93
+ lib/
94
+ smart_rag.rb # Main API entry
95
+ smart_rag/core/ # Core processing logic
96
+ smart_rag/services/ # Search/tag/embedding services
97
+ config/ # Runtime config files
98
+ db/ # Migrations and seed SQL
99
+ examples/ # Example programs
100
+ test/ # Manual/E2E scripts + sample docs
101
+ spec/ # RSpec tests
102
+ ```
103
+
104
+ ## Documentation Map
105
+
106
+ See `docs/DOCUMENTATION_INDEX.en.md` for a curated map of all docs, reading order, and maintenance notes.
107
+ Chinese version: `docs/DOCUMENTATION_INDEX.md`.
108
+
109
+ ## Notes
110
+
111
+ - Some legacy docs still contain older defaults (for example, OpenAI references). The authoritative runtime configuration is `config/smart_rag.yml`.
112
+
113
+ ## License
114
+
115
+ MIT
data/README.md ADDED
@@ -0,0 +1,144 @@
1
+ # SmartRAG
2
+
3
+ [English README](README.en.md)
4
+
5
+ SmartRAG 是一个 Ruby 混合检索增强生成(RAG)库,结合向量检索、全文检索与主题/标签管理,用于文档智能检索与问答场景。
6
+
7
+ ## 项目概览
8
+
9
+ - 混合检索:向量检索 + 全文检索 + 权重融合
10
+ - 支持本地文件与 URL 文档导入
11
+ - 提供主题与标签管理 API
12
+ - 提供搜索日志与系统统计能力
13
+ - 内置示例脚本便于快速上手
14
+
15
+ ## 默认模型配置
16
+
17
+ 当前默认配置为本地 Ollama 兼容端点:
18
+
19
+ - Embedding 模型:`qwen3-embedding`
20
+ - 文本 LLM 模型:`qwen3`
21
+ - Embedding 端点:`http://localhost:11434/v1/embeddings`
22
+ - LLM 端点:`http://localhost:11434/v1/chat/completions`
23
+
24
+ 可通过 `.env` 或 `config/smart_rag.yml` 覆盖以上默认值。
25
+
26
+ ## 快速开始
27
+
28
+ ### 1) 安装依赖
29
+
30
+ ```bash
31
+ bundle install
32
+ ```
33
+
34
+ ### 2) 配置环境变量
35
+
36
+ ```bash
37
+ cp .env.example .env
38
+ ```
39
+
40
+ 必填数据库变量:
41
+
42
+ - `SMARTRAG_DB_HOST`
43
+ - `SMARTRAG_DB_PORT`
44
+ - `SMARTRAG_DB_NAME`
45
+ - `SMARTRAG_DB_USER`
46
+ - `SMARTRAG_DB_PASSWORD`
47
+
48
+ ### 3) 初始化数据库
49
+
50
+ ```bash
51
+ bundle exec rake db:create
52
+ bundle exec rake db:migrate
53
+ bundle exec rake db:seed
54
+ ```
55
+
56
+ ### 4) 导入测试文档(可选)
57
+
58
+ ```bash
59
+ ruby test/import_doc.rb import
60
+ ```
61
+
62
+ ### 5) 运行示例程序
63
+
64
+ ```bash
65
+ ruby examples/01_quick_start.rb
66
+ ruby examples/03_search_operations.rb
67
+ ```
68
+
69
+ ## 最小调用示例
70
+
71
+ ```ruby
72
+ require "smart_rag"
73
+
74
+ config = SmartRAG::Config.load("config/smart_rag.yml")
75
+ client = SmartRAG::SmartRAG.new(config)
76
+
77
+ client.add_document("test/python_basics.md", generate_embeddings: true)
78
+ results = client.search("机器学习是什么?", search_type: "hybrid", limit: 5)
79
+
80
+ puts results[:results].map { |r| r[:section_title] }
81
+ ```
82
+
83
+ ## 开发常用命令
84
+
85
+ - `bundle exec rspec`:运行 RSpec 测试
86
+ - `ruby test/test_rag.rb`:运行端到端测试脚本
87
+ - `bundle exec rake db:reset`:重建数据库
88
+ - `gem build smart_rag.gemspec`:构建 gem 包
89
+
90
+ ## 运维命令(发布前)
91
+
92
+ 标准发布前步骤(推荐):
93
+
94
+ 1. 迁移数据库
95
+
96
+ ```bash
97
+ bundle exec rake db:migrate
98
+ ```
99
+
100
+ 2. 回填历史数据字段(`source_type/source_uri/content_hash`)
101
+
102
+ ```bash
103
+ bundle exec rake db:backfill_source_fields
104
+ ```
105
+
106
+ 3. 执行一键检索发布准备(backfill -> dedupe -> reindex)
107
+
108
+ ```bash
109
+ bundle exec rake db:prepare_release
110
+ ```
111
+
112
+ 仅预演(不写入):
113
+
114
+ ```bash
115
+ DRY_RUN=1 bundle exec rake db:backfill_source_fields
116
+ DRY_RUN=1 bundle exec rake db:prepare_release
117
+ ```
118
+
119
+ ## 目录结构
120
+
121
+ ```text
122
+ lib/
123
+ smart_rag.rb # 主 API 入口
124
+ smart_rag/core/ # 核心处理逻辑
125
+ smart_rag/services/ # 搜索/标签/嵌入服务
126
+ config/ # 运行时配置
127
+ db/ # 迁移与种子 SQL
128
+ examples/ # 示例代码
129
+ test/ # 手工/E2E 脚本与样例文档
130
+ spec/ # RSpec 测试
131
+ ```
132
+
133
+ ## 文档导航
134
+
135
+ 完整文档清单、阅读顺序和维护建议见 `docs/DOCUMENTATION_INDEX.md`。
136
+ 英文版见 `docs/DOCUMENTATION_INDEX.en.md`。
137
+
138
+ ## 说明
139
+
140
+ - 部分历史文档仍保留旧默认值(如 OpenAI 示例)。运行时配置以 `config/smart_rag.yml` 为准。
141
+
142
+ ## 许可证
143
+
144
+ MIT
data/config/database.yml ADDED
@@ -0,0 +1,42 @@
1
+ # Database configuration for different environments
2
+ # This is a sample configuration file
3
+ # Copy and modify for your specific environment
4
+
5
+ default: &default
6
+ adapter: postgresql
7
+ encoding: unicode
8
+ pool: 5
9
+ timeout: 5000
10
+
11
+ development:
12
+ <<: *default
13
+ host: <%= ENV['SMARTRAG_DB_HOST'] || 'localhost' %>
14
+ port: <%= ENV['SMARTRAG_DB_PORT'] || 5432 %>
15
+ database: <%= ENV['SMARTRAG_DB_NAME'] || 'smart_rag_development' %>
16
+ username: <%= ENV['SMARTRAG_DB_USER'] || 'rag_user' %>
17
+ password: <%= ENV['SMARTRAG_DB_PASSWORD'] || 'rag_pwd' %>
18
+
19
+ test:
20
+ <<: *default
21
+ host: <%= ENV['SMARTRAG_TEST_DB_HOST'] || 'localhost' %>
22
+ port: <%= ENV['SMARTRAG_TEST_DB_PORT'] || 5432 %>
23
+ database: <%= ENV['SMARTRAG_TEST_DB_NAME'] || 'smart_rag_test' %>
24
+ username: <%= ENV['SMARTRAG_TEST_DB_USER'] || 'postgres' %>
25
+ password: <%= ENV['SMARTRAG_TEST_DB_PASSWORD'] %>
26
+
27
+ staging:
28
+ <<: *default
29
+ host: <%= ENV['SMARTRAG_DB_HOST'] %>
30
+ port: <%= ENV['SMARTRAG_DB_PORT'] || 5432 %>
31
+ database: <%= ENV['SMARTRAG_DB_NAME'] || 'smart_rag_staging' %>
32
+ username: <%= ENV['SMARTRAG_DB_USER'] %>
33
+ password: <%= ENV['SMARTRAG_DB_PASSWORD'] %>
34
+
35
+ production:
36
+ <<: *default
37
+ host: <%= ENV['SMARTRAG_DB_HOST'] %>
38
+ port: <%= ENV['SMARTRAG_DB_PORT'] || 5432 %>
39
+ database: <%= ENV['SMARTRAG_DB_NAME'] || 'smart_rag_production' %>
40
+ username: <%= ENV['SMARTRAG_DB_USER'] %>
41
+ password: <%= ENV['SMARTRAG_DB_PASSWORD'] %>
42
+ pool: 10
data/config/fulltext_search.yml ADDED
@@ -0,0 +1,111 @@
1
+ # Full-text search configuration
2
+ # This file contains language-specific settings for full-text search
3
+
4
+ default: &default
5
+ # Default language for text search
6
+ default_language: en
7
+
8
+ # Maximum number of search results to return
9
+ max_results: 100
10
+
11
+ # Enable Chinese text segmentation with pg_jieba
12
+ enable_jieba: true
13
+
14
+ # Hybrid search weight configuration
15
+ # These weights determine how results are combined in hybrid searches
16
+ hybrid_weight:
17
+ fulltext: 0.4 # Weight for full-text search results
18
+ vector: 0.6 # Weight for vector search results
19
+
20
+ # RRF (Reciprocal Rank Fusion) algorithm parameter
21
+ # Higher k values flatten the score differences between ranks, giving lower-ranked results relatively more weight
22
+ rrf_k: 60
23
+
24
+ # Search result caching
25
+ cache:
26
+ enabled: false # Disabled by default, can be enabled with Redis
27
+ ttl: 3600 # Cache TTL in seconds
28
+
29
+ # Highlighting configuration
30
+ highlighting:
31
+ enabled: true
32
+ max_length: 200 # Maximum length of highlighted snippets
33
+ pre_tags:
34
+ - '<mark>'
35
+ post_tags:
36
+ - '</mark>'
37
+
38
+ # Language-specific configurations
39
+ languages:
40
+ en:
41
+ config_name: pg_catalog.english
42
+ stop_words: true # Remove common words
43
+ stemming: true # Apply stemming
44
+ zh:
45
+ config_name: jieba
46
+ custom_dict: null # Path to custom dictionary
47
+ stop_words: true
48
+ stemming: false
49
+ ja:
50
+ config_name: pg_catalog.simple
51
+ stop_words: true
52
+ stemming: false
53
+ ko:
54
+ config_name: pg_catalog.simple
55
+ stop_words: true
56
+ stemming: false
57
+
58
+ # Query configuration
59
+ queries:
60
+ # Minimum word length to include in search
61
+ min_word_length: 2
62
+ # Maximum number of words in a query
63
+ max_words: 20
64
+ # Allow phrase queries with quotes
65
+ phrase_queries: true
66
+ # Allow boolean operators (AND, OR, NOT)
67
+ boolean_operators: true
68
+
69
+ # Index configuration
70
+ indexes:
71
+ # Use GIN indexes for tsvector columns (recommended)
72
+ use_gin: true
73
+ # Create language-specific partitioned indexes
74
+ partitioned_indexes: true
75
+ # Automatic index maintenance
76
+ auto_rebuild: false
77
+ rebuild_schedule: null # Cron format, e.g., '0 2 * * 0' for weekly on Sunday 2am
78
+
79
+ # Synonyms and thesaurus
80
+ synonyms:
81
+ enabled: false
82
+ # Path to synonym dictionary file
83
+ # Format: word1 word2 word3 (one group per line)
84
+ dictionary_path: null
85
+
86
+ # Performance settings
87
+ performance:
88
+ # Enable query planning with text search
89
+ enable_query_planning: true
90
+ # Work memory for sorting (increase for large result sets)
91
+ work_mem: '4MB'
92
+
93
+ # Monitoring and logging
94
+ monitoring:
95
+ # Log slow queries for analysis
96
+ log_slow_queries: true
97
+ # Threshold for slow query logging (ms)
98
+ slow_query_threshold: 100
99
+ # Track query performance over time
100
+ track_performance: true
101
+ # Retention period for query logs (days)
102
+ log_retention_days: 30
103
+
104
+ # Advanced settings
105
+ advanced:
106
+ # Consider word proximity in ranking
107
+ rank_by_proximity: true
108
+ # Weight for title matches vs content
109
+ title_weight: 1.5
110
+ # Normalize rankings
111
+ normalize_rank: true
data/config/llm_config.yml ADDED
@@ -0,0 +1,15 @@
1
+ adapters:
2
+ openai: OpenAIAdapter
3
+ logger_file: ./log/prompt.log
4
+ llms:
5
+ OllamaEmbedding:
6
+ adapter: openai
7
+ url: http://localhost:11434/v1/
8
+ # Ollama's OpenAI-compatible endpoint typically ignores api_key; keep a placeholder value.
9
+ api_key: ollama-local
10
+ SiliconFlow:
11
+ adapter: openai
12
+ url: https://api.siliconflow.cn/v1/
13
+ api_key: ""
14
+ worker_path: "./workers"
15
+ template_path: "./templates"
data/config/smart_rag.yml ADDED
@@ -0,0 +1,156 @@
1
+ # SmartRAG Configuration File
2
+ # Copy this file and modify for your environment
3
+ #
4
+ # Production: config/smart_rag.production.yml
5
+ # Development: config/smart_rag.development.yml
6
+ # Test: config/smart_rag.test.yml
7
+
8
+ database:
9
+ adapter: postgresql
10
+ host: localhost
11
+ port: 5432
12
+ # Use environment variables for sensitive data
13
+ database: <%= ENV['SMARTRAG_DB_NAME'] || 'smart_rag_development' %>
14
+ username: <%= ENV['SMARTRAG_DB_USER'] || 'rag_user' %>
15
+ password: <%= ENV['SMARTRAG_DB_PASSWORD'] || 'rag_pwd' %>
16
+ pool: 5
17
+ encoding: unicode
18
+ # Connection timeout (NOTE(review): 5000 is presumably milliseconds, not seconds; confirm against the DB adapter)
19
+ timeout: 5000
20
+ # Extensions to enable
21
+ extensions:
22
+ - pgvector
23
+ - pg_jieba # For Chinese text segmentation
24
+
25
+ # Embedding Configuration
26
+ embedding:
27
+ # Provider can be: openai, azure_openai, local, or custom
28
+ provider: <%= ENV['EMBEDDING_PROVIDER'] || 'local' %>
29
+ api_key: <%= ENV['EMBEDDING_API_KEY'] || 'ollama-local' %>
30
+ endpoint: <%= ENV['EMBEDDING_ENDPOINT'] || 'http://localhost:11434/v1/embeddings' %>
31
+ model: <%= ENV['EMBEDDING_MODEL'] || 'qwen3-embedding' %>
32
+ dimensions: <%= ENV['EMBEDDING_DIMENSIONS'] || 1024 %>
33
+ # Request timeout in seconds
34
+ timeout: 30
35
+ # Batch size for embedding generation
36
+ batch_size: 10
37
+
38
+ # Full-Text Search Configuration
39
+ fulltext_search:
40
+ # Default language for text search
41
+ default_language: <%= ENV['DEFAULT_LANGUAGE'] || 'en' %>
42
+
43
+ # Maximum number of search results to return
44
+ max_results: <%= ENV['MAX_SEARCH_RESULTS'] || 100 %>
45
+
46
+ # Enable Chinese text segmentation with pg_jieba
47
+ enable_jieba: <%= ENV['ENABLE_JIEBA'] != 'false' %>
48
+
49
+ # Path to custom dictionary for jieba (optional)
50
+ custom_dict_path: <%= ENV['JIEBA_DICT_PATH'] %>
51
+
52
+ # Hybrid search weight configuration (0.0 to 1.0)
53
+ hybrid_weight:
54
+ fulltext: <%= ENV['FULLTEXT_WEIGHT'] || 0.4 %>
55
+ vector: <%= ENV['VECTOR_WEIGHT'] || 0.6 %>
56
+
57
+ # RRF (Reciprocal Rank Fusion) algorithm parameter
58
+ # Higher k values flatten the score differences between ranks, giving lower-ranked results relatively more weight
59
+ rrf_k: <%= ENV['RRF_K'] || 60 %>
60
+
61
+ # Search result caching configuration (enabled unless CACHE_ENABLED is set to 'false')
62
+ cache:
63
+ enabled: <%= ENV['CACHE_ENABLED'] != 'false' %>
64
+ ttl: <%= ENV['CACHE_TTL'] || 3600 %> # Cache TTL in seconds
65
+ # Optional Redis configuration
66
+ redis_url: <%= ENV['REDIS_URL'] %>
67
+
68
+ # Query logging and monitoring
69
+ monitoring:
70
+ log_slow_queries: <%= ENV['LOG_SLOW_QUERIES'] != 'false' %>
71
+ slow_query_threshold_ms: <%= ENV['SLOW_QUERY_THRESHOLD'] || 100 %>
72
+
73
+ # Index maintenance
74
+ index:
75
+ enable_partition: <%= ENV['ENABLE_PARTITION'] != 'false' %>
76
+ auto_vacuum: <%= ENV['AUTO_VACUUM'] != 'false' %>
77
+ # Schedule for automatic index rebuild (cron format)
78
+ rebuild_schedule: <%= ENV['INDEX_REBUILD_SCHEDULE'] %>
79
+
80
+ # Document Chunking Configuration
81
+ chunking:
82
+ # Maximum characters per chunk
83
+ max_chars: <%= ENV['CHUNK_MAX_CHARS'] || 4000 %>
84
+
85
+ # Overlap between chunks to preserve context
86
+ overlap: <%= ENV['CHUNK_OVERLAP'] || 100 %>
87
+
88
+ # Enable splitting by Markdown headers first
89
+ split_by_headers: <%= ENV['SPLIT_BY_HEADERS'] != 'false' %>
90
+
91
+ # Minimum chunk size (discard chunks smaller than this)
92
+ min_chunk_size: <%= ENV['MIN_CHUNK_SIZE'] || 100 %>
93
+
94
+ # Search Configuration
95
+ search:
96
+ # Default number of results to return
97
+ default_limit: <%= ENV['DEFAULT_SEARCH_LIMIT'] || 5 %>
98
+
99
+ # Boost weight for tag matches in vector search (0.0 to 1.0)
100
+ tag_boost_weight: <%= ENV['TAG_BOOST_WEIGHT'] || 0.5 %>
101
+
102
+ # Maximum distance threshold for vector search results
103
+ max_distance_threshold: <%= ENV['MAX_DISTANCE_THRESHOLD'] || 0.3 %>
104
+
105
+ # LLM Configuration for tags and summarization
106
+ llm:
107
+ provider: <%= ENV['LLM_PROVIDER'] || 'openai' %>
108
+ api_key: <%= ENV['LLM_API_KEY'] || 'ollama-local' %>
109
+ endpoint: <%= ENV['LLM_ENDPOINT'] || 'http://localhost:11434/v1/chat/completions' %>
110
+ model: <%= ENV['LLM_MODEL'] || 'qwen3' %>
111
+
112
+ # Generation parameters
113
+ temperature: <%= ENV['LLM_TEMPERATURE'] || 0.3 %>
114
+ max_tokens: <%= ENV['LLM_MAX_TOKENS'] || 2000 %>
115
+ timeout: <%= ENV['LLM_TIMEOUT'] || 60 %>
116
+
117
+ # Document Processing
118
+ document:
119
+ # Supported file types for upload/processing
120
+ supported_types:
121
+ - .md
122
+ - .txt
123
+ - .pdf
124
+ - .docx
125
+ - .html
126
+
127
+ # Maximum file size in MB
128
+ max_file_size: <%= ENV['MAX_FILE_SIZE'] || 50 %>
129
+
130
+ # Download timeout for URLs
131
+ download_timeout: <%= ENV['DOWNLOAD_TIMEOUT'] || 30 %>
132
+
133
+ # Logging Configuration
134
+ logging:
135
+ # Log level: debug, info, warn, error, fatal
136
+ level: <%= ENV['LOG_LEVEL'] || 'info' %>
137
+
138
+ # Log format: json, plain
139
+ format: <%= ENV['LOG_FORMAT'] || 'json' %>
140
+
141
+ # Log file path (null for stdout)
142
+ file_path: <%= ENV['LOG_FILE_PATH'] %>
143
+
144
+ # Enable query logging
145
+ enable_query_log: <%= ENV['ENABLE_QUERY_LOG'] != 'false' %>
146
+
147
+ # Performance Settings
148
+ performance:
149
+ # Thread pool size for parallel processing
150
+ thread_pool_size: <%= ENV['THREAD_POOL_SIZE'] || 4 %>
151
+
152
+ # Batch size for database operations
153
+ db_batch_size: <%= ENV['DB_BATCH_SIZE'] || 100 %>
154
+
155
+ # Enable connection pooling
156
+ connection_pooling: <%= ENV['CONNECTION_POOLING'] != 'false' %>