index_util 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +2 -0
- data/.rubocop.yml +28 -0
- data/CHANGELOG.md +10 -0
- data/LICENSE.txt +21 -0
- data/README.md +134 -0
- data/Rakefile +10 -0
- data/exe/index_util +6 -0
- data/lib/index_util/cli.rb +83 -0
- data/lib/index_util/database.rb +59 -0
- data/lib/index_util/document.rb +77 -0
- data/lib/index_util/fragment.rb +14 -0
- data/lib/index_util/fragment_embedding.rb +21 -0
- data/lib/index_util/fragments_fts.rb +38 -0
- data/lib/index_util/index.rb +210 -0
- data/lib/index_util/index_metadata.rb +23 -0
- data/lib/index_util/migrations/001_create_core_tables.rb +50 -0
- data/lib/index_util/migrations/002_create_fragments_fts.rb +14 -0
- data/lib/index_util/migrations/003_create_fragment_vectors.rb +12 -0
- data/lib/index_util/progress.rb +45 -0
- data/lib/index_util/vector_index.rb +61 -0
- data/lib/index_util/version.rb +5 -0
- data/lib/index_util.rb +18 -0
- metadata +140 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: dce5305bc6964aed62507ac4f5bc18f584cd658ad514ed8dd79348c982f0cc4c
|
|
4
|
+
data.tar.gz: 37e66070cd90e3f0ba284229cdb1e3908604fb82fb55e08b90d527523d15a921
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: d4240cf2c8cabf7e603c95f135982b7ea0cd454db695873b62eda8c5c4380f9b545ee396d47f1e86692266f6eb5b4714a6738e3c33d0ca6ec8f86bf132b71caf
|
|
7
|
+
data.tar.gz: 8b8469fdb26cbd6d8f6dcc438ce494325a88ccdaa88717a9e959487245d390582684997febbd423d09923a95e15d3ebb5f12bf1c94bff4e005dde5537d2f61ba
|
data/.rspec
ADDED
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
AllCops:
|
|
2
|
+
TargetRubyVersion: 3.2
|
|
3
|
+
NewCops: disable
|
|
4
|
+
SuggestExtensions: false
|
|
5
|
+
|
|
6
|
+
Style/StringLiterals:
|
|
7
|
+
EnforcedStyle: double_quotes
|
|
8
|
+
Exclude:
|
|
9
|
+
- "spec/**/*"
|
|
10
|
+
- "index_util.gemspec"
|
|
11
|
+
|
|
12
|
+
Style/FrozenStringLiteralComment:
|
|
13
|
+
Enabled: false
|
|
14
|
+
|
|
15
|
+
Style/Documentation:
|
|
16
|
+
Enabled: false
|
|
17
|
+
|
|
18
|
+
Layout/LineLength:
|
|
19
|
+
Max: 170
|
|
20
|
+
|
|
21
|
+
Metrics/BlockLength:
|
|
22
|
+
Enabled: false
|
|
23
|
+
Metrics/ClassLength:
|
|
24
|
+
Enabled: false
|
|
25
|
+
Metrics/MethodLength:
|
|
26
|
+
Enabled: false
|
|
27
|
+
Metrics/AbcSize:
|
|
28
|
+
Enabled: false
|
data/CHANGELOG.md
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 - 2026-06-10
|
|
4
|
+
|
|
5
|
+
- Initial release of `index_util`.
|
|
6
|
+
- Added subclass-defined indexing hooks for document discovery, content loading, checksums, sections, postprocessing, and query amendments.
|
|
7
|
+
- Added SQLite storage with FTS5 keyword search and `sqlite-vec` vector search.
|
|
8
|
+
- Added `embedding_util` integration for embeddings and reranking.
|
|
9
|
+
- Added CLI support for rebuild, new-only indexing, update indexing, querying, and indexing progress output.
|
|
10
|
+
- Added standalone examples for embedded recipes, Kreuzberg PDF extraction, and RubyAPI method search.
|
data/LICENSE.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 hmdne
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
# index_util
|
|
2
|
+
|
|
3
|
+
Subclass-defined local search indexes for Ruby.
|
|
4
|
+
|
|
5
|
+
`index_util` lets a small Ruby class define where documents come from, how they are split, and how final searchable fragments are produced. The gem stores those fragments in SQLite with FTS5 keyword lookup, `sqlite-vec` vector lookup, and `embedding_util` embeddings/reranking.
|
|
6
|
+
|
|
7
|
+
This gem is in the `0.x` series. The API is intentionally unstable until `1.0`, and public method names, configuration options, return shapes, and default profiles may change between minor releases.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
Add the gem to your Gemfile:
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
gem "index_util"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Then install dependencies:
|
|
18
|
+
|
|
19
|
+
```sh
|
|
20
|
+
bundle install
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```ruby
|
|
26
|
+
#!/usr/bin/env ruby
|
|
27
|
+
|
|
28
|
+
require "index_util"
|
|
29
|
+
|
|
30
|
+
class MyIndex < IndexUtil::Index
|
|
31
|
+
def database_file
|
|
32
|
+
"myindex.sqlite3"
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def document_list
|
|
36
|
+
Dir["docs/**/*.md"]
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
def document_sections(_document, content)
|
|
40
|
+
content.split(/^## /).each_with_index.to_h { |section, index| [index, section] }
|
|
41
|
+
end
|
|
42
|
+
end
|
|
43
|
+
|
|
44
|
+
MyIndex.cli if $PROGRAM_NAME == __FILE__
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Build and query:
|
|
48
|
+
|
|
49
|
+
```sh
|
|
50
|
+
./myindex.rb --index
|
|
51
|
+
./myindex.rb "how do I split text?" --limit 5
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## CLI
|
|
55
|
+
|
|
56
|
+
Subclass scripts can call `MyIndex.cli` and use:
|
|
57
|
+
|
|
58
|
+
```sh
|
|
59
|
+
./myindex.rb --index
|
|
60
|
+
./myindex.rb --index-new
|
|
61
|
+
./myindex.rb --index-update
|
|
62
|
+
./myindex.rb "query text" --limit 5
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
The installed executable can load a class explicitly:
|
|
66
|
+
|
|
67
|
+
```sh
|
|
68
|
+
index_util --require ./myindex.rb --class MyIndex index
|
|
69
|
+
index_util --require ./myindex.rb --class MyIndex query "query text" --limit 5
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
When a script should support both direct execution and installed CLI loading, guard the class CLI call with `if $PROGRAM_NAME == __FILE__`.
|
|
73
|
+
|
|
74
|
+
Indexing commands print one-line progress to stderr while they list, load, embed, and store documents. Successful queries print a JSON array with `document` and `content`. Failures print compact JSON to stderr and exit non-zero.
|
|
75
|
+
|
|
76
|
+
## Examples
|
|
77
|
+
|
|
78
|
+
Examples live under `examples/`. Each example is a small standalone directory with its own Gemfile pointing back to this checkout.
|
|
79
|
+
|
|
80
|
+
Recipe-card example:
|
|
81
|
+
|
|
82
|
+
```sh
|
|
83
|
+
cd examples/recipes
|
|
84
|
+
bundle install
|
|
85
|
+
bundle exec ruby recipes.rb --index
|
|
86
|
+
bundle exec ruby recipes.rb "quick vegetarian dinner" --limit 3
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Kreuzberg PDF example:
|
|
90
|
+
|
|
91
|
+
```sh
|
|
92
|
+
cd examples/pdf_notes
|
|
93
|
+
bundle install
|
|
94
|
+
bundle exec ruby pdf_notes.rb --index
|
|
95
|
+
bundle exec ruby pdf_notes.rb "how should a risk assessment be conducted?" --limit 3
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Ruby API example:
|
|
99
|
+
|
|
100
|
+
```sh
|
|
101
|
+
cd examples/ruby_api
|
|
102
|
+
bundle install
|
|
103
|
+
bundle exec ruby ruby_api.rb --index
|
|
104
|
+
bundle exec ruby ruby_api.rb "how to split a string" --limit 5
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## API
|
|
108
|
+
|
|
109
|
+
Subclass `IndexUtil::Index` and define:
|
|
110
|
+
|
|
111
|
+
- `database_file` returns the SQLite path.
|
|
112
|
+
- `document_list` returns source documents.
|
|
113
|
+
- `document_content(document)` defaults to `File.read(document)`.
|
|
114
|
+
- `document_checksum(document, content)` defaults to stable SHA-256 over `content.to_s`.
|
|
115
|
+
- `document_sections(document, content)` defaults to `{ nil => content }`.
|
|
116
|
+
- `document_postprocess(fragment_document, content)` defaults to identity and must return a `String`.
|
|
117
|
+
- `query_amendments(query)` defaults to `{}` and returns extra `{document_id => content}` candidates.
|
|
118
|
+
|
|
119
|
+
Fragment document ids are built as `document` for nil sections or `document#section_id` otherwise. `#` is not escaped.
|
|
120
|
+
|
|
121
|
+
## Development
|
|
122
|
+
|
|
123
|
+
```sh
|
|
124
|
+
bundle install
|
|
125
|
+
bundle exec rake
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Contributing
|
|
129
|
+
|
|
130
|
+
Bug reports and pull requests are welcome at `https://github.com/rbutils/index_util`.
|
|
131
|
+
|
|
132
|
+
## License
|
|
133
|
+
|
|
134
|
+
The gem is available as open source under the terms of the MIT License.
|
data/Rakefile
ADDED
data/exe/index_util
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "thor"
|
|
5
|
+
|
|
6
|
+
module IndexUtil
|
|
7
|
+
class CLI < Thor
|
|
8
|
+
class_option :require, type: :string, aliases: "-r", desc: "Ruby file that defines the index class"
|
|
9
|
+
class_option :class, type: :string, aliases: "-c", desc: "Index class name to instantiate"
|
|
10
|
+
class_option :verbose, type: :boolean, default: false
|
|
11
|
+
|
|
12
|
+
# Thor command: rebuild the index from scratch.
desc "index", "Rebuild the index from scratch"
def index
  index_instance.index_all!(progress: progress)
end

# Thor command: index only documents that are not stored yet.
desc "index-new", "Index only new documents"
def index_new
  index_instance.index_new!(progress: progress)
end

# Thor command: re-index changed documents and drop stale ones.
desc "index-update", "Reconcile changed and stale documents"
def index_update
  index_instance.index_update!(progress: progress)
end

# Thor command: run a query and print the results to stdout as pretty JSON.
desc "query QUERY", "Search the index"
option :limit, type: :numeric, default: Index::DEFAULT_LIMIT
def query(query)
  results = index_instance.query(query, limit: options[:limit], verbose: options[:verbose])
  puts JSON.pretty_generate(results)
end
|
|
32
|
+
|
|
33
|
+
# Thor hook: make usage errors that Thor reports itself (unknown command,
# wrong argument count, ...) terminate the process with a non-zero status.
# Without this definition Thor prints a deprecation warning and exits 0.
def self.exit_on_failure?
  true
end

# Entry point for both the installed executable and subclass scripts.
#
# @param argv [Array<String>] raw command-line arguments
# @param index_class [Class, nil] index class to instantiate; when nil the
#   class is resolved later from --class / INDEX_UTIL_CLASS
#
# Any StandardError is reported on stderr as one line of compact JSON
# (followed by the backtrace when "--verbose" is present in argv) and the
# process exits with status 1.
def self.run(argv = ARGV, index_class: nil)
  if index_class
    run_for_class(index_class, argv)
  else
    start(normalize_legacy_args(argv))
  end
rescue StandardError => e
  warn JSON.generate(error: e.class.name, message: e.message)
  warn e.backtrace.join("\n") if argv.include?("--verbose") && e.backtrace
  exit 1
end
|
|
44
|
+
|
|
45
|
+
# Runs the CLI against a freshly instantiated +index_class+, after translating
# legacy flags in +argv+ into Thor commands. The instance is handed to the
# command object through the start config under :index_instance.
def self.run_for_class(index_class, argv)
  start(normalize_legacy_args(argv), index_instance: index_class.new)
end
|
|
49
|
+
|
|
50
|
+
# Translates the supported command-line shapes into a Thor argument list that
# starts with a command name.
#
# * Legacy flags (--index, --index-new, --index-update) become the matching
#   command; the flag itself is removed and the other arguments are kept.
# * An argument list that already starts with a command is returned as is.
# * Class options given BEFORE a command
#   (e.g. "--require ./x.rb --class MyIndex index") have the command moved to
#   the front. Previously "query" was prepended here, which turned the real
#   command into query text.
# * Anything else is treated as a query and gets "query" prepended.
#
# @param argv [Array<String>] raw arguments; not mutated
# @return [Array<String>]
def self.normalize_legacy_args(argv)
  args = argv.dup
  legacy_commands = { "--index" => "index", "--index-new" => "index-new", "--index-update" => "index-update" }
  legacy_flag = legacy_commands.keys.find { |flag| args.include?(flag) }
  return [legacy_commands.fetch(legacy_flag), *args.reject { |arg| arg == legacy_flag }] if legacy_flag

  commands = %w[index index-new index-update query]
  return args if args.empty? || commands.include?(args.first)

  skipped = leading_class_option_count(args)
  if skipped.positive? && commands.include?(args[skipped])
    return [args[skipped], *args.first(skipped), *args.drop(skipped + 1)]
  end

  ["query", *args]
end

# Counts the leading tokens of +args+ that belong to the class-level options
# (--require/-r, --class/-c and their values, --verbose/--no-verbose).
# Command-specific options such as --limit are deliberately not skipped, so a
# query word that happens to equal a command name is still treated as a query.
def self.leading_class_option_count(args)
  count = 0
  while count < args.length
    token = args[count]
    if %w[--require -r --class -c].include?(token)
      count += 2 # option plus its value
    elsif %w[--verbose --no-verbose].include?(token) || token.start_with?("--require=", "--class=")
      count += 1
    else
      break
    end
  end
  count
end
|
|
59
|
+
|
|
60
|
+
no_commands do
  # Keeps the index instance passed through the start config (see
  # run_for_class), if any, in addition to Thor's own initialization.
  def initialize(args = [], local_options = {}, config = {})
    super
    @provided_index_instance = config[:index_instance]
  end

  # Returns the index to operate on: the instance supplied by the caller, or
  # an instance of the class named by --class / INDEX_UTIL_CLASS (built once
  # and reused). The file given with --require is loaded first.
  #
  # Raises Error when no class name is available.
  def index_instance
    return @provided_index_instance if @provided_index_instance
    return @index_instance if @index_instance

    source_file = options[:require]
    require source_file if source_file
    class_name = options[:class] || ENV.fetch("INDEX_UTIL_CLASS", nil)
    raise Error, "Pass --class INDEX_CLASS or set INDEX_UTIL_CLASS" unless class_name

    # Resolve "A::B::C" one namespace at a time starting from Object.
    index_class = class_name.split("::").inject(Object) { |scope, name| scope.const_get(name) }
    @index_instance = index_class.new
  end

  # Progress reporter writing to stderr, created on first use.
  def progress
    @progress ||= Progress.new($stderr)
  end
end
|
|
82
|
+
end
|
|
83
|
+
end
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "sequel"
|
|
5
|
+
require "sequel/extensions/migration"
|
|
6
|
+
require "sqlite3"
|
|
7
|
+
|
|
8
|
+
module IndexUtil
  # Opens, configures and migrates the SQLite database behind an index.
  module Database
    module_function

    # Opens +database_file+, applies the connection settings and runs pending
    # migrations.
    #
    # The parent directory is created when missing, matching what rebuild!
    # already did, so opening a fresh index in a new directory works for every
    # entry point. If configuration or migration fails the connection is
    # closed before the error is re-raised.
    #
    # @return the Sequel database handle
    # @raise [Error] when FTS5 is unavailable or the migration state is unsupported
    def connect(database_file)
      close_database_files(database_file)
      db = Sequel.sqlite(database_file)
      begin
        configure!(db)
        migrate!(db)
      rescue StandardError
        db.disconnect
        raise
      end
      db
    end

    # Removes the database file and its "-wal"/"-shm" companions, then opens a
    # fresh database via connect. A nil path has no files to remove.
    def rebuild!(database_file)
      close_database_files(database_file)
      ["", "-wal", "-shm"].each { |suffix| FileUtils.rm_f("#{database_file}#{suffix}") } if database_file
      connect(database_file)
    end

    # Applies the PRAGMAs, checks that FTS5 is usable and asks VectorIndex to
    # load itself into the connection.
    #
    # @return the same +db+
    def configure!(db)
      db.run "PRAGMA foreign_keys = ON"
      db.run "PRAGMA journal_mode = WAL"
      db.run "PRAGMA synchronous = NORMAL"
      verify_fts5!(db)
      VectorIndex.load!(db)
      db
    end

    # Runs the bundled migrations; migrator failures are re-raised as Error.
    def migrate!(db)
      Sequel::Migrator.run(db, migrations_path)
    rescue Sequel::Migrator::Error => e
      raise Error, "Unsupported index schema migration state: #{e.message}"
    end

    # Absolute path of the "migrations" directory next to this file.
    def migrations_path
      File.expand_path("migrations", __dir__)
    end

    # Probes FTS5 by creating and dropping a temporary virtual table.
    #
    # @raise [Error] when the probe fails
    def verify_fts5!(db)
      db.run "CREATE VIRTUAL TABLE temp.index_util_fts5_check USING fts5(content)"
      db.run "DROP TABLE temp.index_util_fts5_check"
    rescue Sequel::DatabaseError => e
      raise Error, "SQLite FTS5 support is required by index_util: #{e.message}"
    end

    # Despite its name this only makes sure the directory that will contain
    # +database_file+ exists; it does nothing for a nil path.
    def close_database_files(database_file)
      return unless database_file

      FileUtils.mkdir_p(File.dirname(File.expand_path(database_file)))
    end
  end
end
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "time"
|
|
4
|
+
|
|
5
|
+
module IndexUtil
  # Persistence helpers for the +documents+ table and the rows derived from a
  # document: +fragments+, +fragment_embeddings+, the FTS index and the vector
  # index. Every method takes the Sequel database handle as first argument.
  module Document
    module_function

    # @return [Boolean] true when a row for +document+ is stored
    def exist?(db, document)
      db[:documents].where(document: document.to_s).count.positive?
    end

    # @return [Boolean] true when +document+ is stored with exactly +checksum+
    def unchanged?(db, document, checksum)
      db[:documents].where(document: document.to_s, checksum: checksum.to_s).count.positive?
    end

    # Deletes every stored document that is not listed in +current_documents+.
    # An empty list means nothing is current, so all stored documents go.
    #
    # All deletions happen in one transaction so a failure cannot leave some
    # stale documents half removed.
    def delete_stale(db, current_documents)
      current = current_documents.map(&:to_s)
      stale = db[:documents]
      stale = stale.exclude(document: current) unless current.empty?
      db.transaction do
        stale.all.each { |row| delete(db, row.fetch(:document)) }
      end
    end

    # Removes +document+ together with its FTS and vector index entries.
    # Does nothing (and returns nil) when the document is not stored.
    #
    # Wrapped in a transaction so the index entries and the table rows are
    # removed together; when a transaction is already open (delete_stale,
    # replace_with_fragments) the call joins it.
    def delete(db, document)
      db.transaction do
        row = db[:documents].where(document: document.to_s).first
        next unless row

        db[:fragments].where(document_id: row.fetch(:id)).all.each do |fragment|
          FragmentsFts.delete(db, fragment)
          VectorIndex.delete(db, fragment.fetch(:id))
        end
        # The fragments/fragment_embeddings rows themselves are not deleted
        # here; the schema declares ON DELETE CASCADE on their foreign keys.
        db[:documents].where(id: row.fetch(:id)).delete
      end
    end

    # Replaces +document+ and everything derived from it in one transaction.
    #
    # @param attributes [Hash] requires :document, :checksum, :fragments
    #   (hashes with :section_id, :fragment_document, :content), :embeddings
    #   (one vector per fragment, same order) and :profile
    # @raise [Error] when fragments and embeddings differ in length
    # @raise [KeyError] when a required key is missing
    def replace_with_fragments(db, attributes)
      document = attributes.fetch(:document)
      checksum = attributes.fetch(:checksum)
      fragments = attributes.fetch(:fragments)
      embeddings = attributes.fetch(:embeddings)
      profile = attributes.fetch(:profile)
      raise Error, "fragment and embedding counts differ" unless fragments.length == embeddings.length

      db.transaction do
        delete(db, document)
        now = timestamp
        document_id = db[:documents].insert(document: document.to_s, checksum: checksum.to_s, indexed_at: now, created_at: now, updated_at: now)

        fragments.zip(embeddings).each do |fragment, embedding|
          fragment_id = db[:fragments].insert(
            document_id: document_id,
            document: document.to_s,
            section_id: fragment.fetch(:section_id),
            fragment_document: fragment.fetch(:fragment_document),
            content: fragment.fetch(:content),
            created_at: now,
            updated_at: now
          )
          # Re-read the row so the FTS index receives exactly what was stored.
          stored_fragment = db[:fragments].where(id: fragment_id).first
          db[:fragment_embeddings].insert(
            fragment_id: fragment_id,
            profile: profile.to_s,
            dimensions: embedding.length,
            vector: FragmentEmbedding.pack(embedding),
            vector_norm: FragmentEmbedding.norm(embedding),
            created_at: now
          )
          FragmentsFts.insert(db, stored_fragment)
          VectorIndex.insert(db, fragment_id, embedding)
        end
      end
    end

    # Current UTC time as an ISO 8601 string.
    def timestamp
      Time.now.utc.iso8601
    end
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module IndexUtil
  # Naming helper for fragments.
  module Fragment
    module_function

    # Builds the identifier of a fragment: the document name alone when
    # +section_id+ is nil, otherwise "document#section_id". Neither part is
    # escaped.
    #
    # @return [String]
    def fragment_document(document, section_id)
      base = document.to_s
      section_id.nil? ? base : "#{base}##{section_id}"
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "sequel"
|
|
4
|
+
|
|
5
|
+
module IndexUtil
  # Conversions between embedding vectors and their stored blob form.
  module FragmentEmbedding
    module_function

    # Packs +vector+ as consecutive native-endian single-precision floats and
    # wraps the bytes in a Sequel blob.
    def pack(vector)
      floats = Array(vector).map(&:to_f)
      Sequel.blob(floats.pack("f*"))
    end

    # Inverse of pack: decodes the bytes back into an array of Floats.
    def unpack(blob)
      blob.unpack("f*")
    end

    # Euclidean length of +vector+.
    #
    # @return [Float]
    def norm(vector)
      floats = Array(vector).map(&:to_f)
      Math.sqrt(floats.sum { |component| component * component })
    end
  end
end
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module IndexUtil
  # Maintenance and querying of the +fragments_fts+ FTS5 table.
  module FragmentsFts
    module_function

    # Adds a fragment row (needs :id, :fragment_document, :content) to the
    # full-text index, using the fragment id as rowid.
    def insert(db, fragment)
      db[
        "INSERT INTO fragments_fts(rowid, fragment_document, content) VALUES (?, ?, ?)",
        fragment.fetch(:id),
        fragment.fetch(:fragment_document),
        fragment.fetch(:content)
      ].insert
    end

    # Removes a fragment from the full-text index by issuing the FTS5
    # 'delete' command with the fragment's id and indexed column values.
    def delete(db, fragment)
      db[
        "INSERT INTO fragments_fts(fragments_fts, rowid, fragment_document, content) VALUES ('delete', ?, ?, ?)",
        fragment.fetch(:id),
        fragment.fetch(:fragment_document),
        fragment.fetch(:content)
      ].insert
    end

    # Keyword search ordered by bm25 score.
    #
    # The query text is first used verbatim as an FTS5 MATCH expression. When
    # the database rejects it (natural-language input such as
    # "how do I split text?" is not valid FTS5 syntax) the search is retried
    # with every word quoted as a literal term instead of giving up with no
    # results. Search stays best-effort: if the retry fails too, or the text
    # has no words, an empty array is returned.
    #
    # @param limit [Integer, String] maximum number of rows; converted with Integer()
    # @return [Array<Hash>] rows with :id, :fragment_document, :content, :score
    def search(db, query, limit:)
      match = query.to_s
      row_limit = Integer(limit)
      begin
        run_match(db, match, row_limit)
      rescue Sequel::DatabaseError
        literal = literal_match(match)
        literal && literal != match ? run_match(db, literal, row_limit) : []
      end
    rescue Sequel::DatabaseError
      []
    end

    # Executes the MATCH query and returns all rows.
    def run_match(db, match, row_limit)
      db.fetch(<<~SQL, match, row_limit).all
        SELECT fragments.id, fragments.fragment_document, fragments.content, bm25(fragments_fts) AS score
        FROM fragments_fts
        JOIN fragments ON fragments.id = fragments_fts.rowid
        WHERE fragments_fts MATCH ?
        ORDER BY score
        LIMIT ?
      SQL
    end

    # Turns free text into a MATCH expression of double-quoted words, e.g.
    # 'split text?' => '"split" "text"'. Returns nil when there are no words.
    def literal_match(text)
      words = text.scan(/[[:alnum:]_]+/)
      return nil if words.empty?

      words.map { |word| %("#{word}") }.join(" ")
    end

    private_class_method :run_match, :literal_match
  end
end
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "digest"
|
|
4
|
+
require "json"
|
|
5
|
+
require "time"
|
|
6
|
+
|
|
7
|
+
require "embedding_util"
|
|
8
|
+
|
|
9
|
+
module IndexUtil
  # Base class for subclass-defined search indexes.
  #
  # Subclasses must define #database_file and #document_list and may override
  # the other document_* hooks and #query_amendments. The public operations
  # are #index_all!, #index_new!, #index_update! and #query.
  class Index
    # Maximum number of texts sent to one EmbeddingUtil.embed_many call.
    EMBED_MANY_LIMIT = 1000
    # Number of results returned by #query when no limit is given.
    DEFAULT_LIMIT = 5

    class << self
      # Runs the command-line interface for this index class.
      def cli(argv = ARGV)
        CLI.run(argv, index_class: self)
      end
    end

    # Hook: path of the SQLite database. Must be overridden.
    def database_file
      raise Error, "#{self.class} must define #database_file"
    end

    # Hook: the source documents to index. Must be overridden.
    def document_list
      raise Error, "#{self.class} must define #document_list"
    end

    # Hook: content of +document+; by default the file at that path.
    def document_content(document)
      File.read(document.to_s)
    end

    # Hook: checksum used to detect changes; by default SHA-256 of the content.
    def document_checksum(_document, content)
      Digest::SHA256.hexdigest(content.to_s)
    end

    # Hook: section_id => content pairs; by default one section with id nil.
    def document_sections(_document, content)
      { nil => content }
    end

    # Hook: final text of a fragment; must return a String.
    def document_postprocess(_fragment_document, content)
      content
    end

    # Hook: extra document_id => content candidates added to a query.
    def query_amendments(_query)
      {}
    end

    # Rebuilds the database from scratch and indexes every document.
    def index_all!(progress: nil)
      progress&.step("Preparing index", detail: "rebuilding #{database_file}")
      db = Database.rebuild!(database_file)
      documents = list_documents(progress)
      index_documents(db, documents, mode: :all, progress: progress)
    ensure
      db&.disconnect
    end

    # Indexes only documents that are not stored yet.
    def index_new!(progress: nil)
      progress&.step("Preparing index", detail: "opening #{database_file}")
      db = Database.connect(database_file)
      documents = list_documents(progress)
      index_documents(db, documents, mode: :new, progress: progress)
    ensure
      db&.disconnect
    end

    # Removes documents that are no longer listed and re-indexes those whose
    # checksum changed.
    def index_update!(progress: nil)
      progress&.step("Preparing index", detail: "opening #{database_file}")
      db = Database.connect(database_file)
      documents = list_documents(progress)
      progress&.step("Updating index", current: 0, total: documents.length, detail: "removing stale documents")
      Document.delete_stale(db, documents)
      index_documents(db, documents, mode: :update, progress: progress)
    ensure
      db&.disconnect
    end

    # Searches the index.
    #
    # Candidates come from the vector index, the keyword index and
    # #query_amendments (up to max(limit * 10, 50) from each index), are
    # merged by fragment id and then reranked by EmbeddingUtil.
    #
    # @return [Array<Hash>] at most +limit+ hashes with :document and
    #   :content, plus :sources and the scores when +verbose+ is true
    def query(query, limit: DEFAULT_LIMIT, verbose: false)
      db = Database.connect(database_file)
      limit = Integer(limit)
      candidate_limit = [limit * 10, 50].max
      query_embedding = EmbeddingUtil.embed(query.to_s)
      vector_candidates = VectorIndex.search(db, query_embedding, limit: candidate_limit)
      keyword_candidates = FragmentsFts.search(db, query.to_s, limit: candidate_limit)
      amendments = query_amendments(query.to_s)
      candidates = merge_candidates(vector_candidates, keyword_candidates, amendments)
      rerank(query.to_s, candidates, limit: limit, verbose: verbose)
    ensure
      db&.disconnect
    end

    private

    # Materializes #document_list and reports how many documents were found.
    def list_documents(progress)
      progress&.step("Listing documents")
      documents = document_list.to_a
      progress&.step("Listing documents", current: documents.length, total: documents.length, detail: "found #{documents.length}")
      documents
    end

    # Indexes +documents+ one by one and reports a summary at the end.
    # +mode+ is :all, :new (skip stored documents) or :update (skip documents
    # with an unchanged checksum).
    def index_documents(db, documents, mode:, progress:)
      stats = { indexed: 0, skipped: 0 }
      total = documents.length
      documents.each_with_index do |document, index|
        report = ->(detail) { progress&.step("Indexing documents", current: index + 1, total: total, detail: detail) }
        stats[index_document(db, document, mode: mode, report: report)] += 1
      end
      progress&.finish("Indexing complete: #{stats.fetch(:indexed)} indexed, #{stats.fetch(:skipped)} skipped, #{total} total")
    end

    # Indexes a single document; returns :indexed or :skipped.
    def index_document(db, document, mode:, report:)
      document_key = document.to_s
      report.call("checking #{document_key}")
      if mode == :new && Document.exist?(db, document_key)
        report.call("skipped existing #{document_key}")
        return :skipped
      end

      report.call("loading #{document_key}")
      content = document_content(document)
      checksum = document_checksum(document, content).to_s
      if mode == :update && Document.unchanged?(db, document_key, checksum)
        report.call("skipped unchanged #{document_key}")
        return :skipped
      end

      report.call("sectioning #{document_key}")
      fragments = build_fragments(document, content)
      report.call("embedding #{fragments.length} fragments from #{document_key}")
      embeddings = embed_fragments(fragments)
      # A document may yield no sections at all. There is then nothing to
      # embed, so the vector index check is skipped and the document is stored
      # with its checksum only instead of aborting the whole run.
      ensure_vector_index(db, embeddings) unless fragments.empty?
      report.call("storing #{document_key}")
      Document.replace_with_fragments(db, { document: document_key, checksum: checksum, fragments: fragments, embeddings: embeddings, profile: embedding_profile })
      report.call("indexed #{document_key}")
      :indexed
    end

    # Turns the sections of a document into fragment hashes
    # (:section_id, :fragment_document, :content).
    def build_fragments(document, content)
      sections = document_sections(document, content)
      raise Error, "document_sections must return a hash-like object of section_id => content" unless sections.respond_to?(:each_pair)

      sections.each_pair.map do |section_id, section_content|
        section_key = section_id&.to_s
        fragment_document = Fragment.fragment_document(document, section_key)
        processed = document_postprocess(fragment_document, section_content)
        raise Error, "document_postprocess must return a String for #{fragment_document}" unless processed.is_a?(String)

        { section_id: section_key, fragment_document: fragment_document, content: processed }
      end
    end

    # Embeds the fragment contents in batches of at most EMBED_MANY_LIMIT.
    def embed_fragments(fragments)
      fragments.each_slice(EMBED_MANY_LIMIT).flat_map do |batch|
        EmbeddingUtil.embed_many(batch.map { |fragment| fragment.fetch(:content) })
      end
    end

    # Makes sure the vector index exists for the dimension of +embeddings+.
    def ensure_vector_index(db, embeddings)
      first = embeddings.first
      raise Error, "EmbeddingUtil returned no embeddings" unless first

      VectorIndex.ensure!(db, dimensions: first.length)
    end

    # Name of the active embedding profile, or "default" when it cannot be
    # determined.
    def embedding_profile
      profile = EmbeddingUtil.configuration.resolved_profile
      profile.respond_to?(:name) ? profile.name : profile.to_s
    rescue StandardError
      "default"
    end

    # Merges the candidate lists into one entry per fragment id, recording
    # which sources produced it and the score each index reported.
    def merge_candidates(vector_candidates, keyword_candidates, amendments)
      merged = {}
      vector_candidates.each do |row|
        entry = candidate_entry(merged, row.fetch(:fragment_document), row.fetch(:content))
        entry[:sources] << "vector"
        entry[:vector_score] = row[:distance]
      end
      keyword_candidates.each do |row|
        entry = candidate_entry(merged, row.fetch(:fragment_document), row.fetch(:content))
        entry[:sources] << "keyword"
        entry[:keyword_score] = row[:score]
      end
      amendments.each_pair do |document, content|
        candidate_entry(merged, document.to_s, content.to_s)[:sources] << "amendment"
      end
      merged.values
    end

    # Returns the entry for +key+, creating it with +content+ on first sight.
    def candidate_entry(merged, key, content)
      merged[key] ||= { document: key, content: content, sources: [], vector_score: nil, keyword_score: nil }
    end

    # Reranks +candidates+ with EmbeddingUtil and shapes the top +limit+
    # results for output.
    def rerank(query, candidates, limit:, verbose:)
      return [] if candidates.empty?

      documents = candidates.map { |candidate| candidate.fetch(:content) }
      ranked = EmbeddingUtil.rerank(query, documents)
      ranked.first(limit).map do |result|
        candidate = candidates.fetch(result.index)
        output = { document: candidate.fetch(:document), content: candidate.fetch(:content) }
        next output unless verbose

        output.merge(
          sources: candidate.fetch(:sources).uniq,
          vector_score: candidate[:vector_score],
          keyword_score: candidate[:keyword_score],
          reranker_score: result.score
        )
      end
    end
  end
end
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "time"
|
|
4
|
+
|
|
5
|
+
module IndexUtil
  # Key/value access to the +index_metadata+ table.
  module IndexMetadata
    module_function

    # Returns the stored value for +key+ (keys are compared as strings).
    def get(db, key)
      entry = db[:index_metadata].where(key: key.to_s)
      entry.get(:value)
    end

    # Stores +value+ (as a string) under +key+; an existing key has its value
    # and updated_at overwritten.
    def set(db, key, value)
      stamp = timestamp
      text = value.to_s
      upsert = db[:index_metadata].insert_conflict(target: :key, update: { value: text, updated_at: stamp })
      upsert.insert({ key: key.to_s, value: text, updated_at: stamp })
    end

    # Current UTC time as an ISO 8601 string.
    def timestamp
      Time.now.utc.iso8601
    end
  end
end
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Creates the core tables: index_metadata, documents, fragments and
# fragment_embeddings. Timestamps are stored as strings.
Sequel.migration do
  change do
    # Free-form key/value settings of the index.
    create_table(:index_metadata) do
      String :key, primary_key: true
      String :value, null: false
      String :updated_at, null: false
    end

    # One row per indexed source document.
    create_table(:documents) do
      primary_key :id
      String :document, null: false, unique: true
      String :checksum, null: false
      String :indexed_at, null: false
      String :created_at, null: false
      String :updated_at, null: false

      index :document, unique: true
      index :checksum
    end

    # Searchable pieces of a document; removed together with their document.
    create_table(:fragments) do
      primary_key :id
      foreign_key :document_id, :documents, null: false, on_delete: :cascade
      String :document, null: false
      String :section_id
      String :fragment_document, null: false, unique: true
      String :content, text: true, null: false
      String :created_at, null: false
      String :updated_at, null: false

      index :document_id
      index :document
      index :fragment_document, unique: true
    end

    # One embedding per fragment and profile; removed together with the fragment.
    create_table(:fragment_embeddings) do
      foreign_key :fragment_id, :fragments, null: false, on_delete: :cascade
      String :profile, null: false
      Integer :dimensions, null: false
      File :vector, null: false
      Float :vector_norm
      String :created_at, null: false

      primary_key %i[fragment_id profile]
      index :profile
    end
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Adds the FTS5 virtual table used for full-text search over fragments.
# The table is declared with content='fragments' and content_rowid='id',
# i.e. it refers to the +fragments+ table for its content.
Sequel.migration do
  up do
    create_fts_table = <<~SQL
      CREATE VIRTUAL TABLE fragments_fts
      USING fts5(fragment_document, content, content='fragments', content_rowid='id')
    SQL
    run create_fts_table
  end

  down { run "DROP TABLE IF EXISTS fragments_fts" }
end
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Placeholder migration for the sqlite-vec table. Nothing is created on the
# way up; rolling back drops the +fragment_vectors+ table if it exists.
Sequel.migration do
  up do
    # Intentionally empty: the sqlite-vec table depends on the active
    # embedding dimension, so IndexUtil::VectorIndex creates it when the
    # first embeddings are stored.
  end

  down { run "DROP TABLE IF EXISTS fragment_vectors" }
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module IndexUtil
  # Single-line progress reporter. Each update starts with a carriage return
  # so it overwrites the previous one, and is padded with spaces when it is
  # shorter than the line it replaces.
  class Progress
    # @param io [#print, #puts, #flush] destination stream (defaults to $stderr)
    # @param width [Integer] number of cells inside the progress bar
    def initialize(io = $stderr, width: 24)
      @io = io
      @width = width
      @last_length = 0
    end

    # Renders one progress update in place.
    #
    # @param message [String] leading text
    # @param current [Numeric, nil] items processed so far
    # @param total [Numeric, nil] total items; the bar and the "current/total"
    #   counter are shown only when both +current+ and +total+ are given
    # @param detail [String, nil] optional trailing text, skipped when empty
    def step(message, current: nil, total: nil, detail: nil)
      write(format_line(message, current: current, total: total, detail: detail))
    end

    # Writes a final message, ends the line and resets the padding state.
    #
    # @param message [String] text to leave on the finished line
    def finish(message)
      write(message)
      @io.puts
      @last_length = 0
    end

    private

    # Assembles the visible segments of a progress line, separated by spaces.
    def format_line(message, current:, total:, detail:)
      segments = [message]
      segments.push(bar(current, total), "#{current}/#{total}") if current && total
      segments.push(detail) unless !detail || detail.empty?
      segments.join(" ")
    end

    # Draws the bar; a zero total is rendered as a completely filled bar.
    def bar(current, total)
      fraction = total.zero? ? 1.0 : current.to_f / total
      done = (fraction.clamp(0.0, 1.0) * @width).round

      format("[%s%s]", "=" * done, "." * (@width - done))
    end

    # Prints +line+ over the previous one, blanking any leftover characters.
    def write(line)
      overhang = (@last_length - line.length).clamp(0..)
      @io.print("\r#{line}#{' ' * overhang}")
      @io.flush
      @last_length = line.length
    end
  end
end
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module IndexUtil
  # Stateless helpers around the sqlite-vec backed +fragment_vectors+ virtual
  # table: loading the extension, creating the table, and inserting, deleting
  # and searching vectors.
  module VectorIndex
    module_function

    # Loads the sqlite-vec extension into the connection yielded by
    # +db.synchronize+ and returns the result of selecting +vec_version()+.
    #
    # @param db [Sequel::Database]
    # @return [Object] value returned by +vec_version()+
    # @raise [IndexUtil::Error] when the sqlite_vec gem cannot be required or
    #   the extension cannot be loaded or queried
    def load!(db)
      require "sqlite_vec"
      db.synchronize do |connection|
        connection.enable_load_extension(true)
        begin
          SqliteVec.load(connection)
        ensure
          # Switch extension loading back off even when the load itself
          # fails, so the connection is never left accepting extensions.
          connection.enable_load_extension(false)
        end
      end
      db.get(Sequel.lit("vec_version()"))
    rescue LoadError => e
      # LoadError is not a StandardError, so it needs its own clause.
      raise Error, "sqlite-vec gem is required by index_util: #{e.message}"
    rescue StandardError => e
      raise Error, "sqlite-vec support is required by index_util: #{e.message}"
    end

    # Creates the +fragment_vectors+ table for the given dimension if it does
    # not exist yet, and records the dimension in the index metadata.
    #
    # @param db [Sequel::Database]
    # @param dimensions [Integer, String] embedding size, coerced with Integer()
    # @raise [IndexUtil::Error] when a different dimension is already recorded
    #   or the table cannot be created
    def ensure!(db, dimensions:)
      # Integer() guarantees the value interpolated into the SQL below is numeric.
      dimensions = Integer(dimensions)
      stored_dimensions = IndexMetadata.get(db, :embedding_dimensions)
      if stored_dimensions && stored_dimensions.to_i != dimensions
        raise Error, "Embedding dimensions changed from #{stored_dimensions} to #{dimensions}; rebuild the index with --index"
      end

      db.run <<~SQL
        CREATE VIRTUAL TABLE IF NOT EXISTS fragment_vectors USING vec0(
          fragment_id integer primary key,
          embedding float[#{dimensions}]
        )
      SQL
      IndexMetadata.set(db, :embedding_dimensions, dimensions)
    rescue Sequel::DatabaseError => e
      raise Error, "Failed to create sqlite-vec table: #{e.message}"
    end

    # Inserts one vector row for a fragment.
    #
    # @param db [Sequel::Database]
    # @param fragment_id [Integer, String] coerced with Integer()
    # @param vector [Object] embedding passed through FragmentEmbedding.pack
    def insert(db, fragment_id, vector)
      db["INSERT INTO fragment_vectors(fragment_id, embedding) VALUES (?, ?)", Integer(fragment_id), FragmentEmbedding.pack(vector)].insert
    end

    # Deletes the vector row of a fragment. Database errors are deliberately
    # swallowed and reported as nil.
    #
    # @param db [Sequel::Database]
    # @param fragment_id [Integer, String] coerced with Integer()
    # @return [Object, nil] result of the dataset delete, or nil on a
    #   Sequel::DatabaseError
    def delete(db, fragment_id)
      db[:fragment_vectors].where(fragment_id: Integer(fragment_id)).delete
    rescue Sequel::DatabaseError
      nil
    end

    # Runs a vector MATCH query against +fragment_vectors+, joined with
    # +fragments+, ordered by distance.
    #
    # @param db [Sequel::Database]
    # @param query_embedding [Object] embedding passed through FragmentEmbedding.pack
    # @param limit [Integer, String] value bound to +k+, coerced with Integer()
    # @return [Array<Hash>] rows with id, fragment_document, content and
    #   distance; an empty array on a Sequel::DatabaseError
    def search(db, query_embedding, limit:)
      db.fetch(<<~SQL, FragmentEmbedding.pack(query_embedding), Integer(limit)).all
        SELECT fragments.id, fragments.fragment_document, fragments.content, fragment_vectors.distance AS distance
        FROM fragment_vectors
        JOIN fragments ON fragments.id = fragment_vectors.fragment_id
        WHERE embedding MATCH ? AND k = ?
        ORDER BY distance
      SQL
    rescue Sequel::DatabaseError
      []
    end
  end
end
|
data/lib/index_util.rb
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "index_util/version"
|
|
4
|
+
|
|
5
|
+
# Root namespace of the gem. Its components are registered for autoloading,
# so each file is only required when its constant is first referenced.
module IndexUtil
  # Base error class defined by this gem.
  class Error < StandardError
  end

  # Constant name => file that defines it.
  {
    CLI: "index_util/cli",
    Database: "index_util/database",
    Document: "index_util/document",
    Fragment: "index_util/fragment",
    FragmentEmbedding: "index_util/fragment_embedding",
    FragmentsFts: "index_util/fragments_fts",
    Index: "index_util/index",
    IndexMetadata: "index_util/index_metadata",
    Progress: "index_util/progress",
    VectorIndex: "index_util/vector_index"
  }.each do |constant_name, feature|
    autoload constant_name, feature
  end
end
|
metadata
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: index_util
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.1.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- hmdne
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: embedding_util
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: sequel
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - "~>"
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '5.0'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - "~>"
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '5.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: sqlite3
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '2.0'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '2.0'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: sqlite-vec
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '0.1'
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '0.1'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: thor
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - "~>"
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: '1.3'
|
|
75
|
+
type: :runtime
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - "~>"
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: '1.3'
|
|
82
|
+
description: A small rbutils gem for building local document indexes with SQLite FTS,
|
|
83
|
+
vector search, embeddings, and reranking.
|
|
84
|
+
email:
|
|
85
|
+
- 54514036+hmdne@users.noreply.github.com
|
|
86
|
+
executables:
|
|
87
|
+
- index_util
|
|
88
|
+
extensions: []
|
|
89
|
+
extra_rdoc_files: []
|
|
90
|
+
files:
|
|
91
|
+
- ".rspec"
|
|
92
|
+
- ".rubocop.yml"
|
|
93
|
+
- CHANGELOG.md
|
|
94
|
+
- LICENSE.txt
|
|
95
|
+
- README.md
|
|
96
|
+
- Rakefile
|
|
97
|
+
- exe/index_util
|
|
98
|
+
- lib/index_util.rb
|
|
99
|
+
- lib/index_util/cli.rb
|
|
100
|
+
- lib/index_util/database.rb
|
|
101
|
+
- lib/index_util/document.rb
|
|
102
|
+
- lib/index_util/fragment.rb
|
|
103
|
+
- lib/index_util/fragment_embedding.rb
|
|
104
|
+
- lib/index_util/fragments_fts.rb
|
|
105
|
+
- lib/index_util/index.rb
|
|
106
|
+
- lib/index_util/index_metadata.rb
|
|
107
|
+
- lib/index_util/migrations/001_create_core_tables.rb
|
|
108
|
+
- lib/index_util/migrations/002_create_fragments_fts.rb
|
|
109
|
+
- lib/index_util/migrations/003_create_fragment_vectors.rb
|
|
110
|
+
- lib/index_util/progress.rb
|
|
111
|
+
- lib/index_util/vector_index.rb
|
|
112
|
+
- lib/index_util/version.rb
|
|
113
|
+
homepage: https://github.com/rbutils/index_util
|
|
114
|
+
licenses:
|
|
115
|
+
- MIT
|
|
116
|
+
metadata:
|
|
117
|
+
allowed_push_host: https://rubygems.org
|
|
118
|
+
source_code_uri: https://github.com/rbutils/index_util
|
|
119
|
+
changelog_uri: https://github.com/rbutils/index_util/blob/master/CHANGELOG.md
|
|
120
|
+
documentation_uri: https://github.com/rbutils/index_util#readme
|
|
121
|
+
bug_tracker_uri: https://github.com/rbutils/index_util/issues
|
|
122
|
+
rubygems_mfa_required: 'true'
|
|
123
|
+
rdoc_options: []
|
|
124
|
+
require_paths:
|
|
125
|
+
- lib
|
|
126
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
127
|
+
requirements:
|
|
128
|
+
- - ">="
|
|
129
|
+
- !ruby/object:Gem::Version
|
|
130
|
+
version: 3.2.0
|
|
131
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
132
|
+
requirements:
|
|
133
|
+
- - ">="
|
|
134
|
+
- !ruby/object:Gem::Version
|
|
135
|
+
version: '0'
|
|
136
|
+
requirements: []
|
|
137
|
+
rubygems_version: 4.0.6
|
|
138
|
+
specification_version: 4
|
|
139
|
+
summary: Subclass-defined local search indexes for Ruby
|
|
140
|
+
test_files: []
|