phronomy 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +31 -41
- data/benchmark/baseline.json +1 -1
- data/benchmark/bench_agent_invoke.rb +1 -1
- data/benchmark/bench_context_assembler.rb +9 -1
- data/benchmark/bench_regression.rb +8 -8
- data/benchmark/bench_tool_schema.rb +2 -2
- data/benchmark/bench_vector_store.rb +1 -1
- data/docs/decisions/011-build-context-as-single-llm-input-authority.md +224 -0
- data/lib/phronomy/agent/base.rb +253 -351
- data/lib/phronomy/agent/concerns/suspendable.rb +6 -6
- data/lib/phronomy/agent/context/capability/base.rb +689 -0
- data/lib/phronomy/agent/context/capability/scope_policy.rb +54 -0
- data/lib/phronomy/agent/context/knowledge/base.rb +58 -0
- data/lib/phronomy/agent/context/knowledge/entity_knowledge.rb +102 -0
- data/lib/phronomy/agent/context/knowledge/static_knowledge.rb +58 -0
- data/lib/phronomy/agent/invocation_pipeline.rb +10 -1
- data/lib/phronomy/agent/react_agent.rb +24 -23
- data/lib/phronomy/agent/shared_state.rb +2 -2
- data/lib/phronomy/agent/tool_executor.rb +1 -1
- data/lib/phronomy/concurrency/gate_registry.rb +0 -1
- data/lib/phronomy/configuration.rb +0 -6
- data/lib/phronomy/llm_context_window/assembler.rb +77 -44
- data/lib/phronomy/multi_agent/handoff.rb +4 -4
- data/lib/phronomy/multi_agent/orchestrator.rb +1 -1
- data/lib/phronomy/multi_agent/team_coordinator.rb +2 -2
- data/lib/phronomy/runtime/runtime_metrics.rb +0 -1
- data/lib/phronomy/runtime.rb +1 -2
- data/lib/phronomy/tool.rb +3 -4
- data/lib/phronomy/{tool/agent_tool.rb → tools/agent.rb} +6 -6
- data/lib/phronomy/{tool/mcp_tool.rb → tools/mcp.rb} +9 -9
- data/lib/phronomy/tools/vector_search.rb +70 -0
- data/lib/phronomy/vector_store/async_backend.rb +110 -0
- data/lib/phronomy/vector_store/base.rb +89 -0
- data/lib/phronomy/vector_store/embeddings/base.rb +41 -0
- data/lib/phronomy/vector_store/embeddings/ruby_llm_embeddings.rb +47 -0
- data/lib/phronomy/vector_store/in_memory.rb +103 -0
- data/lib/phronomy/vector_store/loader/base.rb +27 -0
- data/lib/phronomy/vector_store/loader/csv_loader.rb +58 -0
- data/lib/phronomy/vector_store/loader/markdown_loader.rb +78 -0
- data/lib/phronomy/vector_store/loader/plain_text_loader.rb +24 -0
- data/lib/phronomy/vector_store/pgvector.rb +127 -0
- data/lib/phronomy/vector_store/redis_search.rb +192 -0
- data/lib/phronomy/vector_store/splitter/base.rb +49 -0
- data/lib/phronomy/vector_store/splitter/fixed_size_splitter.rb +53 -0
- data/lib/phronomy/vector_store/splitter/recursive_splitter.rb +107 -0
- data/lib/phronomy/vector_store.rb +16 -4
- data/lib/phronomy/version.rb +1 -1
- data/lib/phronomy.rb +2 -1
- data/scripts/api_snapshot.rb +11 -9
- metadata +28 -32
- data/lib/phronomy/agent/context/conversation/compaction_context.rb +0 -117
- data/lib/phronomy/agent/context/conversation/trigger_context.rb +0 -43
- data/lib/phronomy/agent/context/conversation/trim_context.rb +0 -82
- data/lib/phronomy/agent/context/knowledge/embeddings/base.rb +0 -45
- data/lib/phronomy/agent/context/knowledge/embeddings/ruby_llm_embeddings.rb +0 -51
- data/lib/phronomy/agent/context/knowledge/loader/base.rb +0 -31
- data/lib/phronomy/agent/context/knowledge/loader/csv_loader.rb +0 -62
- data/lib/phronomy/agent/context/knowledge/loader/markdown_loader.rb +0 -82
- data/lib/phronomy/agent/context/knowledge/loader/plain_text_loader.rb +0 -28
- data/lib/phronomy/agent/context/knowledge/source/base.rb +0 -60
- data/lib/phronomy/agent/context/knowledge/source/entity_knowledge.rb +0 -102
- data/lib/phronomy/agent/context/knowledge/source/rag_knowledge.rb +0 -63
- data/lib/phronomy/agent/context/knowledge/source/static_knowledge.rb +0 -58
- data/lib/phronomy/agent/context/knowledge/splitter/base.rb +0 -53
- data/lib/phronomy/agent/context/knowledge/splitter/fixed_size_splitter.rb +0 -57
- data/lib/phronomy/agent/context/knowledge/splitter/recursive_splitter.rb +0 -111
- data/lib/phronomy/agent/context/knowledge/vector_store/async_backend.rb +0 -116
- data/lib/phronomy/agent/context/knowledge/vector_store/base.rb +0 -95
- data/lib/phronomy/agent/context/knowledge/vector_store/in_memory.rb +0 -109
- data/lib/phronomy/agent/context/knowledge/vector_store/pgvector.rb +0 -133
- data/lib/phronomy/agent/context/knowledge/vector_store/redis_search.rb +0 -198
- data/lib/phronomy/embeddings.rb +0 -11
- data/lib/phronomy/loader.rb +0 -13
- data/lib/phronomy/splitter.rb +0 -12
- data/lib/phronomy/tool/base.rb +0 -685
- data/lib/phronomy/tool/scope_policy.rb +0 -50
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
|
|
5
|
+
module Phronomy
|
|
6
|
+
module VectorStore
|
|
7
|
+
module Loader
|
|
8
|
+
# Loads a CSV file, converting each row into a separate document.
#
# By default the first row is treated as a header and the column names
# become symbol keys in each document's metadata. Unless +text_column+ is
# given, the full row is serialised to a human-readable "key: value"
# string (one pair per line) for embedding.
#
# Metadata always carries +:source+ (the path passed to {#load}) and
# +:row+ (1-based index of the data row; the header row is not counted).
# Both are merged last, so they take precedence over CSV columns that
# happen to be named "source" or "row".
#
# @example
#   loader = Phronomy::VectorStore::Loader::CsvLoader.new
#   docs = loader.load("products.csv")
#   # => [
#   #   { text: "name: Widget\nprice: 9.99", metadata: { source: "...", row: 1, name: "Widget", price: "9.99" } },
#   #   ...
#   # ]
class CsvLoader < Base
  # @param headers [Boolean] treat the first row as headers (default: true)
  # @param text_column [String, Symbol, nil] if set, use only this column as
  #   the document text (ignored when +headers+ is false)
  # @api public
  def initialize(headers: true, text_column: nil)
    @headers = headers
    # Header names are parsed as Strings, so normalise a Symbol here.
    @text_column = text_column&.to_s
  end

  # Reads +source+ and returns one document per data row.
  #
  # @param source [String] path to a CSV file
  # @return [Array<Hash>] documents shaped as <tt>{ text: String, metadata: Hash }</tt>
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    rows = CSV.read(source, headers: @headers, encoding: "UTF-8")

    rows.each_with_index.map do |row, idx|
      if @headers
        named_row_document(row, idx + 1, source)
      else
        plain_row_document(row, idx + 1, source)
      end
    end
  end

  private

  # Builds a document from a row parsed with headers.
  def named_row_document(row, row_number, source)
    row_hash = row.to_h
    text = if @text_column
      row_hash[@text_column].to_s
    else
      row_hash.map { |k, v| "#{k}: #{v}" }.join("\n")
    end

    # A row with more fields than the header row yields a nil header key;
    # nil cannot be symbolised, so such unnamed fields are left out of the
    # metadata (they still appear in the serialised text).
    metadata = row_hash.each_with_object({}) do |(key, value), acc|
      acc[key.to_sym] = value unless key.nil?
    end

    {text: text, metadata: metadata.merge(source: source, row: row_number)}
  end

  # Builds a document from a header-less row (an Array of field values).
  def plain_row_document(row, row_number, source)
    {text: row.join(", "), metadata: {source: source, row: row_number}}
  end
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
module VectorStore
|
|
5
|
+
module Loader
|
|
6
|
+
# Loads a Markdown file, optionally splitting it at heading boundaries.
#
# When +split_on_headings:+ is true (the default), every ATX heading
# (H1 through H6) starts a new document so that embeddings capture section
# semantics rather than the full file at once. Text that precedes the
# first heading becomes its own document without a +:section+ key.
# Lines inside fenced code blocks (``` or ~~~) are never treated as
# headings, so "# comment" lines in code samples do not split a section.
#
# @example Single document (heading split disabled)
#   loader = Phronomy::VectorStore::Loader::MarkdownLoader.new(split_on_headings: false)
#   docs = loader.load("README.md")
#   # => [{ text: "# Title\n...", metadata: { source: "README.md" } }]
#
# @example Split per heading (default)
#   loader = Phronomy::VectorStore::Loader::MarkdownLoader.new
#   docs = loader.load("guide.md")
#   # => [
#   #   { text: "# Section 1\n...", metadata: { source: "guide.md", section: "Section 1" } },
#   #   { text: "## Sub-section\n...", metadata: { source: "guide.md", section: "Sub-section" } },
#   # ]
class MarkdownLoader < Base
  HEADING_RE = /^(\#{1,6})\s+(.+)$/

  # Opening/closing marker of a fenced code block: up to three spaces of
  # indentation followed by at least three backticks or tildes.
  FENCE_RE = /\A {0,3}(`{3,}|~{3,})/
  private_constant :FENCE_RE

  # @param split_on_headings [Boolean] split on H1–H6 boundaries (default: true)
  # @api public
  def initialize(split_on_headings: true)
    @split_on_headings = split_on_headings
  end

  # @param source [String] path to a Markdown file
  # @return [Array<Hash>] documents shaped as <tt>{ text: String, metadata: Hash }</tt>
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    content = File.read(source, encoding: "UTF-8")
    return [{text: content, metadata: {source: source}}] unless @split_on_headings

    split_by_headings(content, source)
  end

  private

  # Walks the file line by line, starting a new section at every heading
  # that is not inside a fenced code block.
  def split_by_headings(content, source)
    sections = []
    current_lines = []
    current_heading = nil
    open_fence = nil # marker string of the fence we are inside, if any

    content.each_line do |line|
      bare = line.chomp

      if (fence = FENCE_RE.match(bare))
        marker = fence[1]
        if open_fence.nil?
          open_fence = marker
        elsif fence_closes?(open_fence, marker, bare)
          open_fence = nil
        end
        current_lines << line
        next
      end

      heading = open_fence.nil? ? HEADING_RE.match(bare) : nil
      if heading
        flush_section(sections, current_lines, current_heading, source) if current_lines.any?
        current_heading = heading[2].strip
        current_lines = [line]
      else
        current_lines << line
      end
    end

    flush_section(sections, current_lines, current_heading, source) if current_lines.any?

    # No section is emitted only when the file is empty or whitespace-only;
    # return it as a single document in that case.
    sections.empty? ? [{text: content, metadata: {source: source}}] : sections
  end

  # A fence is closed by a line made up solely of the same fence character,
  # repeated at least as many times as in the opening marker.
  def fence_closes?(opening, marker, line)
    marker[0] == opening[0] && marker.length >= opening.length && line.strip == marker
  end

  # Appends the accumulated lines as one section unless they are blank.
  def flush_section(sections, lines, heading, source)
    text = lines.join
    return if text.strip.empty?

    metadata = {source: source}
    metadata[:section] = heading if heading
    sections << {text: text, metadata: metadata}
  end
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
module VectorStore
|
|
5
|
+
module Loader
|
|
6
|
+
# Reads a plain-text file and wraps its entire contents in one document.
#
# @example
#   loader = Phronomy::VectorStore::Loader::PlainTextLoader.new
#   docs = loader.load("/path/to/file.txt")
#   # => [{ text: "...", metadata: { source: "/path/to/file.txt" } }]
class PlainTextLoader < Base
  # Reads +source+ as UTF-8 and returns it as a one-element document list.
  #
  # @param source [String] absolute or relative path to a text file
  # @return [Array<Hash>] single-element array with the file contents
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    document = {
      text: File.read(source, encoding: "UTF-8"),
      metadata: {source: source}
    }
    [document]
  end
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Phronomy
|
|
6
|
+
module VectorStore
|
|
7
|
+
# PostgreSQL-backed vector store using the pgvector extension.
#
# Requires:
# - The +pgvector+ gem (add to your Gemfile)
# - An ActiveRecord model class with the following columns:
#     id        (string / uuid)
#     embedding (vector — from the pgvector column type)
#     metadata  (text or jsonb — stores arbitrary metadata as JSON)
#
# @example Usage
#   store = Phronomy::VectorStore::Pgvector.new(model_class: VectorDocument)
#   store.add(id: "doc1", embedding: [0.1, 0.9], metadata: {text: "hello"})
#   results = store.search(query_embedding: [0.1, 0.8], k: 5)
class Pgvector < Base
  # @param model_class [Class] ActiveRecord model with id/embedding/metadata columns
  # @param dimension [Integer, nil] expected embedding dimension for Phronomy-side
  #   pre-validation. When nil, dimension enforcement is delegated to the
  #   database schema; no pre-validation is performed by Phronomy.
  # @raise [LoadError] if the pgvector gem is not installed
  # @api public
  def initialize(model_class:, dimension: nil)
    # Loaded lazily so the gem stays optional for users of other backends.
    begin
      require "pgvector"
    rescue LoadError
      raise LoadError,
        "pgvector gem is required for Phronomy::VectorStore::Pgvector. " \
        "Add `gem 'pgvector'` to your Gemfile."
    end
    @model_class = model_class
    @dimension = dimension
  end

  # Inserts the document, or replaces the row that has the same id.
  #
  # @param id [String]
  # @param embedding [Array<Float>]
  # @param metadata [Hash]
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  # @return [self]
  # @api public
  def add(id:, embedding:, metadata: {}, cancellation_token: nil)
    cancellation_token&.raise_if_cancelled!
    validate_embedding_dimension!(embedding, @dimension)
    @model_class.upsert(
      {id: id, embedding: safe_vector(embedding), metadata: metadata.to_json},
      unique_by: :id
    )
    self
  end

  # Returns the +k+ nearest documents by cosine distance (<tt><=></tt>).
  #
  # @param query_embedding [Array<Float>]
  # @param k [Integer]
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  # @return [Array<Hash>] sorted by descending similarity score
  # @api public
  def search(query_embedding:, k: 5, cancellation_token: nil)
    cancellation_token&.raise_if_cancelled!
    k_safe = validate_k!(k)
    validate_embedding_dimension!(query_embedding, @dimension)
    vec = safe_vector_literal(query_embedding)
    conn = @model_class.connection
    # The literal contains only Float-formatted numbers and is additionally
    # quoted by the connection before being interpolated into SQL.
    quoted_vec = "#{conn.quote(vec)}::vector"
    distance_sql = "embedding <=> #{quoted_vec}"

    @model_class
      .select("id, metadata, 1 - (#{distance_sql}) AS score")
      .order(raw_sql(distance_sql))
      .limit(k_safe)
      .map do |record|
        {
          id: record.id.to_s,
          score: record.score.to_f,
          metadata: parse_metadata(record.metadata)
        }
      end
  end

  # Deletes the document with the given id, if present.
  #
  # @param id [String]
  # @return [self]
  def remove(id:)
    @model_class.where(id: id).delete_all
    self
  end

  # Deletes every row of the backing table.
  #
  # @return [self]
  def clear
    @model_class.delete_all
    self
  end

  # Returns the number of documents in the backing table.
  #
  # @return [Integer]
  def size
    @model_class.count
  end

  private

  # Marks +sql+ as a trusted raw fragment. ActiveRecord refuses raw SQL
  # strings passed to #order unless they are wrapped in Arel.sql; the plain
  # string is kept when Arel is not loaded (e.g. a duck-typed model class).
  def raw_sql(sql)
    defined?(::Arel) ? ::Arel.sql(sql) : sql
  end

  # Parses a metadata value returned by the pg driver.
  # Handles NULL (nil), already-parsed Hash, and JSON string forms.
  # Anything that is not a JSON object yields an empty Hash.
  def parse_metadata(raw)
    return {} if raw.nil?
    return symbolize_hash_keys(raw) if raw.is_a?(Hash)

    parsed = JSON.parse(raw.to_s, symbolize_names: true)
    parsed.is_a?(Hash) ? parsed : {}
  rescue JSON::ParserError
    {}
  end

  # Recursively symbolizes Hash keys, descending into nested Hashes and
  # Arrays so the result matches JSON.parse(..., symbolize_names: true).
  def symbolize_hash_keys(value)
    case value
    when Hash
      value.each_with_object({}) { |(k, v), acc| acc[k.to_sym] = symbolize_hash_keys(v) }
    when Array
      value.map { |item| symbolize_hash_keys(item) }
    else
      value
    end
  end

  # Validates that all elements are numeric and converts to a pgvector-
  # compatible literal string (e.g. "[1.0,0.5,-0.3]").
  # Float() raises for non-numeric elements.
  def safe_vector_literal(embedding)
    "[#{embedding.map { |v| Float(v) }.join(",")}]"
  end

  # Returns a validated vector for the upsert call.
  def safe_vector(embedding)
    safe_vector_literal(embedding)
  end
end
|
|
126
|
+
end
|
|
127
|
+
end
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Phronomy
|
|
6
|
+
module VectorStore
|
|
7
|
+
# Redis-backed vector store using the RediSearch module (FT.* commands).
#
# Requires:
# - The +redis+ gem (add to your Gemfile)
# - A Redis server with the RediSearch (RedisSearch) module enabled
#   (or Redis Stack which bundles RediSearch)
#
# Vectors are stored as FLOAT32 binary blobs in Redis Hash fields and
# searched with a KNN query over a FLAT index using cosine distance.
#
# @example Usage
#   redis = Redis.new(url: "redis://localhost:6379")
#   store = Phronomy::VectorStore::RedisSearch.new(redis: redis, dimension: 1536)
#   store.add(id: "doc1", embedding: [0.1, 0.9], metadata: {text: "hello"})
#   results = store.search(query_embedding: [0.1, 0.8], k: 5)
class RedisSearch < Base
  DOC_PREFIX = "phronomy_doc:"
  private_constant :DOC_PREFIX

  # @param redis [Redis] configured Redis client
  # @param index_name [String] RediSearch index name
  # @param dimension [Integer, nil] vector dimension; auto-detected on first add.
  #   When connecting to an **existing** RediSearch index, you MUST pass
  #   dimension: explicitly. Without it, a freshly constructed instance
  #   treats the index as uninitialized until #add is called, and #search
  #   silently returns [] in the meantime.
  # @raise [LoadError] if the redis gem is not installed
  # @api public
  def initialize(redis:, index_name: "phronomy_vectors", dimension: nil)
    # Loaded lazily so the gem stays optional for users of other backends.
    begin
      require "redis"
    rescue LoadError
      raise LoadError,
        "redis gem is required for Phronomy::VectorStore::RedisSearch. " \
        "Add `gem 'redis'` to your Gemfile."
    end
    @redis = redis
    @index_name = index_name
    @dimension = dimension
    @index_created = false
    @mutex = Mutex.new
  end

  # Stores (or overwrites) the document under <tt>DOC_PREFIX + id</tt>.
  #
  # @param id [String]
  # @param embedding [Array<Float>]
  # @param metadata [Hash]
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  # @return [self]
  # @api public
  def add(id:, embedding:, metadata: {}, cancellation_token: nil)
    cancellation_token&.raise_if_cancelled!
    # Establish expected dimension on first add (not race-free for concurrent
    # first adds), then validate, then create/reuse the index.
    @dimension ||= embedding.size
    validate_embedding_dimension!(embedding, @dimension)
    ensure_index!(@dimension)
    @redis.call(
      "HSET", "#{DOC_PREFIX}#{id}",
      "embedding", pack_vector(embedding),
      "metadata", metadata.to_json
    )
    self
  end

  # Returns the +k+ nearest documents.
  #
  # @param query_embedding [Array<Float>]
  # @param k [Integer]
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  # @return [Array<Hash>] sorted by descending similarity score
  # @api public
  def search(query_embedding:, k: 5, cancellation_token: nil)
    cancellation_token&.raise_if_cancelled!
    # search never establishes dimension. If dimension is unknown and the
    # index has not been created yet, there are no documents to return.
    return [] if @dimension.nil? && !@index_created

    validate_embedding_dimension!(query_embedding, @dimension)
    ensure_index!(@dimension)
    k_safe = validate_k!(k)
    blob = pack_vector(query_embedding)

    raw = @redis.call(
      "FT.SEARCH", @index_name,
      "*=>[KNN #{k_safe} @embedding $BLOB AS score]",
      "PARAMS", 2, "BLOB", blob,
      "SORTBY", "score",
      # FT.SEARCH pages results and defaults to LIMIT 0 10; without an
      # explicit LIMIT a request for k > 10 would be cut off at 10 hits.
      "LIMIT", 0, k_safe,
      "RETURN", 2, "score", "metadata",
      "DIALECT", 2
    )

    parse_results(raw)
  end

  # Deletes the document with the given id, if present.
  #
  # @param id [String]
  # @return [self]
  def remove(id:)
    @redis.call("DEL", "#{DOC_PREFIX}#{id}")
    self
  end

  # Returns the number of documents indexed.
  # Queries FT.INFO when the index has been created; returns 0 otherwise.
  # Best-effort: any error raised while querying also yields 0.
  #
  # @return [Integer]
  def size
    return 0 unless @index_created

    raw = @redis.call("FT.INFO", @index_name)
    return 0 unless raw.is_a?(Array)

    # The reply is a flat [name, value, name, value, ...] list.
    idx = raw.index("num_docs")
    idx ? raw[idx + 1].to_i : 0
  rescue StandardError
    0
  end

  # Drops the index together with its documents (the DD flag). A missing
  # index is not an error.
  #
  # @return [self]
  def clear
    @mutex.synchronize do
      begin
        @redis.call("FT.DROPINDEX", @index_name, "DD")
      rescue => e
        raise unless error_mentions?(e, "unknown index name")
      end
      @index_created = false
    end
    self
  end

  private

  # Creates the index once per instance; an index that already exists on
  # the server is reused.
  def ensure_index!(dim)
    @mutex.synchronize do
      return if @index_created

      @dimension ||= dim
      begin
        @redis.call(
          "FT.CREATE", @index_name,
          "ON", "HASH",
          "PREFIX", 1, DOC_PREFIX,
          "SCHEMA",
          # 6 = number of attribute tokens that follow for the vector field
          "embedding", "VECTOR", "FLAT", 6,
          "TYPE", "FLOAT32",
          "DIM", @dimension,
          "DISTANCE_METRIC", "COSINE",
          "metadata", "TEXT"
        )
      rescue => e
        raise unless error_mentions?(e, "index already exists")
      end
      @index_created = true
    end
  end

  # True when the error message contains +phrase+. Compared
  # case-insensitively so a change in the server's capitalisation does not
  # turn a harmless no-op into a raised error.
  def error_mentions?(error, phrase)
    error.message.to_s.downcase.include?(phrase)
  end

  # Pack a Float array as a FLOAT32 binary string for RediSearch.
  # Float() raises for non-numeric elements.
  def pack_vector(embedding)
    embedding.map { |v| Float(v) }.pack("f*")
  end

  # Parse the raw FT.SEARCH response into the standard Hash format.
  #
  # Redis FT.SEARCH returns: [count, key1, [field, value, ...], key2, ...]
  def parse_results(raw)
    return [] unless raw.is_a?(Array) && raw.size >= 2

    raw.drop(1).each_slice(2).each_with_object([]) do |(key, fields), results|
      next unless fields.is_a?(Array)

      field_hash = fields.each_slice(2).to_h
      score_str = field_hash["score"]
      next if score_str.nil?

      results << {
        id: key.to_s.delete_prefix(DOC_PREFIX),
        # RediSearch returns cosine distance (0=identical, 2=opposite);
        # convert to cosine similarity for consistency with other backends.
        score: 1.0 - score_str.to_f,
        metadata: parse_metadata(field_hash["metadata"])
      }
    end
  end

  # Decodes the stored metadata JSON. A missing, malformed or non-object
  # value yields {} so that one bad document cannot abort a whole search.
  def parse_metadata(raw)
    return {} if raw.nil?

    parsed = JSON.parse(raw.to_s, symbolize_names: true)
    parsed.is_a?(Hash) ? parsed : {}
  rescue JSON::ParserError
    {}
  end
end
|
|
191
|
+
end
|
|
192
|
+
end
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
module VectorStore
|
|
5
|
+
module Splitter
|
|
6
|
+
# Abstract base class for text splitters.
#
# A splitter takes a single document hash (or plain text) and returns an
# array of smaller chunk documents:
#
#   [{ text: String, metadata: Hash }, ...]
#
# Subclasses must implement {#split}.
class Base
  # Split +document+ into an array of chunk documents.
  #
  # @param document [Hash, String]
  #   Either a document hash (<tt>{ text: String, metadata: Hash }</tt>)
  #   returned by a Loader, or a plain String.
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
  # @raise [NotImplementedError] when not overridden by a subclass
  # @api public
  def split(document)
    raise NotImplementedError, "#{self.class}#split is not implemented"
  end

  # Convenience method: split an array of documents and concatenate the
  # resulting chunks in input order.
  #
  # @param documents [Array<Hash, String>]
  # @return [Array<Hash>]
  # @api public
  def split_all(documents)
    documents.flat_map { |doc| split(doc) }
  end

  private

  # Normalise a document-or-string argument into {text:, metadata:}.
  #
  # A missing or nil +:metadata+ becomes an empty Hash so callers can
  # always merge into it; a missing +:text+ becomes "".
  #
  # @raise [ArgumentError] if +document+ is neither a Hash nor a String
  def normalise(document)
    case document
    when Hash then {text: document[:text].to_s, metadata: document[:metadata] || {}}
    when String then {text: document, metadata: {}}
    else raise ArgumentError, "document must be a Hash or String, got #{document.class}"
    end
  end
end
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
module VectorStore
|
|
5
|
+
module Splitter
|
|
6
|
+
# Splits text into fixed-size character chunks with optional overlap.
#
# Each chunk starts <tt>chunk_size - chunk_overlap</tt> characters after
# the previous one, so consecutive chunks share +chunk_overlap+ characters.
# Every chunk's metadata is the document's metadata plus a 0-based
# +:chunk+ index.
#
# @example
#   splitter = Phronomy::VectorStore::Splitter::FixedSizeSplitter.new(chunk_size: 200, chunk_overlap: 20)
#   chunks = splitter.split({ text: long_text, metadata: { source: "doc.txt" } })
#   # => [
#   #   { text: "...(200 chars)...", metadata: { source: "doc.txt", chunk: 0 } },
#   #   { text: "...(200 chars, 20-char overlap)...", metadata: { source: "doc.txt", chunk: 1 } },
#   # ]
class FixedSizeSplitter < Base
  # @param chunk_size [Integer] maximum characters per chunk (default: 1000);
  #   must be positive
  # @param chunk_overlap [Integer] characters to repeat at the start of each
  #   subsequent chunk (default: 200); must be non-negative and less than
  #   chunk_size
  # @raise [ArgumentError] if the sizes are out of range
  # @api public
  def initialize(chunk_size: 1000, chunk_overlap: 200)
    raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
    # A non-positive size would emit empty chunks and a negative overlap
    # would silently skip text between chunks.
    raise ArgumentError, "chunk_size must be positive" unless chunk_size.positive?
    raise ArgumentError, "chunk_overlap must not be negative" if chunk_overlap.negative?

    @chunk_size = chunk_size
    @chunk_overlap = chunk_overlap
  end

  # @param document [Hash, String]
  # @return [Array<Hash>] chunk documents; empty when the text is empty
  # @api public
  def split(document)
    doc = normalise(document)
    text = doc[:text]
    base_metadata = doc[:metadata]
    stride = @chunk_size - @chunk_overlap

    chunks = []
    start = 0
    index = 0

    while start < text.length
      chunks << {text: text[start, @chunk_size], metadata: base_metadata.merge(chunk: index)}
      # Stop once this chunk reaches the end of the text; advancing further
      # would only emit a chunk fully contained in the overlap.
      break if start + @chunk_size >= text.length

      start += stride
      index += 1
    end

    chunks
  end
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
module VectorStore
|
|
5
|
+
module Splitter
|
|
6
|
+
# Splits text recursively using a prioritised list of separator strings.
#
# The splitter tries each separator in order. When a separator produces
# chunks that are still larger than +chunk_size+, it recurses with the
# next separator in the list. This mirrors LangChain's
# RecursiveCharacterTextSplitter behaviour.
#
# Default separators (in priority order):
#   1. "\n\n" — paragraph breaks
#   2. "\n"   — line breaks
#   3. ". "   — sentence boundaries
#   4. " "    — word boundaries
#   5. ""     — character-level fallback
#
# The resulting pieces are then merged back into chunks of at most
# +chunk_size+ characters, each new chunk starting with up to
# +chunk_overlap+ characters taken from the end of the previous one.
#
# @example
#   splitter = Phronomy::VectorStore::Splitter::RecursiveSplitter.new(chunk_size: 300, chunk_overlap: 30)
#   chunks = splitter.split({ text: long_markdown, metadata: { source: "guide.md" } })
class RecursiveSplitter < Base
  DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""].freeze

  # @param chunk_size [Integer] maximum characters per chunk (default: 1000);
  #   must be positive
  # @param chunk_overlap [Integer] overlap characters (default: 200); must be
  #   non-negative and less than chunk_size
  # @param separators [Array<String>] separator list in priority order
  # @raise [ArgumentError] if the sizes are out of range
  # @api public
  def initialize(chunk_size: 1000, chunk_overlap: 200, separators: DEFAULT_SEPARATORS)
    raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
    raise ArgumentError, "chunk_size must be positive" unless chunk_size.positive?
    raise ArgumentError, "chunk_overlap must not be negative" if chunk_overlap.negative?

    @chunk_size = chunk_size
    @chunk_overlap = chunk_overlap
    @separators = separators
  end

  # @param document [Hash, String]
  # @return [Array<Hash>] chunk documents whose metadata is the document's
  #   metadata plus a 0-based +:chunk+ index
  # @api public
  def split(document)
    doc = normalise(document)
    pieces = recursive_split(doc[:text], @separators)
    merge_with_overlap(pieces).each_with_index.map do |text, idx|
      {text: text, metadata: doc[:metadata].merge(chunk: idx)}
    end
  end

  private

  # Split +text+ using the first separator that yields non-trivial pieces,
  # then recurse on any piece that is still too large.
  def recursive_split(text, separators)
    return [text] if text.length <= @chunk_size || separators.empty?

    sep, *rest_seps = separators

    # Character-level fallback: just slice
    if sep == ""
      return FixedSizeSplitter
          .new(chunk_size: @chunk_size, chunk_overlap: @chunk_overlap)
          .split(text)
          .map { |c| c[:text] }
    end

    parts = text.split(sep)

    # If this separator doesn't split, try the next
    return recursive_split(text, rest_seps) if parts.length <= 1

    # Re-attach the separator to each part except the last so context is preserved
    last_index = parts.length - 1
    parts_with_sep = parts.each_with_index.map do |part, i|
      (i < last_index) ? part + sep : part
    end

    pieces = parts_with_sep.flat_map do |part|
      (part.length > @chunk_size) ? recursive_split(part, rest_seps) : [part]
    end
    pieces.reject { |piece| piece.strip.empty? }
  end

  # Merge small adjacent pieces and apply overlap between chunks.
  def merge_with_overlap(texts)
    merged = []
    current = +""

    texts.each do |text|
      if current.length + text.length <= @chunk_size
        current << text
        next
      end

      merged << current.strip unless current.strip.empty?

      # Start the next chunk with overlap from the end of the current one,
      # but never more than fits beside +text+ within chunk_size. The
      # explicit zero check matters: current[-0..] is the WHOLE string, so
      # a zero overlap must not go through the slice.
      room = @chunk_size - text.length
      overlap_len = [@chunk_overlap, room, current.length].min
      tail = overlap_len.positive? ? current[-overlap_len..] : ""
      current = tail + text
    end

    merged << current.strip unless current.strip.empty?
    merged
  end
end
|
|
105
|
+
end
|
|
106
|
+
end
|
|
107
|
+
end
|