phronomy 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +35 -45
- data/benchmark/baseline.json +1 -1
- data/benchmark/bench_agent_invoke.rb +1 -1
- data/benchmark/bench_context_assembler.rb +11 -3
- data/benchmark/bench_regression.rb +11 -11
- data/benchmark/bench_token_estimator.rb +5 -5
- data/benchmark/bench_tool_schema.rb +2 -2
- data/docs/decisions/011-build-context-as-single-llm-input-authority.md +224 -0
- data/lib/phronomy/agent/base.rb +268 -403
- data/lib/phronomy/agent/checkpoint.rb +118 -0
- data/lib/phronomy/agent/concerns/suspendable.rb +6 -6
- data/lib/phronomy/agent/context/capability/base.rb +689 -0
- data/lib/phronomy/agent/context/capability/scope_policy.rb +54 -0
- data/lib/phronomy/agent/context/instruction/prompt_template.rb +102 -0
- data/lib/phronomy/agent/context/knowledge/base.rb +58 -0
- data/lib/phronomy/agent/context/knowledge/entity_knowledge.rb +102 -0
- data/lib/phronomy/agent/context/knowledge/static_knowledge.rb +58 -0
- data/lib/phronomy/agent/fsm.rb +1 -1
- data/lib/phronomy/agent/invocation_pipeline.rb +108 -0
- data/lib/phronomy/agent/lifecycle/fsm_session.rb +251 -0
- data/lib/phronomy/agent/lifecycle/phase_machine_builder.rb +249 -0
- data/lib/phronomy/agent/react_agent.rb +43 -37
- data/lib/phronomy/agent/runner.rb +2 -2
- data/lib/phronomy/agent/shared_state.rb +2 -2
- data/lib/phronomy/agent/tool_executor.rb +108 -0
- data/lib/phronomy/concurrency/async_queue.rb +157 -0
- data/lib/phronomy/concurrency/blocking_adapter_pool.rb +443 -0
- data/lib/phronomy/concurrency/cancellation_scope.rb +125 -0
- data/lib/phronomy/concurrency/cancellation_token.rb +140 -0
- data/lib/phronomy/concurrency/concurrency_gate.rb +157 -0
- data/lib/phronomy/concurrency/deadline.rb +65 -0
- data/lib/phronomy/{runtime → concurrency}/gate_registry.rb +1 -2
- data/lib/phronomy/{runtime → concurrency}/pool_registry.rb +1 -1
- data/lib/phronomy/configuration.rb +0 -6
- data/lib/phronomy/context.rb +2 -8
- data/lib/phronomy/eval/runner.rb +4 -0
- data/lib/phronomy/eval/scorer/llm_judge.rb +12 -1
- data/lib/phronomy/event_loop.rb +7 -7
- data/lib/phronomy/invocation_context.rb +3 -3
- data/lib/phronomy/knowledge_source.rb +0 -5
- data/lib/phronomy/llm_adapter/ruby_llm.rb +17 -11
- data/lib/phronomy/llm_context_window/assembler.rb +191 -0
- data/lib/phronomy/{context → llm_context_window}/context_version_cache.rb +1 -1
- data/lib/phronomy/{context → llm_context_window}/token_budget.rb +7 -4
- data/lib/phronomy/{context → llm_context_window}/token_estimator.rb +3 -3
- data/lib/phronomy/{agent → multi_agent}/handoff.rb +6 -6
- data/lib/phronomy/{agent → multi_agent}/orchestrator.rb +7 -7
- data/lib/phronomy/{agent → multi_agent}/parallel_tool_chat.rb +4 -4
- data/lib/phronomy/{agent → multi_agent}/team_coordinator.rb +4 -4
- data/lib/phronomy/runtime/runtime_metrics.rb +0 -1
- data/lib/phronomy/runtime.rb +20 -6
- data/lib/phronomy/task_group.rb +1 -1
- data/lib/phronomy/tool.rb +3 -4
- data/lib/phronomy/{tool/agent_tool.rb → tools/agent.rb} +6 -6
- data/lib/phronomy/{tool/mcp_tool.rb → tools/mcp.rb} +9 -9
- data/lib/phronomy/tools/vector_search.rb +70 -0
- data/lib/phronomy/tracing/null_tracer.rb +3 -1
- data/lib/phronomy/vector_store/async_backend.rb +4 -4
- data/lib/phronomy/vector_store/base.rb +2 -2
- data/lib/phronomy/vector_store/embeddings/base.rb +41 -0
- data/lib/phronomy/vector_store/embeddings/ruby_llm_embeddings.rb +47 -0
- data/lib/phronomy/vector_store/in_memory.rb +12 -2
- data/lib/phronomy/vector_store/loader/base.rb +27 -0
- data/lib/phronomy/vector_store/loader/csv_loader.rb +58 -0
- data/lib/phronomy/vector_store/loader/markdown_loader.rb +78 -0
- data/lib/phronomy/vector_store/loader/plain_text_loader.rb +24 -0
- data/lib/phronomy/vector_store/pgvector.rb +2 -2
- data/lib/phronomy/vector_store/redis_search.rb +2 -2
- data/lib/phronomy/vector_store/splitter/base.rb +49 -0
- data/lib/phronomy/vector_store/splitter/fixed_size_splitter.rb +53 -0
- data/lib/phronomy/vector_store/splitter/recursive_splitter.rb +107 -0
- data/lib/phronomy/vector_store.rb +14 -2
- data/lib/phronomy/version.rb +1 -1
- data/lib/phronomy/workflow_context.rb +8 -0
- data/lib/phronomy/workflow_runner.rb +11 -131
- data/lib/phronomy.rb +2 -0
- data/scripts/api_snapshot.rb +11 -9
- metadata +44 -46
- data/lib/phronomy/async_queue.rb +0 -155
- data/lib/phronomy/blocking_adapter_pool.rb +0 -435
- data/lib/phronomy/cancellation_scope.rb +0 -123
- data/lib/phronomy/cancellation_token.rb +0 -133
- data/lib/phronomy/concurrency_gate.rb +0 -155
- data/lib/phronomy/context/assembler.rb +0 -143
- data/lib/phronomy/context/compaction_context.rb +0 -111
- data/lib/phronomy/context/trigger_context.rb +0 -39
- data/lib/phronomy/context/trim_context.rb +0 -75
- data/lib/phronomy/deadline.rb +0 -63
- data/lib/phronomy/embeddings/base.rb +0 -39
- data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +0 -45
- data/lib/phronomy/embeddings.rb +0 -11
- data/lib/phronomy/fsm_session.rb +0 -247
- data/lib/phronomy/knowledge_source/base.rb +0 -54
- data/lib/phronomy/knowledge_source/entity_knowledge.rb +0 -96
- data/lib/phronomy/knowledge_source/rag_knowledge.rb +0 -57
- data/lib/phronomy/knowledge_source/static_knowledge.rb +0 -52
- data/lib/phronomy/loader/base.rb +0 -25
- data/lib/phronomy/loader/csv_loader.rb +0 -56
- data/lib/phronomy/loader/markdown_loader.rb +0 -76
- data/lib/phronomy/loader/plain_text_loader.rb +0 -22
- data/lib/phronomy/loader.rb +0 -13
- data/lib/phronomy/prompt_template.rb +0 -96
- data/lib/phronomy/splitter/base.rb +0 -47
- data/lib/phronomy/splitter/fixed_size_splitter.rb +0 -51
- data/lib/phronomy/splitter/recursive_splitter.rb +0 -105
- data/lib/phronomy/splitter.rb +0 -12
- data/lib/phronomy/tool/base.rb +0 -644
- data/lib/phronomy/tool/scope_policy.rb +0 -50
- data/lib/phronomy/tool_executor.rb +0 -106
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
  module Tools
    # Semantic-search tool: pairs a {Phronomy::VectorStore::Base} with a
    # {Phronomy::VectorStore::Embeddings::Base} adapter and exposes the pair to
    # an agent as a single tool call.
    #
    # This class is only a template; its own {#execute} raises. Call
    # {.from_store} to obtain a configured subclass and hand that to your agent.
    #
    # @example
    #   store = Phronomy::VectorStore::InMemory.new
    #   emb   = Phronomy::VectorStore::Embeddings::RubyLLMEmbeddings.new(model: "...")
    #   tool  = Phronomy::Tools::VectorSearch.from_store(store, embeddings: emb,
    #             k: 3, tool_name: "search_docs",
    #             description: "Search the company knowledge base.")
    #   agent = MyAgent.new
    #   agent.tools tool
    #
    # @api public
    class VectorSearch < Phronomy::Agent::Context::Capability::Base
      description "Search for relevant documents using semantic similarity."
      param :query, type: :string, desc: "The natural-language search query"

      # Creates an anonymous VectorSearch subclass bound to +store+ and
      # +embeddings+. The arguments are captured by closure, so instances of
      # the returned class are built without constructor arguments.
      #
      # @param store [Phronomy::VectorStore::Base]
      # @param embeddings [Phronomy::VectorStore::Embeddings::Base]
      # @param k [Integer] number of results to return (default 5)
      # @param tool_name [String] name exposed to the LLM
      # @param description [String, nil] optional description override
      # @return [Class] anonymous subclass of VectorSearch configured with the given store
      # @api public
      def self.from_store(store, embeddings:, k: 5, tool_name: "vector_search", description: nil)
        summary = description || "Search the vector store for documents similar to the query."

        Class.new(self).tap do |configured|
          configured.tool_name(tool_name)
          configured.description(summary)

          configured.define_method(:initialize) do
            @store = store
            @embeddings = embeddings
            @k = k
          end

          # Embeds the query, searches the store and renders one numbered
          # line per hit ("[n] (score: s) text").
          configured.define_method(:execute) do |query:|
            hits = @store.search(query_embedding: @embeddings.embed(query), k: @k)

            if hits.empty?
              "No results found."
            else
              lines = hits.map.with_index(1) do |hit, position|
                # Prefer metadata[:content], then metadata[:text], then the raw metadata.
                body = hit.dig(:metadata, :content) || hit.dig(:metadata, :text) || hit[:metadata].to_s
                "[#{position}] (score: #{hit[:score].round(3)}) #{body}"
              end
              lines.join("\n")
            end
          end
        end
      end

      # Placeholder implementation; only subclasses produced by {.from_store}
      # can actually run a search.
      #
      # @raise [NotImplementedError] always
      # @api public
      def execute(query:)
        raise NotImplementedError, "Use VectorSearch.from_store to create a configured instance"
      end
    end
  end
end
|
|
@@ -16,7 +16,9 @@ module Phronomy
|
|
|
16
16
|
# Returns a minimal span object with the given name.
|
|
17
17
|
def start_span(name, **) = SpanStruct.new(name)
|
|
18
18
|
|
|
19
|
-
# Does nothing.
|
|
19
|
+
# Does nothing. Explicit nil is equivalent to an empty method body; the
|
|
20
|
+
# mutation "remove nil" is accepted as it does not change observable behaviour.
|
|
21
|
+
# mutant:disable
|
|
20
22
|
def finish_span(span, **) = nil
|
|
21
23
|
end
|
|
22
24
|
end
|
|
@@ -36,7 +36,7 @@ module Phronomy
|
|
|
36
36
|
# @param id [String]
|
|
37
37
|
# @param embedding [Array<Float>]
|
|
38
38
|
# @param metadata [Hash]
|
|
39
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
39
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
40
40
|
# @param timeout [Numeric, nil]
|
|
41
41
|
# @return [BlockingAdapterPool::PendingOperation]
|
|
42
42
|
# @api public
|
|
@@ -56,7 +56,7 @@ module Phronomy
|
|
|
56
56
|
#
|
|
57
57
|
# @param query_embedding [Array<Float>]
|
|
58
58
|
# @param k [Integer]
|
|
59
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
59
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
60
60
|
# @param timeout [Numeric, nil]
|
|
61
61
|
# @return [BlockingAdapterPool::PendingOperation]
|
|
62
62
|
# @api public
|
|
@@ -75,7 +75,7 @@ module Phronomy
|
|
|
75
75
|
# Override to use a native async driver.
|
|
76
76
|
#
|
|
77
77
|
# @param id [String]
|
|
78
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
78
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
79
79
|
# @param timeout [Numeric, nil]
|
|
80
80
|
# @return [BlockingAdapterPool::PendingOperation]
|
|
81
81
|
# @api public
|
|
@@ -93,7 +93,7 @@ module Phronomy
|
|
|
93
93
|
# Submits the clear call to {BlockingAdapterPool} by default.
|
|
94
94
|
# Override to use a native async driver.
|
|
95
95
|
#
|
|
96
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
96
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
97
97
|
# @param timeout [Numeric, nil]
|
|
98
98
|
# @return [BlockingAdapterPool::PendingOperation]
|
|
99
99
|
# @api public
|
|
@@ -19,7 +19,7 @@ module Phronomy
|
|
|
19
19
|
# @param id [String] unique document identifier
|
|
20
20
|
# @param embedding [Array<Float>] vector embedding
|
|
21
21
|
# @param metadata [Hash] arbitrary metadata (e.g. the original message object)
|
|
22
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
22
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
23
23
|
# @api public
|
|
24
24
|
def add(id:, embedding:, metadata: {}, cancellation_token: nil)
|
|
25
25
|
cancellation_token&.raise_if_cancelled!
|
|
@@ -30,7 +30,7 @@ module Phronomy
|
|
|
30
30
|
#
|
|
31
31
|
# @param query_embedding [Array<Float>]
|
|
32
32
|
# @param k [Integer] number of results
|
|
33
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
33
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
34
34
|
# @return [Array<Hash>] each element: { id:, score:, metadata: }
|
|
35
35
|
# @api public
|
|
36
36
|
def search(query_embedding:, k: 5, cancellation_token: nil)
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
  module VectorStore
    module Embeddings
      # Contract shared by all embedding adapters.
      #
      # Subclasses override {#embed} to turn a piece of text into an
      # +Array<Float>+; {#embed_async} is provided on top of it.
      class Base
        # Produce the embedding vector for +text+. The base implementation
        # only honours the cancellation token and then raises.
        #
        # @param text [String] the text to embed
        # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional;
        #   +raise_if_cancelled!+ is invoked on it before anything else
        # @return [Array<Float>] the embedding vector
        # @raise [NotImplementedError] when not overridden by a subclass
        # @api public
        def embed(text, cancellation_token = nil)
          cancellation_token&.raise_if_cancelled!
          raise NotImplementedError, format("%s#embed is not implemented", self.class)
        end

        # Schedule an {#embed} call on the runtime's blocking-IO pool and
        # return whatever the pool's +submit+ returns.
        #
        # @param text [String]
        # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
        # @param timeout [Numeric, nil] seconds before the operation is abandoned
        # @return [BlockingAdapterPool::PendingOperation]
        # @api public
        def embed_async(text, cancellation_token = nil, timeout: nil)
          pool = Phronomy::Runtime.instance.blocking_io
          pool.submit(timeout: timeout, cancellation_token: cancellation_token) { embed(text, cancellation_token) }
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
module VectorStore
|
|
5
|
+
module Embeddings
|
|
6
|
+
# Embeddings adapter backed by RubyLLM.
|
|
7
|
+
#
|
|
8
|
+
# Delegates to +RubyLLM.embed+ and returns the resulting vector as an
|
|
9
|
+
# +Array<Float>+.
|
|
10
|
+
#
|
|
11
|
+
# @example Default model
|
|
12
|
+
# embeddings = Phronomy::VectorStore::Embeddings::RubyLLMEmbeddings.new
|
|
13
|
+
# vector = embeddings.embed("Hello, world!")
|
|
14
|
+
#
|
|
15
|
+
# @example Explicit model
|
|
16
|
+
# embeddings = Phronomy::VectorStore::Embeddings::RubyLLMEmbeddings.new(model: "text-embedding-3-small")
|
|
17
|
+
# vector = embeddings.embed("Hello, world!")
|
|
18
|
+
class RubyLLMEmbeddings < Base
|
|
19
|
+
# @param model [String, nil] embedding model identifier; nil uses the RubyLLM default
|
|
20
|
+
# @param provider [Symbol, nil] provider override (e.g. :openai); nil uses the RubyLLM default
|
|
21
|
+
# @param assume_model_exists [Boolean] when true, skips RubyLLM model-registry validation
|
|
22
|
+
# (useful for locally hosted models not in the registry)
|
|
23
|
+
# @api public
|
|
24
|
+
def initialize(model: nil, provider: nil, assume_model_exists: false)
  # Only remember the settings here; they are turned into RubyLLM options in #embed.
  @model, @provider, @assume_model_exists = model, provider, assume_model_exists
end
|
|
29
|
+
|
|
30
|
+
# Embed text via RubyLLM.
|
|
31
|
+
#
|
|
32
|
+
# @param text [String]
|
|
33
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
34
|
+
# @return [Array<Float>]
|
|
35
|
+
# @api public
|
|
36
|
+
def embed(text, cancellation_token = nil)
  cancellation_token&.raise_if_cancelled!

  # Forward only the settings that were actually configured (truthy), so that
  # RubyLLM falls back to its own defaults for everything else.
  settings = {
    model: @model,
    provider: @provider,
    assume_model_exists: (@assume_model_exists ? true : nil)
  }
  RubyLLM.embed(text, **settings.select { |_name, value| value }).vectors
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
end
|
|
@@ -25,7 +25,7 @@ module Phronomy
|
|
|
25
25
|
# @param id [String]
|
|
26
26
|
# @param embedding [Array<Float>]
|
|
27
27
|
# @param metadata [Hash]
|
|
28
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
28
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
29
29
|
# @api public
|
|
30
30
|
def add(id:, embedding:, metadata: {}, cancellation_token: nil)
|
|
31
31
|
cancellation_token&.raise_if_cancelled!
|
|
@@ -38,9 +38,14 @@ module Phronomy
|
|
|
38
38
|
|
|
39
39
|
# @param query_embedding [Array<Float>]
|
|
40
40
|
# @param k [Integer]
|
|
41
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
41
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
42
42
|
# @return [Array<Hash>] sorted by descending score
|
|
43
43
|
# @api public
|
|
44
|
+
# mutant:disable - genuine equivalent mutations: doc.fetch(:embedding) vs doc[:embedding] (key
|
|
45
|
+
# always present); {id:, score:, metadata: doc.fetch(:metadata)} shorthand+fetch vs []
|
|
46
|
+
# (key always present); -r.fetch(:score) vs -r[:score] (key always present); snapshot = @documents
|
|
47
|
+
# vs .dup is equivalent in single-threaded tests (GVL makes Hash#dup atomic, no behaviour
|
|
48
|
+
# difference under test isolation)
|
|
44
49
|
def search(query_embedding:, k: 5, cancellation_token: nil)
|
|
45
50
|
cancellation_token&.raise_if_cancelled!
|
|
46
51
|
k = validate_k!(k)
|
|
@@ -77,6 +82,11 @@ module Phronomy
|
|
|
77
82
|
|
|
78
83
|
private
|
|
79
84
|
|
|
85
|
+
# mutant:disable - empty-vector early-return condition variants (if false, if nil, if a.empty?,
|
|
86
|
+
# if b.empty?, if a.empty? && b.empty?, if a.empty? || false, if false || b.empty?,
|
|
87
|
+
# if nil || b.empty?, if nil && b.empty?) are genuine equivalents: dimension validation in
|
|
88
|
+
# #add and #search enforces same-size embeddings, so a.empty? iff b.empty?; when both are
|
|
89
|
+
# empty norm_a = sqrt(0) = 0 so the norm_a.zero? guard returns 0.0 anyway
|
|
80
90
|
def cosine_similarity(a, b)
|
|
81
91
|
return 0.0 if a.empty? || b.empty?
|
|
82
92
|
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
  module VectorStore
    module Loader
      # Common ancestor of all document loaders.
      #
      # A loader reads some external source (file path, URL, ...) and yields
      # the document hashes the rest of the pipeline works with:
      #
      #   [{ text: String, metadata: Hash }, ...]
      #
      # Concrete loaders provide {#load}.
      class Base
        # Read +source+ and return its documents. The base class has no
        # implementation and always raises.
        #
        # @param source [String] file path, URL, or other source identifier
        # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
        # @raise [NotImplementedError] when not overridden by a subclass
        # @api public
        def load(source)
          message = "#{self.class}#load is not implemented"
          raise NotImplementedError, message
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "csv"
|
|
4
|
+
|
|
5
|
+
module Phronomy
|
|
6
|
+
module VectorStore
|
|
7
|
+
module Loader
|
|
8
|
+
# Loads a CSV file, converting each row into a separate document.
|
|
9
|
+
#
|
|
10
|
+
# By default the first row is treated as a header and column names are
|
|
11
|
+
# available in the document metadata. The full row is serialised to
|
|
12
|
+
# a human-readable "key: value" string for embedding.
|
|
13
|
+
#
|
|
14
|
+
# @example
|
|
15
|
+
# loader = Phronomy::VectorStore::Loader::CsvLoader.new
|
|
16
|
+
# docs = loader.load("products.csv")
|
|
17
|
+
# # => [
|
|
18
|
+
# # { text: "name: Widget\nprice: 9.99", metadata: { source: "...", row: 1, name: "Widget", price: "9.99" } },
|
|
19
|
+
# # ...
|
|
20
|
+
# # ]
|
|
21
|
+
class CsvLoader < Base
|
|
22
|
+
# @param headers [Boolean] treat the first row as headers (default: true)
|
|
23
|
+
# @param text_column [String, nil] if set, use only this column as the document text
|
|
24
|
+
# @api public
|
|
25
|
+
def initialize(headers: true, text_column: nil)
  # Stored verbatim; both settings are interpreted when #load runs.
  @headers, @text_column = headers, text_column
end
|
|
29
|
+
|
|
30
|
+
# @param source [String] path to a CSV file
|
|
31
|
+
# @return [Array<Hash>]
|
|
32
|
+
# @raise [Errno::ENOENT] if the file does not exist
|
|
33
|
+
# @api public
|
|
34
|
+
def load(source)
  # Reads the CSV at +source+ and returns one document hash per data row:
  # { text: String, metadata: Hash }. Metadata always carries :source and a
  # 1-based :row; with headers enabled it also carries every column under a
  # Symbol key (:source and :row win over columns of the same name).
  rows = CSV.read(source, headers: @headers, encoding: "UTF-8")

  rows.each_with_index.map do |row, idx|
    location = {source: source, row: idx + 1}

    if @headers
      fields = row.to_h
      text =
        if @text_column
          # CSV headers are Strings, so accept a Symbol text_column as well
          # instead of silently producing an empty text.
          fields[@text_column.to_s].to_s
        else
          fields.map { |name, value| "#{name}: #{value}" }.join("\n")
        end
      # A blank header cell is parsed as nil; go through to_s so that such a
      # column does not raise NoMethodError on nil.to_sym.
      metadata = fields.transform_keys { |name| name.to_s.to_sym }.merge(location)
      {text: text, metadata: metadata}
    else
      {text: row.join(", "), metadata: location}
    end
  end
end
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
|
58
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
  module VectorStore
    module Loader
      # Loads a Markdown file, optionally splitting it on headings.
      #
      # When +split_on_headings:+ is true (the default), every ATX heading
      # (H1–H6) starts a new document so that embeddings capture section
      # semantics rather than the full file at once. Text that precedes the
      # first heading becomes a document without a +:section+ entry. Lines
      # inside fenced code blocks (``` or ~~~) are never treated as headings.
      #
      # @example Single document (heading split disabled)
      #   loader = Phronomy::VectorStore::Loader::MarkdownLoader.new(split_on_headings: false)
      #   docs   = loader.load("README.md")
      #   # => [{ text: "# Title\n...", metadata: { source: "README.md" } }]
      #
      # @example Split per heading (default)
      #   loader = Phronomy::VectorStore::Loader::MarkdownLoader.new
      #   docs   = loader.load("guide.md")
      #   # => [
      #   #   { text: "# Section 1\n...", metadata: { source: "guide.md", section: "Section 1" } },
      #   #   { text: "## Sub-section\n...", metadata: { source: "guide.md", section: "Sub-section" } },
      #   # ]
      class MarkdownLoader < Base
        # ATX heading: one to six "#" characters, whitespace, then the title.
        HEADING_RE = /^(\#{1,6})\s+(.+)$/

        # Opening fence of a fenced code block: up to three spaces of indent,
        # then a run of at least three backticks or tildes.
        FENCE_OPEN_RE = /\A {0,3}(`{3,}|~{3,})/
        # Closing fence: same shape, but nothing except blanks may follow.
        FENCE_CLOSE_RE = /\A {0,3}(`{3,}|~{3,})[ \t]*\z/
        private_constant :FENCE_OPEN_RE, :FENCE_CLOSE_RE

        # @param split_on_headings [Boolean] split on H1–H6 boundaries (default: true)
        # @api public
        def initialize(split_on_headings: true)
          @split_on_headings = split_on_headings
        end

        # @param source [String] path to a Markdown file
        # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
        # @raise [Errno::ENOENT] if the file does not exist
        # @api public
        def load(source)
          content = File.read(source, encoding: "UTF-8")
          return [{text: content, metadata: {source: source}}] unless @split_on_headings

          split_by_headings(content, source)
        end

        private

        # Walks the file line by line and cuts a new section at every heading
        # that is not inside a fenced code block.
        def split_by_headings(content, source)
          sections = []
          current_lines = []
          current_heading = nil
          open_fence = nil # fence marker of the code block we are inside, if any

          content.each_line do |line|
            bare = line.chomp

            if open_fence
              # Inside a code block: "# foo" is code (e.g. a shell comment), not a heading.
              current_lines << line
              open_fence = nil if closes_fence?(bare, open_fence)
            elsif (fence = FENCE_OPEN_RE.match(bare))
              open_fence = fence[1]
              current_lines << line
            elsif (m = HEADING_RE.match(bare))
              flush_section(sections, current_lines, current_heading, source) if current_lines.any?
              current_heading = m[2].strip
              current_lines = [line]
            else
              current_lines << line
            end
          end

          flush_section(sections, current_lines, current_heading, source) if current_lines.any?

          # Fall back to a single document when nothing was emitted
          # (flush_section skips whitespace-only text).
          sections.empty? ? [{text: content, metadata: {source: source}}] : sections
        end

        # True when +line+ closes the code block opened by +opener+: it must use
        # the same fence character and be at least as long as the opening fence.
        def closes_fence?(line, opener)
          m = FENCE_CLOSE_RE.match(line)
          return false unless m

          m[1][0] == opener[0] && m[1].length >= opener.length
        end

        # Appends the collected lines as one document unless they are blank.
        # +heading+ is recorded under +:section+ when present.
        def flush_section(sections, lines, heading, source)
          text = lines.join
          return if text.strip.empty?

          metadata = {source: source}
          metadata[:section] = heading if heading
          sections << {text: text, metadata: metadata}
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
module VectorStore
|
|
5
|
+
module Loader
|
|
6
|
+
# Loads a plain-text file as a single document.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# loader = Phronomy::VectorStore::Loader::PlainTextLoader.new
|
|
10
|
+
# docs = loader.load("/path/to/file.txt")
|
|
11
|
+
# # => [{ text: "...", metadata: { source: "/path/to/file.txt" } }]
|
|
12
|
+
class PlainTextLoader < Base
|
|
13
|
+
# @param source [String] absolute or relative path to a text file
|
|
14
|
+
# @return [Array<Hash>] single-element array with the file contents
|
|
15
|
+
# @raise [Errno::ENOENT] if the file does not exist
|
|
16
|
+
# @api public
|
|
17
|
+
def load(source)
  # The entire file, read as UTF-8, becomes the text of the one and only document.
  [{text: File.read(source, encoding: "UTF-8"), metadata: {source: source}}]
end
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -38,7 +38,7 @@ module Phronomy
|
|
|
38
38
|
# @param id [String]
|
|
39
39
|
# @param embedding [Array<Float>]
|
|
40
40
|
# @param metadata [Hash]
|
|
41
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
41
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
42
42
|
# @api public
|
|
43
43
|
def add(id:, embedding:, metadata: {}, cancellation_token: nil)
|
|
44
44
|
cancellation_token&.raise_if_cancelled!
|
|
@@ -52,7 +52,7 @@ module Phronomy
|
|
|
52
52
|
|
|
53
53
|
# @param query_embedding [Array<Float>]
|
|
54
54
|
# @param k [Integer]
|
|
55
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
55
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
56
56
|
# @return [Array<Hash>] sorted by descending similarity score
|
|
57
57
|
# @api public
|
|
58
58
|
def search(query_embedding:, k: 5, cancellation_token: nil)
|
|
@@ -49,7 +49,7 @@ module Phronomy
|
|
|
49
49
|
# @param id [String]
|
|
50
50
|
# @param embedding [Array<Float>]
|
|
51
51
|
# @param metadata [Hash]
|
|
52
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
52
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
53
53
|
# @api public
|
|
54
54
|
def add(id:, embedding:, metadata: {}, cancellation_token: nil)
|
|
55
55
|
cancellation_token&.raise_if_cancelled!
|
|
@@ -68,7 +68,7 @@ module Phronomy
|
|
|
68
68
|
|
|
69
69
|
# @param query_embedding [Array<Float>]
|
|
70
70
|
# @param k [Integer]
|
|
71
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil]
|
|
71
|
+
# @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
|
|
72
72
|
# @return [Array<Hash>] sorted by descending similarity score
|
|
73
73
|
# @api public
|
|
74
74
|
def search(query_embedding:, k: 5, cancellation_token: nil)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
  module VectorStore
    module Splitter
      # Abstract base class for text splitters.
      #
      # A splitter takes a single document hash (or plain text) and returns an
      # array of smaller chunk documents:
      #
      #   [{ text: String, metadata: Hash }, ...]
      #
      # Subclasses must implement {#split}; {#split_all} and the private
      # +normalise+ helper are provided here.
      class Base
        # Split +document+ into an array of chunk documents.
        #
        # @param document [Hash, String]
        #   Either a document hash (<tt>{ text: String, metadata: Hash }</tt>)
        #   returned by a Loader, or a plain String.
        # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
        # @raise [NotImplementedError] when not overridden by a subclass
        # @api public
        def split(document)
          raise NotImplementedError, "#{self.class}#split is not implemented"
        end

        # Convenience method: split every document and concatenate the chunks
        # in input order.
        #
        # @param documents [Array<Hash, String>]
        # @return [Array<Hash>]
        # @api public
        def split_all(documents)
          documents.flat_map { |doc| split(doc) }
        end

        private

        # Normalise a document-or-string argument into {text:, metadata:}.
        #
        # @param document [Hash, String]
        # @return [Hash] <tt>{ text: String, metadata: Hash }</tt>
        # @raise [ArgumentError] when +document+ is neither a Hash nor a String
        def normalise(document)
          case document
          # `|| {}` rather than `fetch(:metadata, {})`: an explicit
          # `metadata: nil` must also become an empty Hash, otherwise
          # subclasses crash when they call `metadata.merge(...)`.
          when Hash then {text: document[:text].to_s, metadata: document[:metadata] || {}}
          when String then {text: document, metadata: {}}
          else raise ArgumentError, "document must be a Hash or String, got #{document.class}"
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Phronomy
|
|
4
|
+
module VectorStore
|
|
5
|
+
module Splitter
|
|
6
|
+
# Splits text into fixed-size character chunks with optional overlap.
|
|
7
|
+
#
|
|
8
|
+
# @example
|
|
9
|
+
# splitter = Phronomy::VectorStore::Splitter::FixedSizeSplitter.new(chunk_size: 200, chunk_overlap: 20)
|
|
10
|
+
# chunks = splitter.split({ text: long_text, metadata: { source: "doc.txt" } })
|
|
11
|
+
# # => [
|
|
12
|
+
# # { text: "...(200 chars)...", metadata: { source: "doc.txt", chunk: 0 } },
|
|
13
|
+
# # { text: "...(200 chars, 20-char overlap)...", metadata: { source: "doc.txt", chunk: 1 } },
|
|
14
|
+
# # ]
|
|
15
|
+
class FixedSizeSplitter < Base
|
|
16
|
+
# @param chunk_size [Integer] maximum characters per chunk (default: 1000)
|
|
17
|
+
# @param chunk_overlap [Integer] characters to repeat at the start of each
|
|
18
|
+
# subsequent chunk (default: 200); must be less than chunk_size
|
|
19
|
+
# @api public
|
|
20
|
+
def initialize(chunk_size: 1000, chunk_overlap: 200)
  raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
  # A negative overlap would make the stride (chunk_size - chunk_overlap)
  # larger than chunk_size, so #split would silently skip text between chunks.
  # Together with the check above this also guarantees chunk_size > 0.
  raise ArgumentError, "chunk_overlap must not be negative" if chunk_overlap.negative?

  @chunk_size = chunk_size
  @chunk_overlap = chunk_overlap
end
|
|
26
|
+
|
|
27
|
+
# @param document [Hash, String]
|
|
28
|
+
# @return [Array<Hash>]
|
|
29
|
+
# @api public
|
|
30
|
+
def split(document)
  normalised = normalise(document)
  body = normalised[:text]
  shared_metadata = normalised[:metadata]
  # Distance between the start offsets of two consecutive chunks.
  stride = @chunk_size - @chunk_overlap

  pieces = []
  offset = 0

  until offset >= body.length
    # The chunk number equals the count of chunks emitted so far.
    pieces << {text: body[offset, @chunk_size], metadata: shared_metadata.merge(chunk: pieces.size)}
    # Stop once this chunk reaches the end of the text, so that no trailing
    # chunk made up purely of overlap is produced.
    break if offset + @chunk_size >= body.length

    offset += stride
  end

  pieces
end
|
|
50
|
+
end
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|