phronomy 0.7.1 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +35 -45
- data/benchmark/baseline.json +1 -1
- data/benchmark/bench_agent_invoke.rb +1 -1
- data/benchmark/bench_context_assembler.rb +11 -3
- data/benchmark/bench_regression.rb +11 -11
- data/benchmark/bench_token_estimator.rb +5 -5
- data/benchmark/bench_tool_schema.rb +2 -2
- data/docs/decisions/011-build-context-as-single-llm-input-authority.md +224 -0
- data/lib/phronomy/agent/base.rb +268 -403
- data/lib/phronomy/agent/checkpoint.rb +118 -0
- data/lib/phronomy/agent/concerns/suspendable.rb +6 -6
- data/lib/phronomy/agent/context/capability/base.rb +689 -0
- data/lib/phronomy/agent/context/capability/scope_policy.rb +54 -0
- data/lib/phronomy/agent/context/instruction/prompt_template.rb +102 -0
- data/lib/phronomy/agent/context/knowledge/base.rb +58 -0
- data/lib/phronomy/agent/context/knowledge/entity_knowledge.rb +102 -0
- data/lib/phronomy/agent/context/knowledge/static_knowledge.rb +58 -0
- data/lib/phronomy/agent/fsm.rb +1 -1
- data/lib/phronomy/agent/invocation_pipeline.rb +108 -0
- data/lib/phronomy/agent/lifecycle/fsm_session.rb +251 -0
- data/lib/phronomy/agent/lifecycle/phase_machine_builder.rb +249 -0
- data/lib/phronomy/agent/react_agent.rb +43 -37
- data/lib/phronomy/agent/runner.rb +2 -2
- data/lib/phronomy/agent/shared_state.rb +2 -2
- data/lib/phronomy/agent/tool_executor.rb +108 -0
- data/lib/phronomy/concurrency/async_queue.rb +157 -0
- data/lib/phronomy/concurrency/blocking_adapter_pool.rb +443 -0
- data/lib/phronomy/concurrency/cancellation_scope.rb +125 -0
- data/lib/phronomy/concurrency/cancellation_token.rb +140 -0
- data/lib/phronomy/concurrency/concurrency_gate.rb +157 -0
- data/lib/phronomy/concurrency/deadline.rb +65 -0
- data/lib/phronomy/{runtime → concurrency}/gate_registry.rb +1 -2
- data/lib/phronomy/{runtime → concurrency}/pool_registry.rb +1 -1
- data/lib/phronomy/configuration.rb +0 -6
- data/lib/phronomy/context.rb +2 -8
- data/lib/phronomy/eval/runner.rb +4 -0
- data/lib/phronomy/eval/scorer/llm_judge.rb +12 -1
- data/lib/phronomy/event_loop.rb +7 -7
- data/lib/phronomy/invocation_context.rb +3 -3
- data/lib/phronomy/knowledge_source.rb +0 -5
- data/lib/phronomy/llm_adapter/ruby_llm.rb +17 -11
- data/lib/phronomy/llm_context_window/assembler.rb +191 -0
- data/lib/phronomy/{context → llm_context_window}/context_version_cache.rb +1 -1
- data/lib/phronomy/{context → llm_context_window}/token_budget.rb +7 -4
- data/lib/phronomy/{context → llm_context_window}/token_estimator.rb +3 -3
- data/lib/phronomy/{agent → multi_agent}/handoff.rb +6 -6
- data/lib/phronomy/{agent → multi_agent}/orchestrator.rb +7 -7
- data/lib/phronomy/{agent → multi_agent}/parallel_tool_chat.rb +4 -4
- data/lib/phronomy/{agent → multi_agent}/team_coordinator.rb +4 -4
- data/lib/phronomy/runtime/runtime_metrics.rb +0 -1
- data/lib/phronomy/runtime.rb +20 -6
- data/lib/phronomy/task_group.rb +1 -1
- data/lib/phronomy/tool.rb +3 -4
- data/lib/phronomy/{tool/agent_tool.rb → tools/agent.rb} +6 -6
- data/lib/phronomy/{tool/mcp_tool.rb → tools/mcp.rb} +9 -9
- data/lib/phronomy/tools/vector_search.rb +70 -0
- data/lib/phronomy/tracing/null_tracer.rb +3 -1
- data/lib/phronomy/vector_store/async_backend.rb +4 -4
- data/lib/phronomy/vector_store/base.rb +2 -2
- data/lib/phronomy/vector_store/embeddings/base.rb +41 -0
- data/lib/phronomy/vector_store/embeddings/ruby_llm_embeddings.rb +47 -0
- data/lib/phronomy/vector_store/in_memory.rb +12 -2
- data/lib/phronomy/vector_store/loader/base.rb +27 -0
- data/lib/phronomy/vector_store/loader/csv_loader.rb +58 -0
- data/lib/phronomy/vector_store/loader/markdown_loader.rb +78 -0
- data/lib/phronomy/vector_store/loader/plain_text_loader.rb +24 -0
- data/lib/phronomy/vector_store/pgvector.rb +2 -2
- data/lib/phronomy/vector_store/redis_search.rb +2 -2
- data/lib/phronomy/vector_store/splitter/base.rb +49 -0
- data/lib/phronomy/vector_store/splitter/fixed_size_splitter.rb +53 -0
- data/lib/phronomy/vector_store/splitter/recursive_splitter.rb +107 -0
- data/lib/phronomy/vector_store.rb +14 -2
- data/lib/phronomy/version.rb +1 -1
- data/lib/phronomy/workflow_context.rb +8 -0
- data/lib/phronomy/workflow_runner.rb +11 -131
- data/lib/phronomy.rb +2 -0
- data/scripts/api_snapshot.rb +11 -9
- metadata +44 -46
- data/lib/phronomy/async_queue.rb +0 -155
- data/lib/phronomy/blocking_adapter_pool.rb +0 -435
- data/lib/phronomy/cancellation_scope.rb +0 -123
- data/lib/phronomy/cancellation_token.rb +0 -133
- data/lib/phronomy/concurrency_gate.rb +0 -155
- data/lib/phronomy/context/assembler.rb +0 -143
- data/lib/phronomy/context/compaction_context.rb +0 -111
- data/lib/phronomy/context/trigger_context.rb +0 -39
- data/lib/phronomy/context/trim_context.rb +0 -75
- data/lib/phronomy/deadline.rb +0 -63
- data/lib/phronomy/embeddings/base.rb +0 -39
- data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +0 -45
- data/lib/phronomy/embeddings.rb +0 -11
- data/lib/phronomy/fsm_session.rb +0 -247
- data/lib/phronomy/knowledge_source/base.rb +0 -54
- data/lib/phronomy/knowledge_source/entity_knowledge.rb +0 -96
- data/lib/phronomy/knowledge_source/rag_knowledge.rb +0 -57
- data/lib/phronomy/knowledge_source/static_knowledge.rb +0 -52
- data/lib/phronomy/loader/base.rb +0 -25
- data/lib/phronomy/loader/csv_loader.rb +0 -56
- data/lib/phronomy/loader/markdown_loader.rb +0 -76
- data/lib/phronomy/loader/plain_text_loader.rb +0 -22
- data/lib/phronomy/loader.rb +0 -13
- data/lib/phronomy/prompt_template.rb +0 -96
- data/lib/phronomy/splitter/base.rb +0 -47
- data/lib/phronomy/splitter/fixed_size_splitter.rb +0 -51
- data/lib/phronomy/splitter/recursive_splitter.rb +0 -105
- data/lib/phronomy/splitter.rb +0 -12
- data/lib/phronomy/tool/base.rb +0 -644
- data/lib/phronomy/tool/scope_policy.rb +0 -50
- data/lib/phronomy/tool_executor.rb +0 -106
|
@@ -1,57 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
module KnowledgeSource
|
|
5
|
-
# A KnowledgeSource that retrieves semantically relevant chunks from a VectorStore.
|
|
6
|
-
#
|
|
7
|
-
# On each #fetch call, the query is embedded and the k nearest documents are
|
|
8
|
-
# returned as knowledge chunks.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# store = Phronomy::VectorStore::InMemory.new
|
|
12
|
-
# embeddings = Phronomy::Embeddings::RubyLLMEmbeddings.new(model: "text-embedding-3-small")
|
|
13
|
-
# ks = Phronomy::KnowledgeSource::RAGKnowledge.new(
|
|
14
|
-
# store: store,
|
|
15
|
-
# embeddings: embeddings,
|
|
16
|
-
# k: 5
|
|
17
|
-
# )
|
|
18
|
-
class RAGKnowledge < Base
|
|
19
|
-
# @param store [Phronomy::VectorStore::Base] vector store holding documents
|
|
20
|
-
# @param embeddings [Phronomy::Embeddings::Base] embeddings adapter
|
|
21
|
-
# @param k [Integer] number of chunks to retrieve
|
|
22
|
-
# @param type [Symbol] semantic tag (default :rag)
|
|
23
|
-
# @param source [String, nil] default source label; falls back to
|
|
24
|
-
# each document's :source metadata when nil
|
|
25
|
-
# @api public
|
|
26
|
-
def initialize(store:, embeddings:, k: 5, type: :rag, source: nil)
|
|
27
|
-
@store = store
|
|
28
|
-
@embeddings = embeddings
|
|
29
|
-
@k = k
|
|
30
|
-
@type = type
|
|
31
|
-
@source = source
|
|
32
|
-
end
|
|
33
|
-
|
|
34
|
-
# Embed the query and retrieve the k nearest chunks from the vector store.
|
|
35
|
-
#
|
|
36
|
-
# Returns an empty array when query is nil or blank.
|
|
37
|
-
#
|
|
38
|
-
# @param query [String, nil]
|
|
39
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
40
|
-
# @return [Array<Hash>]
|
|
41
|
-
# @api public
|
|
42
|
-
def fetch(query: nil, cancellation_token: nil)
|
|
43
|
-
cancellation_token&.raise_if_cancelled!
|
|
44
|
-
return [] if query.nil? || query.strip.empty?
|
|
45
|
-
|
|
46
|
-
vector = @embeddings.embed(query, cancellation_token)
|
|
47
|
-
results = @store.search(query_embedding: vector, k: @k, cancellation_token: cancellation_token)
|
|
48
|
-
results.map do |doc|
|
|
49
|
-
chunk = {content: doc[:metadata][:content], type: @type}
|
|
50
|
-
src = @source || doc[:metadata][:source]
|
|
51
|
-
chunk[:source] = src if src
|
|
52
|
-
chunk
|
|
53
|
-
end
|
|
54
|
-
end
|
|
55
|
-
end
|
|
56
|
-
end
|
|
57
|
-
end
|
|
@@ -1,52 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
module KnowledgeSource
|
|
5
|
-
# A KnowledgeSource backed by fixed text provided at construction time.
|
|
6
|
-
#
|
|
7
|
-
# Useful for injecting static documents, policy files, or configuration
|
|
8
|
-
# knowledge that does not change per request.
|
|
9
|
-
#
|
|
10
|
-
# @example
|
|
11
|
-
# ks = Phronomy::KnowledgeSource::StaticKnowledge.new(
|
|
12
|
-
# "Our refund policy: ...",
|
|
13
|
-
# type: :policy
|
|
14
|
-
# )
|
|
15
|
-
# agent.invoke("What is the refund policy?", config: { knowledge_sources: [ks] })
|
|
16
|
-
class StaticKnowledge < Base
|
|
17
|
-
# @param text [String] the static knowledge text to inject
|
|
18
|
-
# @param type [Symbol] semantic tag for the chunk (default :static)
|
|
19
|
-
# @param source [String, nil] label identifying where this knowledge came from
|
|
20
|
-
# (e.g. a filename). Included in the context XML tag and exposed to the LLM
|
|
21
|
-
# so that agents can produce grounded citations.
|
|
22
|
-
# @api public
|
|
23
|
-
def initialize(text, type: :static, source: nil)
|
|
24
|
-
@text = text.to_s
|
|
25
|
-
@type = type
|
|
26
|
-
@source = source
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
# Returns the fixed text as a single chunk, regardless of query.
|
|
30
|
-
#
|
|
31
|
-
# @param query [String, nil] ignored for static knowledge
|
|
32
|
-
# @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
|
|
33
|
-
# @return [Array<Hash>]
|
|
34
|
-
# @api public
|
|
35
|
-
def fetch(query: nil, cancellation_token: nil)
|
|
36
|
-
cancellation_token&.raise_if_cancelled!
|
|
37
|
-
return [] if @text.empty?
|
|
38
|
-
|
|
39
|
-
chunk = {content: @text, type: @type}
|
|
40
|
-
chunk[:source] = @source if @source
|
|
41
|
-
[chunk]
|
|
42
|
-
end
|
|
43
|
-
|
|
44
|
-
# Static knowledge content never changes between invocations.
|
|
45
|
-
# @return [true]
|
|
46
|
-
# @api public
|
|
47
|
-
def static?
|
|
48
|
-
true
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
end
|
|
52
|
-
end
|
data/lib/phronomy/loader/base.rb
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
module Loader
|
|
5
|
-
# Abstract base class for document loaders.
|
|
6
|
-
#
|
|
7
|
-
# A loader converts an external source (file path, URL, etc.) into an
|
|
8
|
-
# Array of document hashes understood by the rest of the pipeline:
|
|
9
|
-
#
|
|
10
|
-
# [{ text: String, metadata: Hash }, ...]
|
|
11
|
-
#
|
|
12
|
-
# Subclasses must implement {#load}.
|
|
13
|
-
class Base
|
|
14
|
-
# Load documents from +source+ and return an array of document hashes.
|
|
15
|
-
#
|
|
16
|
-
# @param source [String] file path, URL, or other source identifier
|
|
17
|
-
# @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
|
|
18
|
-
# @raise [NotImplementedError] when not overridden by a subclass
|
|
19
|
-
# @api public
|
|
20
|
-
def load(source)
|
|
21
|
-
raise NotImplementedError, "#{self.class}#load is not implemented"
|
|
22
|
-
end
|
|
23
|
-
end
|
|
24
|
-
end
|
|
25
|
-
end
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require "csv"
|
|
4
|
-
|
|
5
|
-
module Phronomy
|
|
6
|
-
module Loader
|
|
7
|
-
# Loads a CSV file, converting each row into a separate document.
|
|
8
|
-
#
|
|
9
|
-
# By default the first row is treated as a header and column names are
|
|
10
|
-
# available in the document metadata. The full row is serialised to
|
|
11
|
-
# a human-readable "key: value" string for embedding.
|
|
12
|
-
#
|
|
13
|
-
# @example
|
|
14
|
-
# loader = Phronomy::Loader::CsvLoader.new
|
|
15
|
-
# docs = loader.load("products.csv")
|
|
16
|
-
# # => [
|
|
17
|
-
# # { text: "name: Widget\nprice: 9.99", metadata: { source: "...", row: 1, name: "Widget", price: "9.99" } },
|
|
18
|
-
# # ...
|
|
19
|
-
# # ]
|
|
20
|
-
class CsvLoader < Base
|
|
21
|
-
# @param headers [Boolean] treat the first row as headers (default: true)
|
|
22
|
-
# @param text_column [String, nil] if set, use only this column as the document text
|
|
23
|
-
# @api public
|
|
24
|
-
def initialize(headers: true, text_column: nil)
|
|
25
|
-
@headers = headers
|
|
26
|
-
@text_column = text_column
|
|
27
|
-
end
|
|
28
|
-
|
|
29
|
-
# @param source [String] path to a CSV file
|
|
30
|
-
# @return [Array<Hash>]
|
|
31
|
-
# @raise [Errno::ENOENT] if the file does not exist
|
|
32
|
-
# @api public
|
|
33
|
-
def load(source)
|
|
34
|
-
rows = CSV.read(source, headers: @headers, encoding: "UTF-8")
|
|
35
|
-
|
|
36
|
-
if @headers
|
|
37
|
-
rows.each_with_index.map do |row, idx|
|
|
38
|
-
row_hash = row.to_h
|
|
39
|
-
text = if @text_column
|
|
40
|
-
row_hash[@text_column].to_s
|
|
41
|
-
else
|
|
42
|
-
row_hash.map { |k, v| "#{k}: #{v}" }.join("\n")
|
|
43
|
-
end
|
|
44
|
-
metadata = row_hash.transform_keys(&:to_sym).merge(source: source, row: idx + 1)
|
|
45
|
-
{text: text, metadata: metadata}
|
|
46
|
-
end
|
|
47
|
-
else
|
|
48
|
-
rows.each_with_index.map do |row, idx|
|
|
49
|
-
text = row.join(", ")
|
|
50
|
-
{text: text, metadata: {source: source, row: idx + 1}}
|
|
51
|
-
end
|
|
52
|
-
end
|
|
53
|
-
end
|
|
54
|
-
end
|
|
55
|
-
end
|
|
56
|
-
end
|
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
module Loader
|
|
5
|
-
# Loads a Markdown file, optionally splitting on top-level headings.
|
|
6
|
-
#
|
|
7
|
-
# When +split_on_headings:+ is true (the default), each H1/H2 section
|
|
8
|
-
# becomes a separate document so that embeddings capture section semantics
|
|
9
|
-
# rather than the full file at once.
|
|
10
|
-
#
|
|
11
|
-
# @example Single document (heading split disabled)
|
|
12
|
-
# loader = Phronomy::Loader::MarkdownLoader.new(split_on_headings: false)
|
|
13
|
-
# docs = loader.load("README.md")
|
|
14
|
-
# # => [{ text: "# Title\n...", metadata: { source: "README.md" } }]
|
|
15
|
-
#
|
|
16
|
-
# @example Split per heading (default)
|
|
17
|
-
# loader = Phronomy::Loader::MarkdownLoader.new
|
|
18
|
-
# docs = loader.load("guide.md")
|
|
19
|
-
# # => [
|
|
20
|
-
# # { text: "# Section 1\n...", metadata: { source: "guide.md", section: "Section 1" } },
|
|
21
|
-
# # { text: "## Sub-section\n...", metadata: { source: "guide.md", section: "Sub-section" } },
|
|
22
|
-
# # ]
|
|
23
|
-
class MarkdownLoader < Base
|
|
24
|
-
HEADING_RE = /^(\#{1,6})\s+(.+)$/
|
|
25
|
-
|
|
26
|
-
# @param split_on_headings [Boolean] split on H1–H6 boundaries (default: true)
|
|
27
|
-
# @api public
|
|
28
|
-
def initialize(split_on_headings: true)
|
|
29
|
-
@split_on_headings = split_on_headings
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
# @param source [String] path to a Markdown file
|
|
33
|
-
# @return [Array<Hash>]
|
|
34
|
-
# @raise [Errno::ENOENT] if the file does not exist
|
|
35
|
-
# @api public
|
|
36
|
-
def load(source)
|
|
37
|
-
content = File.read(source, encoding: "UTF-8")
|
|
38
|
-
return [{text: content, metadata: {source: source}}] unless @split_on_headings
|
|
39
|
-
|
|
40
|
-
split_by_headings(content, source)
|
|
41
|
-
end
|
|
42
|
-
|
|
43
|
-
private
|
|
44
|
-
|
|
45
|
-
def split_by_headings(content, source)
|
|
46
|
-
sections = []
|
|
47
|
-
current_lines = []
|
|
48
|
-
current_heading = nil
|
|
49
|
-
|
|
50
|
-
content.each_line do |line|
|
|
51
|
-
if (m = HEADING_RE.match(line.chomp))
|
|
52
|
-
flush_section(sections, current_lines, current_heading, source) if current_lines.any?
|
|
53
|
-
current_heading = m[2].strip
|
|
54
|
-
current_lines = [line]
|
|
55
|
-
else
|
|
56
|
-
current_lines << line
|
|
57
|
-
end
|
|
58
|
-
end
|
|
59
|
-
|
|
60
|
-
flush_section(sections, current_lines, current_heading, source) if current_lines.any?
|
|
61
|
-
|
|
62
|
-
# Fall back to single document if no headings were found
|
|
63
|
-
sections.empty? ? [{text: content, metadata: {source: source}}] : sections
|
|
64
|
-
end
|
|
65
|
-
|
|
66
|
-
def flush_section(sections, lines, heading, source)
|
|
67
|
-
text = lines.join
|
|
68
|
-
return if text.strip.empty?
|
|
69
|
-
|
|
70
|
-
metadata = {source: source}
|
|
71
|
-
metadata[:section] = heading if heading
|
|
72
|
-
sections << {text: text, metadata: metadata}
|
|
73
|
-
end
|
|
74
|
-
end
|
|
75
|
-
end
|
|
76
|
-
end
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
module Loader
|
|
5
|
-
# Loads a plain-text file as a single document.
|
|
6
|
-
#
|
|
7
|
-
# @example
|
|
8
|
-
# loader = Phronomy::Loader::PlainTextLoader.new
|
|
9
|
-
# docs = loader.load("/path/to/file.txt")
|
|
10
|
-
# # => [{ text: "...", metadata: { source: "/path/to/file.txt" } }]
|
|
11
|
-
class PlainTextLoader < Base
|
|
12
|
-
# @param source [String] absolute or relative path to a text file
|
|
13
|
-
# @return [Array<Hash>] single-element array with the file contents
|
|
14
|
-
# @raise [Errno::ENOENT] if the file does not exist
|
|
15
|
-
# @api public
|
|
16
|
-
def load(source)
|
|
17
|
-
text = File.read(source, encoding: "UTF-8")
|
|
18
|
-
[{text: text, metadata: {source: source}}]
|
|
19
|
-
end
|
|
20
|
-
end
|
|
21
|
-
end
|
|
22
|
-
end
|
data/lib/phronomy/loader.rb
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
# Document loader implementations for ingesting files into a RAG pipeline.
|
|
5
|
-
#
|
|
6
|
-
# Sub-classes are auto-loaded by Zeitwerk:
|
|
7
|
-
# Phronomy::Loader::Base
|
|
8
|
-
# Phronomy::Loader::PlainTextLoader
|
|
9
|
-
# Phronomy::Loader::MarkdownLoader
|
|
10
|
-
# Phronomy::Loader::CsvLoader
|
|
11
|
-
module Loader
|
|
12
|
-
end
|
|
13
|
-
end
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
# A prompt template that substitutes {{variable}} placeholders in a string.
|
|
5
|
-
#
|
|
6
|
-
# @example Simple human template
|
|
7
|
-
# t = Phronomy::PromptTemplate.new(template: "Translate to {{lang}}: {{text}}")
|
|
8
|
-
# t.format(lang: "French", text: "Hello")
|
|
9
|
-
# # => "Translate to French: Hello"
|
|
10
|
-
#
|
|
11
|
-
# @example With a system template
|
|
12
|
-
# t = Phronomy::PromptTemplate.new(
|
|
13
|
-
# template: "{{question}}",
|
|
14
|
-
# system_template: "You are a {{role}} assistant."
|
|
15
|
-
# )
|
|
16
|
-
# t.format_system(role: "helpful")
|
|
17
|
-
# # => "You are a helpful assistant."
|
|
18
|
-
#
|
|
19
|
-
# As a Runnable, #invoke accepts a Hash of variables and returns a Hash
|
|
20
|
-
# with :prompt (and optionally :system) keys.
|
|
21
|
-
class PromptTemplate
|
|
22
|
-
include Phronomy::Runnable
|
|
23
|
-
|
|
24
|
-
PLACEHOLDER = /\{\{(\w+)\}\}/
|
|
25
|
-
|
|
26
|
-
attr_reader :template, :system_template
|
|
27
|
-
|
|
28
|
-
# @param template [String] human message template with {{var}} placeholders
|
|
29
|
-
# @param system_template [String, nil] optional system message template
|
|
30
|
-
# @api public
|
|
31
|
-
def initialize(template:, system_template: nil)
|
|
32
|
-
@template = template
|
|
33
|
-
@system_template = system_template
|
|
34
|
-
end
|
|
35
|
-
|
|
36
|
-
# Substitute all {{var}} placeholders in the human template.
|
|
37
|
-
#
|
|
38
|
-
# @param variables [Hash{Symbol => String}]
|
|
39
|
-
# @return [String]
|
|
40
|
-
# @api public
|
|
41
|
-
def format(**variables)
|
|
42
|
-
substitute(@template, variables)
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
# Substitute all {{var}} placeholders in the system template.
|
|
46
|
-
# Returns nil when no system template was set.
|
|
47
|
-
#
|
|
48
|
-
# @param variables [Hash{Symbol => String}]
|
|
49
|
-
# @return [String, nil]
|
|
50
|
-
# @api public
|
|
51
|
-
def format_system(**variables)
|
|
52
|
-
@system_template && substitute(@system_template, variables)
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
# Runnable interface: accepts a Hash of variable values.
|
|
56
|
-
# Returns { prompt: String, system: String|nil }.
|
|
57
|
-
#
|
|
58
|
-
# @param input [Hash{Symbol => String}]
|
|
59
|
-
# @return [Hash]
|
|
60
|
-
# @api public
|
|
61
|
-
def invoke(input, config: {})
|
|
62
|
-
vars = normalize_input(input)
|
|
63
|
-
result = {prompt: format(**vars)}
|
|
64
|
-
sys = format_system(**vars)
|
|
65
|
-
result[:system] = sys if sys
|
|
66
|
-
result
|
|
67
|
-
end
|
|
68
|
-
|
|
69
|
-
# Returns the list of placeholder names found in both templates.
|
|
70
|
-
#
|
|
71
|
-
# @return [Array<Symbol>]
|
|
72
|
-
# @api public
|
|
73
|
-
def variables
|
|
74
|
-
names = @template.scan(PLACEHOLDER).flatten
|
|
75
|
-
names += @system_template.scan(PLACEHOLDER).flatten if @system_template
|
|
76
|
-
names.map(&:to_sym).uniq
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
private
|
|
80
|
-
|
|
81
|
-
def substitute(text, variables)
|
|
82
|
-
text.gsub(PLACEHOLDER) do |match|
|
|
83
|
-
key = Regexp.last_match(1).to_sym
|
|
84
|
-
variables.fetch(key) { raise KeyError, "Missing variable: {{#{key}}}" }
|
|
85
|
-
end
|
|
86
|
-
end
|
|
87
|
-
|
|
88
|
-
def normalize_input(input)
|
|
89
|
-
case input
|
|
90
|
-
when Hash then input
|
|
91
|
-
when String then {input: input}
|
|
92
|
-
else raise ArgumentError, "PromptTemplate#invoke expects a Hash of variables, got #{input.class}"
|
|
93
|
-
end
|
|
94
|
-
end
|
|
95
|
-
end
|
|
96
|
-
end
|
|
@@ -1,47 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
module Splitter
|
|
5
|
-
# Abstract base class for text splitters.
|
|
6
|
-
#
|
|
7
|
-
# A splitter takes a single document hash (or plain text) and returns an
|
|
8
|
-
# array of smaller chunk documents:
|
|
9
|
-
#
|
|
10
|
-
# [{ text: String, metadata: Hash }, ...]
|
|
11
|
-
#
|
|
12
|
-
# Subclasses must implement {#split}.
|
|
13
|
-
class Base
|
|
14
|
-
# Split +document+ into an array of chunk documents.
|
|
15
|
-
#
|
|
16
|
-
# @param document [Hash, String]
|
|
17
|
-
# Either a document hash (<tt>{ text: String, metadata: Hash }</tt>)
|
|
18
|
-
# returned by a Loader, or a plain String.
|
|
19
|
-
# @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
|
|
20
|
-
# @raise [NotImplementedError] when not overridden by a subclass
|
|
21
|
-
# @api public
|
|
22
|
-
def split(document)
|
|
23
|
-
raise NotImplementedError, "#{self.class}#split is not implemented"
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
# Convenience method: split an array of documents.
|
|
27
|
-
#
|
|
28
|
-
# @param documents [Array<Hash, String>]
|
|
29
|
-
# @return [Array<Hash>]
|
|
30
|
-
# @api public
|
|
31
|
-
def split_all(documents)
|
|
32
|
-
documents.flat_map { |doc| split(doc) }
|
|
33
|
-
end
|
|
34
|
-
|
|
35
|
-
private
|
|
36
|
-
|
|
37
|
-
# Normalise a document-or-string argument into {text:, metadata:}.
|
|
38
|
-
def normalise(document)
|
|
39
|
-
case document
|
|
40
|
-
when Hash then {text: document[:text].to_s, metadata: document.fetch(:metadata, {})}
|
|
41
|
-
when String then {text: document, metadata: {}}
|
|
42
|
-
else raise ArgumentError, "document must be a Hash or String, got #{document.class}"
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
end
|
|
@@ -1,51 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
module Splitter
|
|
5
|
-
# Splits text into fixed-size character chunks with optional overlap.
|
|
6
|
-
#
|
|
7
|
-
# @example
|
|
8
|
-
# splitter = Phronomy::Splitter::FixedSizeSplitter.new(chunk_size: 200, chunk_overlap: 20)
|
|
9
|
-
# chunks = splitter.split({ text: long_text, metadata: { source: "doc.txt" } })
|
|
10
|
-
# # => [
|
|
11
|
-
# # { text: "...(200 chars)...", metadata: { source: "doc.txt", chunk: 0 } },
|
|
12
|
-
# # { text: "...(200 chars, 20-char overlap)...", metadata: { source: "doc.txt", chunk: 1 } },
|
|
13
|
-
# # ]
|
|
14
|
-
class FixedSizeSplitter < Base
|
|
15
|
-
# @param chunk_size [Integer] maximum characters per chunk (default: 1000)
|
|
16
|
-
# @param chunk_overlap [Integer] characters to repeat at the start of each
|
|
17
|
-
# subsequent chunk (default: 200); must be less than chunk_size
|
|
18
|
-
# @api public
|
|
19
|
-
def initialize(chunk_size: 1000, chunk_overlap: 200)
|
|
20
|
-
raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
|
|
21
|
-
|
|
22
|
-
@chunk_size = chunk_size
|
|
23
|
-
@chunk_overlap = chunk_overlap
|
|
24
|
-
end
|
|
25
|
-
|
|
26
|
-
# @param document [Hash, String]
|
|
27
|
-
# @return [Array<Hash>]
|
|
28
|
-
# @api public
|
|
29
|
-
def split(document)
|
|
30
|
-
doc = normalise(document)
|
|
31
|
-
text = doc[:text]
|
|
32
|
-
base_metadata = doc[:metadata]
|
|
33
|
-
|
|
34
|
-
chunks = []
|
|
35
|
-
start = 0
|
|
36
|
-
index = 0
|
|
37
|
-
|
|
38
|
-
while start < text.length
|
|
39
|
-
chunk_text = text[start, @chunk_size]
|
|
40
|
-
chunks << {text: chunk_text, metadata: base_metadata.merge(chunk: index)}
|
|
41
|
-
break if start + @chunk_size >= text.length
|
|
42
|
-
|
|
43
|
-
start += @chunk_size - @chunk_overlap
|
|
44
|
-
index += 1
|
|
45
|
-
end
|
|
46
|
-
|
|
47
|
-
chunks
|
|
48
|
-
end
|
|
49
|
-
end
|
|
50
|
-
end
|
|
51
|
-
end
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
module Splitter
|
|
5
|
-
# Splits text recursively using a prioritised list of separator strings.
|
|
6
|
-
#
|
|
7
|
-
# The splitter tries each separator in order. When a separator produces
|
|
8
|
-
# chunks that are still larger than +chunk_size+, it recurses with the
|
|
9
|
-
# next separator in the list. This mirrors LangChain's
|
|
10
|
-
# RecursiveCharacterTextSplitter behaviour.
|
|
11
|
-
#
|
|
12
|
-
# Default separators (in priority order):
|
|
13
|
-
# 1. "\n\n" — paragraph breaks
|
|
14
|
-
# 2. "\n" — line breaks
|
|
15
|
-
# 3. ". " — sentence boundaries
|
|
16
|
-
# 4. " " — word boundaries
|
|
17
|
-
# 5. "" — character-level fallback
|
|
18
|
-
#
|
|
19
|
-
# @example
|
|
20
|
-
# splitter = Phronomy::Splitter::RecursiveSplitter.new(chunk_size: 300, chunk_overlap: 30)
|
|
21
|
-
# chunks = splitter.split({ text: long_markdown, metadata: { source: "guide.md" } })
|
|
22
|
-
class RecursiveSplitter < Base
|
|
23
|
-
DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""].freeze
|
|
24
|
-
|
|
25
|
-
# @param chunk_size [Integer] maximum characters per chunk (default: 1000)
|
|
26
|
-
# @param chunk_overlap [Integer] overlap characters (default: 200)
|
|
27
|
-
# @param separators [Array<String>] separator list in priority order
|
|
28
|
-
# @api public
|
|
29
|
-
def initialize(chunk_size: 1000, chunk_overlap: 200, separators: DEFAULT_SEPARATORS)
|
|
30
|
-
raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
|
|
31
|
-
|
|
32
|
-
@chunk_size = chunk_size
|
|
33
|
-
@chunk_overlap = chunk_overlap
|
|
34
|
-
@separators = separators
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
# @param document [Hash, String]
|
|
38
|
-
# @return [Array<Hash>]
|
|
39
|
-
# @api public
|
|
40
|
-
def split(document)
|
|
41
|
-
doc = normalise(document)
|
|
42
|
-
texts = recursive_split(doc[:text], @separators)
|
|
43
|
-
merge_with_overlap(texts).each_with_index.map do |text, idx|
|
|
44
|
-
{text: text, metadata: doc[:metadata].merge(chunk: idx)}
|
|
45
|
-
end
|
|
46
|
-
end
|
|
47
|
-
|
|
48
|
-
private
|
|
49
|
-
|
|
50
|
-
# Split +text+ using the first separator that yields non-trivial pieces,
|
|
51
|
-
# then recurse on any piece that is still too large.
|
|
52
|
-
def recursive_split(text, separators)
|
|
53
|
-
return [text] if text.length <= @chunk_size || separators.empty?
|
|
54
|
-
|
|
55
|
-
sep, *rest_seps = separators
|
|
56
|
-
|
|
57
|
-
# Character-level fallback: just slice
|
|
58
|
-
if sep == ""
|
|
59
|
-
return FixedSizeSplitter
|
|
60
|
-
.new(chunk_size: @chunk_size, chunk_overlap: @chunk_overlap)
|
|
61
|
-
.split(text)
|
|
62
|
-
.map { |c| c[:text] }
|
|
63
|
-
end
|
|
64
|
-
|
|
65
|
-
parts = text.split(sep)
|
|
66
|
-
|
|
67
|
-
# If this separator doesn't split, try the next
|
|
68
|
-
return recursive_split(text, rest_seps) if parts.length <= 1
|
|
69
|
-
|
|
70
|
-
# Re-attach the separator to each part except the last so context is preserved
|
|
71
|
-
parts_with_sep = parts.each_with_index.map do |part, i|
|
|
72
|
-
(i < parts.length - 1) ? part + sep : part
|
|
73
|
-
end
|
|
74
|
-
|
|
75
|
-
parts_with_sep.flat_map do |part|
|
|
76
|
-
if part.length > @chunk_size
|
|
77
|
-
recursive_split(part, rest_seps)
|
|
78
|
-
else
|
|
79
|
-
[part]
|
|
80
|
-
end
|
|
81
|
-
end.reject { |t| t.strip.empty? }
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
# Merge small adjacent pieces and apply overlap between chunks.
|
|
85
|
-
def merge_with_overlap(texts)
|
|
86
|
-
merged = []
|
|
87
|
-
current = +""
|
|
88
|
-
|
|
89
|
-
texts.each do |text|
|
|
90
|
-
if current.length + text.length <= @chunk_size
|
|
91
|
-
current << text
|
|
92
|
-
else
|
|
93
|
-
merged << current.strip unless current.strip.empty?
|
|
94
|
-
# Start next chunk with overlap from the end of current
|
|
95
|
-
overlap_text = (current.length > @chunk_overlap) ? current[-@chunk_overlap..] : current
|
|
96
|
-
current = overlap_text + text
|
|
97
|
-
end
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
merged << current.strip unless current.strip.empty?
|
|
101
|
-
merged
|
|
102
|
-
end
|
|
103
|
-
end
|
|
104
|
-
end
|
|
105
|
-
end
|
data/lib/phronomy/splitter.rb
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
module Phronomy
|
|
4
|
-
# Text splitter implementations for chunking documents before embedding.
|
|
5
|
-
#
|
|
6
|
-
# Sub-classes are auto-loaded by Zeitwerk:
|
|
7
|
-
# Phronomy::Splitter::Base
|
|
8
|
-
# Phronomy::Splitter::FixedSizeSplitter
|
|
9
|
-
# Phronomy::Splitter::RecursiveSplitter
|
|
10
|
-
module Splitter
|
|
11
|
-
end
|
|
12
|
-
end
|