phronomy 0.7.1 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +16 -16
  3. data/benchmark/bench_context_assembler.rb +2 -2
  4. data/benchmark/bench_regression.rb +5 -5
  5. data/benchmark/bench_token_estimator.rb +5 -5
  6. data/benchmark/bench_tool_schema.rb +1 -1
  7. data/benchmark/bench_vector_store.rb +1 -1
  8. data/lib/phronomy/agent/base.rb +86 -123
  9. data/lib/phronomy/agent/checkpoint.rb +118 -0
  10. data/lib/phronomy/agent/context/conversation/compaction_context.rb +117 -0
  11. data/lib/phronomy/agent/context/conversation/trigger_context.rb +43 -0
  12. data/lib/phronomy/agent/context/conversation/trim_context.rb +82 -0
  13. data/lib/phronomy/agent/context/instruction/prompt_template.rb +102 -0
  14. data/lib/phronomy/agent/context/knowledge/embeddings/base.rb +45 -0
  15. data/lib/phronomy/agent/context/knowledge/embeddings/ruby_llm_embeddings.rb +51 -0
  16. data/lib/phronomy/agent/context/knowledge/loader/base.rb +31 -0
  17. data/lib/phronomy/agent/context/knowledge/loader/csv_loader.rb +62 -0
  18. data/lib/phronomy/agent/context/knowledge/loader/markdown_loader.rb +82 -0
  19. data/lib/phronomy/agent/context/knowledge/loader/plain_text_loader.rb +28 -0
  20. data/lib/phronomy/agent/context/knowledge/source/base.rb +60 -0
  21. data/lib/phronomy/agent/context/knowledge/source/entity_knowledge.rb +102 -0
  22. data/lib/phronomy/agent/context/knowledge/source/rag_knowledge.rb +63 -0
  23. data/lib/phronomy/agent/context/knowledge/source/static_knowledge.rb +58 -0
  24. data/lib/phronomy/agent/context/knowledge/splitter/base.rb +53 -0
  25. data/lib/phronomy/agent/context/knowledge/splitter/fixed_size_splitter.rb +57 -0
  26. data/lib/phronomy/agent/context/knowledge/splitter/recursive_splitter.rb +111 -0
  27. data/lib/phronomy/agent/context/knowledge/vector_store/async_backend.rb +116 -0
  28. data/lib/phronomy/agent/context/knowledge/vector_store/base.rb +95 -0
  29. data/lib/phronomy/agent/context/knowledge/vector_store/in_memory.rb +109 -0
  30. data/lib/phronomy/agent/context/knowledge/vector_store/pgvector.rb +133 -0
  31. data/lib/phronomy/agent/context/knowledge/vector_store/redis_search.rb +198 -0
  32. data/lib/phronomy/agent/fsm.rb +1 -1
  33. data/lib/phronomy/agent/invocation_pipeline.rb +99 -0
  34. data/lib/phronomy/agent/lifecycle/fsm_session.rb +251 -0
  35. data/lib/phronomy/agent/lifecycle/phase_machine_builder.rb +249 -0
  36. data/lib/phronomy/agent/react_agent.rb +19 -14
  37. data/lib/phronomy/agent/runner.rb +2 -2
  38. data/lib/phronomy/agent/tool_executor.rb +108 -0
  39. data/lib/phronomy/concurrency/async_queue.rb +157 -0
  40. data/lib/phronomy/concurrency/blocking_adapter_pool.rb +443 -0
  41. data/lib/phronomy/concurrency/cancellation_scope.rb +125 -0
  42. data/lib/phronomy/concurrency/cancellation_token.rb +140 -0
  43. data/lib/phronomy/concurrency/concurrency_gate.rb +157 -0
  44. data/lib/phronomy/concurrency/deadline.rb +65 -0
  45. data/lib/phronomy/{runtime → concurrency}/gate_registry.rb +1 -1
  46. data/lib/phronomy/{runtime → concurrency}/pool_registry.rb +1 -1
  47. data/lib/phronomy/context.rb +2 -8
  48. data/lib/phronomy/embeddings.rb +2 -2
  49. data/lib/phronomy/eval/runner.rb +4 -0
  50. data/lib/phronomy/eval/scorer/llm_judge.rb +12 -1
  51. data/lib/phronomy/event_loop.rb +7 -7
  52. data/lib/phronomy/invocation_context.rb +3 -3
  53. data/lib/phronomy/knowledge_source.rb +0 -5
  54. data/lib/phronomy/llm_adapter/ruby_llm.rb +17 -11
  55. data/lib/phronomy/{context → llm_context_window}/assembler.rb +18 -3
  56. data/lib/phronomy/{context → llm_context_window}/context_version_cache.rb +1 -1
  57. data/lib/phronomy/{context → llm_context_window}/token_budget.rb +7 -4
  58. data/lib/phronomy/{context → llm_context_window}/token_estimator.rb +3 -3
  59. data/lib/phronomy/loader.rb +4 -4
  60. data/lib/phronomy/{agent → multi_agent}/handoff.rb +2 -2
  61. data/lib/phronomy/{agent → multi_agent}/orchestrator.rb +6 -6
  62. data/lib/phronomy/{agent → multi_agent}/parallel_tool_chat.rb +4 -4
  63. data/lib/phronomy/{agent → multi_agent}/team_coordinator.rb +2 -2
  64. data/lib/phronomy/runtime.rb +19 -4
  65. data/lib/phronomy/splitter.rb +3 -3
  66. data/lib/phronomy/task_group.rb +1 -1
  67. data/lib/phronomy/tool/base.rb +50 -9
  68. data/lib/phronomy/tracing/null_tracer.rb +3 -1
  69. data/lib/phronomy/vector_store.rb +2 -2
  70. data/lib/phronomy/version.rb +1 -1
  71. data/lib/phronomy/workflow_context.rb +8 -0
  72. data/lib/phronomy/workflow_runner.rb +11 -131
  73. data/lib/phronomy.rb +1 -0
  74. metadata +44 -42
  75. data/lib/phronomy/async_queue.rb +0 -155
  76. data/lib/phronomy/blocking_adapter_pool.rb +0 -435
  77. data/lib/phronomy/cancellation_scope.rb +0 -123
  78. data/lib/phronomy/cancellation_token.rb +0 -133
  79. data/lib/phronomy/concurrency_gate.rb +0 -155
  80. data/lib/phronomy/context/compaction_context.rb +0 -111
  81. data/lib/phronomy/context/trigger_context.rb +0 -39
  82. data/lib/phronomy/context/trim_context.rb +0 -75
  83. data/lib/phronomy/deadline.rb +0 -63
  84. data/lib/phronomy/embeddings/base.rb +0 -39
  85. data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +0 -45
  86. data/lib/phronomy/fsm_session.rb +0 -247
  87. data/lib/phronomy/knowledge_source/base.rb +0 -54
  88. data/lib/phronomy/knowledge_source/entity_knowledge.rb +0 -96
  89. data/lib/phronomy/knowledge_source/rag_knowledge.rb +0 -57
  90. data/lib/phronomy/knowledge_source/static_knowledge.rb +0 -52
  91. data/lib/phronomy/loader/base.rb +0 -25
  92. data/lib/phronomy/loader/csv_loader.rb +0 -56
  93. data/lib/phronomy/loader/markdown_loader.rb +0 -76
  94. data/lib/phronomy/loader/plain_text_loader.rb +0 -22
  95. data/lib/phronomy/prompt_template.rb +0 -96
  96. data/lib/phronomy/splitter/base.rb +0 -47
  97. data/lib/phronomy/splitter/fixed_size_splitter.rb +0 -51
  98. data/lib/phronomy/splitter/recursive_splitter.rb +0 -105
  99. data/lib/phronomy/tool_executor.rb +0 -106
  100. data/lib/phronomy/vector_store/async_backend.rb +0 -110
  101. data/lib/phronomy/vector_store/base.rb +0 -89
  102. data/lib/phronomy/vector_store/in_memory.rb +0 -93
  103. data/lib/phronomy/vector_store/pgvector.rb +0 -127
  104. data/lib/phronomy/vector_store/redis_search.rb +0 -192
@@ -0,0 +1,82 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Loader
          # Loads a Markdown file, optionally splitting it into one document per
          # heading section.
          #
          # When +split_on_headings:+ is true (the default), every ATX heading
          # (H1 through H6) starts a new document so that embeddings capture
          # section semantics rather than the full file at once. Text that
          # precedes the first heading becomes its own document without a
          # +:section+ entry in its metadata.
          #
          # Lines inside fenced code blocks (delimited by three or more backticks
          # or tildes) are never treated as headings, so a shell or Ruby comment
          # such as "# install dependencies" inside a fence does not split the
          # surrounding section.
          #
          # @example Single document (heading split disabled)
          #   loader = Phronomy::Agent::Context::Knowledge::Loader::MarkdownLoader.new(split_on_headings: false)
          #   docs = loader.load("README.md")
          #   # => [{ text: "# Title\n...", metadata: { source: "README.md" } }]
          #
          # @example Split per heading (default)
          #   loader = Phronomy::Agent::Context::Knowledge::Loader::MarkdownLoader.new
          #   docs = loader.load("guide.md")
          #   # => [
          #   #   { text: "# Section 1\n...", metadata: { source: "guide.md", section: "Section 1" } },
          #   #   { text: "## Sub-section\n...", metadata: { source: "guide.md", section: "Sub-section" } },
          #   # ]
          class MarkdownLoader < Base
            # ATX heading: one to six "#" characters at the start of a line,
            # whitespace, then the heading text.
            HEADING_RE = /^(\#{1,6})\s+(.+)$/

            # Opening or closing code fence: up to three spaces of indentation
            # followed by a run of at least three backticks or tildes.
            FENCE_RE = /\A {0,3}(`{3,}|~{3,})/

            # @param split_on_headings [Boolean] split on H1–H6 boundaries (default: true)
            # @api public
            def initialize(split_on_headings: true)
              @split_on_headings = split_on_headings
            end

            # Reads the file as UTF-8 and returns it as one document, or as one
            # document per heading section when heading splitting is enabled.
            #
            # @param source [String] path to a Markdown file
            # @return [Array<Hash>] documents shaped as <tt>{ text:, metadata: }</tt>
            # @raise [Errno::ENOENT] if the file does not exist
            # @api public
            def load(source)
              content = File.read(source, encoding: "UTF-8")
              return [{text: content, metadata: {source: source}}] unless @split_on_headings

              split_by_headings(content, source)
            end

            private

            # Walks the content line by line, starting a new section at every
            # heading that is not inside a fenced code block.
            def split_by_headings(content, source)
              sections = []
              current_lines = []
              current_heading = nil
              open_fence = nil # fence marker (e.g. "```") while inside a code block

              content.each_line do |line|
                if open_fence
                  # Inside a code block nothing is a heading; only look for the closer.
                  open_fence = nil if closes_fence?(line, open_fence)
                  current_lines << line
                elsif (fence = FENCE_RE.match(line))
                  open_fence = fence[1]
                  current_lines << line
                elsif (m = HEADING_RE.match(line.chomp))
                  flush_section(sections, current_lines, current_heading, source) if current_lines.any?
                  current_heading = m[2].strip
                  current_lines = [line]
                else
                  current_lines << line
                end
              end

              flush_section(sections, current_lines, current_heading, source) if current_lines.any?

              # Nothing was flushed only when the content is blank; fall back to a
              # single document so callers always receive one entry.
              sections.empty? ? [{text: content, metadata: {source: source}}] : sections
            end

            # True when +line+ closes the fence opened by +open_fence+: same
            # marker character, at least as long, and nothing but whitespace after it.
            def closes_fence?(line, open_fence)
              fence = FENCE_RE.match(line)
              return false unless fence

              marker = fence[1]
              marker[0] == open_fence[0] &&
                marker.length >= open_fence.length &&
                fence.post_match.strip.empty?
            end

            # Appends the accumulated lines as a document unless they are blank.
            # The +:section+ metadata key is only set when a heading was seen.
            def flush_section(sections, lines, heading, source)
              text = lines.join
              return if text.strip.empty?

              metadata = {source: source}
              metadata[:section] = heading if heading
              sections << {text: text, metadata: metadata}
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Loader
          # Reads a plain-text file and wraps its entire contents in one document.
          #
          # @example
          #   loader = Phronomy::Agent::Context::Knowledge::Loader::PlainTextLoader.new
          #   docs = loader.load("/path/to/file.txt")
          #   # => [{ text: "...", metadata: { source: "/path/to/file.txt" } }]
          class PlainTextLoader < Base
            # Reads +source+ as UTF-8 and returns it as a one-element document list.
            # The path is recorded unchanged under the +:source+ metadata key.
            #
            # @param source [String] absolute or relative path to a text file
            # @return [Array<Hash>] single-element array with the file contents
            # @raise [Errno::ENOENT] if the file does not exist
            # @api public
            def load(source)
              contents = File.read(source, encoding: "UTF-8")
              document = {text: contents, metadata: {source: source}}
              [document]
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Source
          # Abstract parent of every KnowledgeSource implementation.
          #
          # Concrete sources override #fetch and return an Array of chunk Hashes.
          # Each chunk Hash must contain:
          #   :content [String] the text to inject into the context
          #   :type    [Symbol] semantic tag (e.g. :static, :rag, :entity)
          class Base
            # Retrieve the knowledge chunks relevant to the given query.
            #
            # This base implementation only honours the cancellation token and then
            # raises, because there is nothing to fetch until a subclass overrides it.
            #
            # @param query [String, nil] the current user input used to select relevant chunks
            # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional token; raises CancellationError when cancelled
            # @return [Array<Hash>] array of { content: String, type: Symbol }
            # @raise [NotImplementedError] always, unless overridden
            # @api public
            def fetch(query: nil, cancellation_token: nil)
              cancellation_token.raise_if_cancelled! unless cancellation_token.nil?

              message = "#{self.class}#fetch is not implemented"
              raise NotImplementedError, message
            end

            # Hands a {#fetch} call to the runtime's blocking-I/O pool
            # ({BlockingAdapterPool}) and returns the resulting
            # {BlockingAdapterPool::PendingOperation}, so callers can start several
            # fetches and await them together.
            #
            # @param query [String, nil]
            # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
            # @param timeout [Numeric, nil] seconds before the operation is abandoned
            # @return [BlockingAdapterPool::PendingOperation]
            # @api public
            def fetch_async(query: nil, cancellation_token: nil, timeout: nil)
              pool = Phronomy::Runtime.instance.blocking_io
              pool.submit(timeout: timeout, cancellation_token: cancellation_token) do
                fetch(query: query, cancellation_token: cancellation_token)
              end
            end

            # Whether this source's content stays the same between agent
            # invocations. Static sources are eligible for fingerprint-based
            # caching in ContextVersionCache.
            #
            # The default is false; subclasses that return fixed content override it.
            #
            # @return [Boolean]
            # @api public
            def static?
              false
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Source
          # A KnowledgeSource that extracts named-entity facts from conversation history.
          #
          # This is the knowledge-injection counterpart of the old EntityMemory.
          # It scans user messages with regex heuristics (no LLM call) and returns
          # the discovered facts as a single knowledge chunk tagged :entity.
          #
          # EntityKnowledge is stateful: facts accumulate through #update(messages:),
          # which should be called each time new messages are saved. A later match
          # for the same key replaces the earlier value.
          #
          # Supported extraction patterns:
          #   "my name is Alice"         → { name: "Alice" }
          #   "I am Alice"               → { identity: "Alice" }   (case-sensitive)
          #   "I'm a software engineer"  → { occupation: "software engineer" }
          #   "I work at / for Acme"     → { workplace: "Acme" }
          #   "I live in Tokyo"          → { location: "Tokyo" }
          #   "I'm from Tokyo"           → { location: "Tokyo" }
          #   "I like / love Ruby"       → { preference: "Ruby" }
          #
          # @example
          #   ks = Phronomy::Agent::Context::Knowledge::Source::EntityKnowledge.new
          #   ks.update(messages: chat_messages)
          #   agent.invoke("What is my name?", config: { knowledge_sources: [ks] })
          class EntityKnowledge < Base
            # Ordered [fact key, pattern] pairs; capture group 1 holds the fact value.
            PATTERNS = [
              [:name, /\bmy name is\s+([A-Za-z][A-Za-z0-9 \-']*)/i],
              [:identity, /\bI\s+am\s+([A-Z][A-Za-z0-9 \-']+)/],
              [:occupation, /\bI(?:'m| am) a(?:n)?\s+([A-Za-z][A-Za-z0-9 \-']*)/i],
              [:workplace, /\bI (?:work|worked) (?:at|for|in)\s+([A-Za-z0-9][A-Za-z0-9 \-'.&,]*)/i],
              [:location, /\bI live in\s+([A-Za-z][A-Za-z0-9 \-']*)/i],
              [:location, /\bI(?:'m| am) from\s+([A-Za-z][A-Za-z0-9 \-']*)/i],
              [:preference, /\bI (?:like|love|enjoy)\s+([A-Za-z][A-Za-z0-9 \-']*)/i]
            ].freeze

            def initialize
              @entities = {}
            end

            # Scans the user-role messages and folds any extracted facts into the
            # entity store. Call this after saving a new set of messages
            # (e.g. from a ConversationManager save hook).
            #
            # @param messages [Array] message objects responding to #role and #content
            # @api public
            def update(messages:)
              messages.each do |message|
                next if message.role.to_sym != :user

                @entities.merge!(extract(message.content.to_s))
              end
            end

            # Returns one chunk listing every known fact as a plain-text bullet
            # list, or an empty array when nothing has been discovered yet.
            #
            # @param query [String, nil] unused — entity knowledge is always fully injected
            # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional; raises CancellationError when cancelled
            # @return [Array<Hash>]
            # @api public
            def fetch(query: nil, cancellation_token: nil)
              cancellation_token.raise_if_cancelled! unless cancellation_token.nil?
              return [] if @entities.empty?

              facts = @entities.map { |key, value| "- #{key}: #{value}" }.join("\n")
              content = <<~CONTENT.chomp
                Known facts about the user:
                #{facts}
              CONTENT
              [{content: content, type: :entity}]
            end

            # Returns a shallow copy of the entity store (primarily for testing),
            # so callers cannot mutate the internal hash.
            #
            # @return [Hash]
            # @api public
            def entities
              @entities.dup
            end

            private

            # Runs every pattern against +text+ and returns the facts found as a
            # Hash. Each value is trimmed, cut at the first sentence break, and
            # stripped of trailing punctuation; empty values are discarded.
            def extract(text)
              PATTERNS.each_with_object({}) do |(key, pattern), found|
                match = text.match(pattern)
                next unless match

                value = match[1].strip.sub(/[.!?]\s+.*$/, "").gsub(/[.,;!?]+$/, "")
                found[key] = value unless value.empty?
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Source
          # A KnowledgeSource that pulls semantically relevant chunks out of a VectorStore.
          #
          # Every #fetch call embeds the query and returns the k nearest documents
          # as knowledge chunks.
          #
          # @example
          #   store = Phronomy::Agent::Context::Knowledge::VectorStore::InMemory.new
          #   embeddings = Phronomy::Agent::Context::Knowledge::Embeddings::RubyLLMEmbeddings.new(model: "text-embedding-3-small")
          #   ks = Phronomy::Agent::Context::Knowledge::Source::RAGKnowledge.new(
          #     store: store,
          #     embeddings: embeddings,
          #     k: 5
          #   )
          class RAGKnowledge < Base
            # @param store [Phronomy::Agent::Context::Knowledge::VectorStore::Base] vector store holding documents
            # @param embeddings [Phronomy::Agent::Context::Knowledge::Embeddings::Base] embeddings adapter
            # @param k [Integer] number of chunks to retrieve
            # @param type [Symbol] semantic tag (default :rag)
            # @param source [String, nil] default source label; falls back to
            #   each document's :source metadata when nil
            # @api public
            def initialize(store:, embeddings:, k: 5, type: :rag, source: nil)
              @store = store
              @embeddings = embeddings
              @k = k
              @type = type
              @source = source
            end

            # Embeds the query, searches the vector store for the k nearest
            # documents, and converts each hit into a chunk. The chunk text is read
            # from the hit's +:metadata+ under +:content+; a +:source+ key is added
            # only when a label is available.
            #
            # Returns an empty array when query is nil or blank.
            #
            # @param query [String, nil]
            # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional; raises CancellationError when cancelled
            # @return [Array<Hash>]
            # @api public
            def fetch(query: nil, cancellation_token: nil)
              cancellation_token.raise_if_cancelled! unless cancellation_token.nil?
              return [] if query.nil? || query.strip.empty?

              query_vector = @embeddings.embed(query, cancellation_token)
              hits = @store.search(query_embedding: query_vector, k: @k, cancellation_token: cancellation_token)

              hits.map do |hit|
                chunk = {content: hit[:metadata][:content], type: @type}
                # The label given at construction wins over the document's own metadata.
                label = @source || hit[:metadata][:source]
                chunk[:source] = label if label
                chunk
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Source
          # A KnowledgeSource whose content is a fixed text supplied at construction.
          #
          # Useful for injecting static documents, policy files, or configuration
          # knowledge that does not change per request.
          #
          # @example
          #   ks = Phronomy::Agent::Context::Knowledge::Source::StaticKnowledge.new(
          #     "Our refund policy: ...",
          #     type: :policy
          #   )
          #   agent.invoke("What is the refund policy?", config: { knowledge_sources: [ks] })
          class StaticKnowledge < Base
            # @param text [String] the static knowledge text to inject (coerced with #to_s)
            # @param type [Symbol] semantic tag for the chunk (default :static)
            # @param source [String, nil] label identifying where this knowledge came from
            #   (e.g. a filename). Included in the context XML tag and exposed to the LLM
            #   so that agents can produce grounded citations.
            # @api public
            def initialize(text, type: :static, source: nil)
              @text = text.to_s
              @type = type
              @source = source
            end

            # Returns the stored text as one chunk no matter what the query is.
            # An empty text yields no chunks; the +:source+ key is present only
            # when a source label was given.
            #
            # @param query [String, nil] ignored for static knowledge
            # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional; raises CancellationError when cancelled
            # @return [Array<Hash>]
            # @api public
            def fetch(query: nil, cancellation_token: nil)
              cancellation_token.raise_if_cancelled! unless cancellation_token.nil?
              return [] if @text.empty?

              chunk = {content: @text, type: @type}
              [@source ? chunk.merge(source: @source) : chunk]
            end

            # Static knowledge content never changes between invocations.
            # @return [true]
            # @api public
            def static?
              true
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Splitter
          # Abstract base class for text splitters.
          #
          # A splitter takes a single document hash (or plain text) and returns an
          # array of smaller chunk documents:
          #
          #   [{ text: String, metadata: Hash }, ...]
          #
          # Subclasses must implement {#split}.
          class Base
            # Split +document+ into an array of chunk documents.
            #
            # @param document [Hash, String]
            #   Either a document hash (<tt>{ text: String, metadata: Hash }</tt>)
            #   returned by a Loader, or a plain String.
            # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
            # @raise [NotImplementedError] when not overridden by a subclass
            # @api public
            def split(document)
              raise NotImplementedError, "#{self.class}#split is not implemented"
            end

            # Convenience method: split every document and concatenate the chunks
            # in input order.
            #
            # @param documents [Array<Hash, String>]
            # @return [Array<Hash>]
            # @api public
            def split_all(documents)
              documents.flat_map { |doc| split(doc) }
            end

            private

            # Normalise a document-or-string argument into {text:, metadata:}.
            #
            # For a Hash, +:text+ is coerced with #to_s and a missing or nil
            # +:metadata+ becomes an empty Hash, so subclasses can always call
            # +metadata.merge+ on the result.
            #
            # @raise [ArgumentError] when +document+ is neither a Hash nor a String
            def normalise(document)
              case document
              when Hash then {text: document[:text].to_s, metadata: document[:metadata] || {}}
              when String then {text: document, metadata: {}}
              else raise ArgumentError, "document must be a Hash or String, got #{document.class}"
              end
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Splitter
          # Splits text into fixed-size character chunks with optional overlap.
          #
          # Consecutive chunks start <tt>chunk_size - chunk_overlap</tt> characters
          # apart, and splitting stops with the first chunk that reaches the end
          # of the text. Each chunk's metadata is the document metadata plus a
          # zero-based +:chunk+ index.
          #
          # @example
          #   splitter = Phronomy::Agent::Context::Knowledge::Splitter::FixedSizeSplitter.new(chunk_size: 200, chunk_overlap: 20)
          #   chunks = splitter.split({ text: long_text, metadata: { source: "doc.txt" } })
          #   # => [
          #   #   { text: "...(200 chars)...", metadata: { source: "doc.txt", chunk: 0 } },
          #   #   { text: "...(200 chars, 20-char overlap)...", metadata: { source: "doc.txt", chunk: 1 } },
          #   # ]
          class FixedSizeSplitter < Base
            # @param chunk_size [Integer] maximum characters per chunk (default: 1000)
            # @param chunk_overlap [Integer] characters to repeat at the start of each
            #   subsequent chunk (default: 200); must be non-negative and less than chunk_size
            # @raise [ArgumentError] when chunk_overlap is negative or not less than chunk_size
            # @api public
            def initialize(chunk_size: 1000, chunk_overlap: 200)
              raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
              # A negative overlap would make the stride larger than the chunk and
              # silently drop the text between chunks. Together with the check
              # above this also guarantees chunk_size is positive.
              raise ArgumentError, "chunk_overlap must not be negative" if chunk_overlap.negative?

              @chunk_size = chunk_size
              @chunk_overlap = chunk_overlap
            end

            # @param document [Hash, String]
            # @return [Array<Hash>] chunk documents; empty when the text is empty
            # @api public
            def split(document)
              doc = normalise(document)
              text = doc[:text]
              base_metadata = doc[:metadata]
              stride = @chunk_size - @chunk_overlap

              chunks = []
              start = 0

              while start < text.length
                chunks << {text: text[start, @chunk_size], metadata: base_metadata.merge(chunk: chunks.length)}
                # Stop once this chunk covers the tail; another step would only
                # repeat text that has already been emitted.
                break if start + @chunk_size >= text.length

                start += stride
              end

              chunks
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,111 @@
1
+ # frozen_string_literal: true
2
+
3
module Phronomy
  module Agent
    module Context
      module Knowledge
        module Splitter
          # Splits text recursively using a prioritised list of separator strings.
          #
          # The splitter tries each separator in order. When a separator produces
          # pieces that are still larger than +chunk_size+, it recurses with the
          # next separator in the list. This mirrors LangChain's
          # RecursiveCharacterTextSplitter behaviour. Small adjacent pieces are then
          # merged back together, and each new chunk starts with up to
          # +chunk_overlap+ characters taken from the end of the previous one.
          #
          # Default separators (in priority order):
          #   1. "\n\n" — paragraph breaks
          #   2. "\n"   — line breaks
          #   3. ". "   — sentence boundaries
          #   4. " "    — word boundaries
          #   5. ""     — character-level fallback
          #
          # When the separator list ends with "" (as the default does), no chunk is
          # longer than +chunk_size+; the overlap is shortened where the full
          # overlap would not fit. With a custom list that lacks "", a piece that
          # no separator can break is passed through at its full length.
          #
          # @example
          #   splitter = Phronomy::Agent::Context::Knowledge::Splitter::RecursiveSplitter.new(chunk_size: 300, chunk_overlap: 30)
          #   chunks = splitter.split({ text: long_markdown, metadata: { source: "guide.md" } })
          class RecursiveSplitter < Base
            DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""].freeze

            # @param chunk_size [Integer] maximum characters per chunk (default: 1000)
            # @param chunk_overlap [Integer] overlap characters (default: 200); must be
            #   non-negative and less than chunk_size
            # @param separators [Array<String>] separator list in priority order
            # @raise [ArgumentError] when chunk_overlap is negative or not less than chunk_size
            # @api public
            def initialize(chunk_size: 1000, chunk_overlap: 200, separators: DEFAULT_SEPARATORS)
              raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
              # Together with the check above this also guarantees chunk_size is positive.
              raise ArgumentError, "chunk_overlap must not be negative" if chunk_overlap.negative?

              @chunk_size = chunk_size
              @chunk_overlap = chunk_overlap
              @separators = separators
            end

            # @param document [Hash, String]
            # @return [Array<Hash>] chunk documents whose metadata carries a
            #   zero-based +:chunk+ index
            # @api public
            def split(document)
              doc = normalise(document)
              texts = recursive_split(doc[:text], @separators)
              merge_with_overlap(texts).each_with_index.map do |text, idx|
                {text: text, metadata: doc[:metadata].merge(chunk: idx)}
              end
            end

            private

            # Split +text+ using the first separator that yields more than one
            # piece, then recurse on any piece that is still too large.
            def recursive_split(text, separators)
              return [text] if text.length <= @chunk_size || separators.empty?

              sep, *rest_seps = separators

              # Character-level fallback: just slice
              return slice_characters(text) if sep == ""

              # NOTE: String#split(" ") treats any run of whitespace as a single
              # separator, so word-level splitting collapses whitespace runs.
              parts = text.split(sep)

              # If this separator doesn't split, try the next
              return recursive_split(text, rest_seps) if parts.length <= 1

              # Re-attach the separator to each part except the last so context is preserved
              last_index = parts.length - 1
              pieces = parts.each_with_index.flat_map do |part, i|
                piece = (i < last_index) ? part + sep : part
                (piece.length > @chunk_size) ? recursive_split(piece, rest_seps) : [piece]
              end
              pieces.reject { |piece| piece.strip.empty? }
            end

            # Cut +text+ into consecutive, non-overlapping slices that leave room
            # for a full overlap. Overlap is added exactly once, by
            # #merge_with_overlap; slicing with overlap here as well would
            # duplicate text and push chunks past +chunk_size+.
            def slice_characters(text)
              text.each_char.each_slice(@chunk_size - @chunk_overlap).map(&:join)
            end

            # Merge small adjacent pieces and apply overlap between chunks.
            def merge_with_overlap(texts)
              merged = []
              current = +""

              texts.each do |text|
                if current.length + text.length <= @chunk_size
                  current << text
                  next
                end

                finished = current.strip
                merged << finished unless finished.empty?

                # Carry over only as much of the previous chunk as still fits
                # next to +text+ within chunk_size.
                room = [@chunk_size - text.length, 0].max
                overlap_length = [@chunk_overlap, room, current.length].min
                # Guard the zero case explicitly: current[-0..] would be the
                # whole string, not an empty overlap.
                overlap_text = overlap_length.zero? ? "" : current[-overlap_length..]
                current = overlap_text + text
              end

              finished = current.strip
              merged << finished unless finished.empty?
              merged
            end
          end
        end
      end
    end
  end
end