phronomy 0.7.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +35 -45
  3. data/benchmark/baseline.json +1 -1
  4. data/benchmark/bench_agent_invoke.rb +1 -1
  5. data/benchmark/bench_context_assembler.rb +11 -3
  6. data/benchmark/bench_regression.rb +11 -11
  7. data/benchmark/bench_token_estimator.rb +5 -5
  8. data/benchmark/bench_tool_schema.rb +2 -2
  9. data/docs/decisions/011-build-context-as-single-llm-input-authority.md +224 -0
  10. data/lib/phronomy/agent/base.rb +268 -403
  11. data/lib/phronomy/agent/checkpoint.rb +118 -0
  12. data/lib/phronomy/agent/concerns/suspendable.rb +6 -6
  13. data/lib/phronomy/agent/context/capability/base.rb +689 -0
  14. data/lib/phronomy/agent/context/capability/scope_policy.rb +54 -0
  15. data/lib/phronomy/agent/context/instruction/prompt_template.rb +102 -0
  16. data/lib/phronomy/agent/context/knowledge/base.rb +58 -0
  17. data/lib/phronomy/agent/context/knowledge/entity_knowledge.rb +102 -0
  18. data/lib/phronomy/agent/context/knowledge/static_knowledge.rb +58 -0
  19. data/lib/phronomy/agent/fsm.rb +1 -1
  20. data/lib/phronomy/agent/invocation_pipeline.rb +108 -0
  21. data/lib/phronomy/agent/lifecycle/fsm_session.rb +251 -0
  22. data/lib/phronomy/agent/lifecycle/phase_machine_builder.rb +249 -0
  23. data/lib/phronomy/agent/react_agent.rb +43 -37
  24. data/lib/phronomy/agent/runner.rb +2 -2
  25. data/lib/phronomy/agent/shared_state.rb +2 -2
  26. data/lib/phronomy/agent/tool_executor.rb +108 -0
  27. data/lib/phronomy/concurrency/async_queue.rb +157 -0
  28. data/lib/phronomy/concurrency/blocking_adapter_pool.rb +443 -0
  29. data/lib/phronomy/concurrency/cancellation_scope.rb +125 -0
  30. data/lib/phronomy/concurrency/cancellation_token.rb +140 -0
  31. data/lib/phronomy/concurrency/concurrency_gate.rb +157 -0
  32. data/lib/phronomy/concurrency/deadline.rb +65 -0
  33. data/lib/phronomy/{runtime → concurrency}/gate_registry.rb +1 -2
  34. data/lib/phronomy/{runtime → concurrency}/pool_registry.rb +1 -1
  35. data/lib/phronomy/configuration.rb +0 -6
  36. data/lib/phronomy/context.rb +2 -8
  37. data/lib/phronomy/eval/runner.rb +4 -0
  38. data/lib/phronomy/eval/scorer/llm_judge.rb +12 -1
  39. data/lib/phronomy/event_loop.rb +7 -7
  40. data/lib/phronomy/invocation_context.rb +3 -3
  41. data/lib/phronomy/knowledge_source.rb +0 -5
  42. data/lib/phronomy/llm_adapter/ruby_llm.rb +17 -11
  43. data/lib/phronomy/llm_context_window/assembler.rb +191 -0
  44. data/lib/phronomy/{context → llm_context_window}/context_version_cache.rb +1 -1
  45. data/lib/phronomy/{context → llm_context_window}/token_budget.rb +7 -4
  46. data/lib/phronomy/{context → llm_context_window}/token_estimator.rb +3 -3
  47. data/lib/phronomy/{agent → multi_agent}/handoff.rb +6 -6
  48. data/lib/phronomy/{agent → multi_agent}/orchestrator.rb +7 -7
  49. data/lib/phronomy/{agent → multi_agent}/parallel_tool_chat.rb +4 -4
  50. data/lib/phronomy/{agent → multi_agent}/team_coordinator.rb +4 -4
  51. data/lib/phronomy/runtime/runtime_metrics.rb +0 -1
  52. data/lib/phronomy/runtime.rb +20 -6
  53. data/lib/phronomy/task_group.rb +1 -1
  54. data/lib/phronomy/tool.rb +3 -4
  55. data/lib/phronomy/{tool/agent_tool.rb → tools/agent.rb} +6 -6
  56. data/lib/phronomy/{tool/mcp_tool.rb → tools/mcp.rb} +9 -9
  57. data/lib/phronomy/tools/vector_search.rb +70 -0
  58. data/lib/phronomy/tracing/null_tracer.rb +3 -1
  59. data/lib/phronomy/vector_store/async_backend.rb +4 -4
  60. data/lib/phronomy/vector_store/base.rb +2 -2
  61. data/lib/phronomy/vector_store/embeddings/base.rb +41 -0
  62. data/lib/phronomy/vector_store/embeddings/ruby_llm_embeddings.rb +47 -0
  63. data/lib/phronomy/vector_store/in_memory.rb +12 -2
  64. data/lib/phronomy/vector_store/loader/base.rb +27 -0
  65. data/lib/phronomy/vector_store/loader/csv_loader.rb +58 -0
  66. data/lib/phronomy/vector_store/loader/markdown_loader.rb +78 -0
  67. data/lib/phronomy/vector_store/loader/plain_text_loader.rb +24 -0
  68. data/lib/phronomy/vector_store/pgvector.rb +2 -2
  69. data/lib/phronomy/vector_store/redis_search.rb +2 -2
  70. data/lib/phronomy/vector_store/splitter/base.rb +49 -0
  71. data/lib/phronomy/vector_store/splitter/fixed_size_splitter.rb +53 -0
  72. data/lib/phronomy/vector_store/splitter/recursive_splitter.rb +107 -0
  73. data/lib/phronomy/vector_store.rb +14 -2
  74. data/lib/phronomy/version.rb +1 -1
  75. data/lib/phronomy/workflow_context.rb +8 -0
  76. data/lib/phronomy/workflow_runner.rb +11 -131
  77. data/lib/phronomy.rb +2 -0
  78. data/scripts/api_snapshot.rb +11 -9
  79. metadata +44 -46
  80. data/lib/phronomy/async_queue.rb +0 -155
  81. data/lib/phronomy/blocking_adapter_pool.rb +0 -435
  82. data/lib/phronomy/cancellation_scope.rb +0 -123
  83. data/lib/phronomy/cancellation_token.rb +0 -133
  84. data/lib/phronomy/concurrency_gate.rb +0 -155
  85. data/lib/phronomy/context/assembler.rb +0 -143
  86. data/lib/phronomy/context/compaction_context.rb +0 -111
  87. data/lib/phronomy/context/trigger_context.rb +0 -39
  88. data/lib/phronomy/context/trim_context.rb +0 -75
  89. data/lib/phronomy/deadline.rb +0 -63
  90. data/lib/phronomy/embeddings/base.rb +0 -39
  91. data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +0 -45
  92. data/lib/phronomy/embeddings.rb +0 -11
  93. data/lib/phronomy/fsm_session.rb +0 -247
  94. data/lib/phronomy/knowledge_source/base.rb +0 -54
  95. data/lib/phronomy/knowledge_source/entity_knowledge.rb +0 -96
  96. data/lib/phronomy/knowledge_source/rag_knowledge.rb +0 -57
  97. data/lib/phronomy/knowledge_source/static_knowledge.rb +0 -52
  98. data/lib/phronomy/loader/base.rb +0 -25
  99. data/lib/phronomy/loader/csv_loader.rb +0 -56
  100. data/lib/phronomy/loader/markdown_loader.rb +0 -76
  101. data/lib/phronomy/loader/plain_text_loader.rb +0 -22
  102. data/lib/phronomy/loader.rb +0 -13
  103. data/lib/phronomy/prompt_template.rb +0 -96
  104. data/lib/phronomy/splitter/base.rb +0 -47
  105. data/lib/phronomy/splitter/fixed_size_splitter.rb +0 -51
  106. data/lib/phronomy/splitter/recursive_splitter.rb +0 -105
  107. data/lib/phronomy/splitter.rb +0 -12
  108. data/lib/phronomy/tool/base.rb +0 -644
  109. data/lib/phronomy/tool/scope_policy.rb +0 -50
  110. data/lib/phronomy/tool_executor.rb +0 -106
@@ -1,57 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- module KnowledgeSource
5
- # A KnowledgeSource that retrieves semantically relevant chunks from a VectorStore.
6
- #
7
- # On each #fetch call, the query is embedded and the k nearest documents are
8
- # returned as knowledge chunks.
9
- #
10
- # @example
11
- # store = Phronomy::VectorStore::InMemory.new
12
- # embeddings = Phronomy::Embeddings::RubyLLMEmbeddings.new(model: "text-embedding-3-small")
13
- # ks = Phronomy::KnowledgeSource::RAGKnowledge.new(
14
- # store: store,
15
- # embeddings: embeddings,
16
- # k: 5
17
- # )
18
- class RAGKnowledge < Base
19
- # @param store [Phronomy::VectorStore::Base] vector store holding documents
20
- # @param embeddings [Phronomy::Embeddings::Base] embeddings adapter
21
- # @param k [Integer] number of chunks to retrieve
22
- # @param type [Symbol] semantic tag (default :rag)
23
- # @param source [String, nil] default source label; falls back to
24
- # each document's :source metadata when nil
25
- # @api public
26
- def initialize(store:, embeddings:, k: 5, type: :rag, source: nil)
27
- @store = store
28
- @embeddings = embeddings
29
- @k = k
30
- @type = type
31
- @source = source
32
- end
33
-
34
- # Embed the query and retrieve the k nearest chunks from the vector store.
35
- #
36
- # Returns an empty array when query is nil or blank.
37
- #
38
- # @param query [String, nil]
39
- # @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
40
- # @return [Array<Hash>]
41
- # @api public
42
- def fetch(query: nil, cancellation_token: nil)
43
- cancellation_token&.raise_if_cancelled!
44
- return [] if query.nil? || query.strip.empty?
45
-
46
- vector = @embeddings.embed(query, cancellation_token)
47
- results = @store.search(query_embedding: vector, k: @k, cancellation_token: cancellation_token)
48
- results.map do |doc|
49
- chunk = {content: doc[:metadata][:content], type: @type}
50
- src = @source || doc[:metadata][:source]
51
- chunk[:source] = src if src
52
- chunk
53
- end
54
- end
55
- end
56
- end
57
- end
@@ -1,52 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- module KnowledgeSource
5
- # A KnowledgeSource backed by fixed text provided at construction time.
6
- #
7
- # Useful for injecting static documents, policy files, or configuration
8
- # knowledge that does not change per request.
9
- #
10
- # @example
11
- # ks = Phronomy::KnowledgeSource::StaticKnowledge.new(
12
- # "Our refund policy: ...",
13
- # type: :policy
14
- # )
15
- # agent.invoke("What is the refund policy?", config: { knowledge_sources: [ks] })
16
- class StaticKnowledge < Base
17
- # @param text [String] the static knowledge text to inject
18
- # @param type [Symbol] semantic tag for the chunk (default :static)
19
- # @param source [String, nil] label identifying where this knowledge came from
20
- # (e.g. a filename). Included in the context XML tag and exposed to the LLM
21
- # so that agents can produce grounded citations.
22
- # @api public
23
- def initialize(text, type: :static, source: nil)
24
- @text = text.to_s
25
- @type = type
26
- @source = source
27
- end
28
-
29
- # Returns the fixed text as a single chunk, regardless of query.
30
- #
31
- # @param query [String, nil] ignored for static knowledge
32
- # @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
33
- # @return [Array<Hash>]
34
- # @api public
35
- def fetch(query: nil, cancellation_token: nil)
36
- cancellation_token&.raise_if_cancelled!
37
- return [] if @text.empty?
38
-
39
- chunk = {content: @text, type: @type}
40
- chunk[:source] = @source if @source
41
- [chunk]
42
- end
43
-
44
- # Static knowledge content never changes between invocations.
45
- # @return [true]
46
- # @api public
47
- def static?
48
- true
49
- end
50
- end
51
- end
52
- end
@@ -1,25 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- module Loader
5
- # Abstract base class for document loaders.
6
- #
7
- # A loader converts an external source (file path, URL, etc.) into an
8
- # Array of document hashes understood by the rest of the pipeline:
9
- #
10
- # [{ text: String, metadata: Hash }, ...]
11
- #
12
- # Subclasses must implement {#load}.
13
- class Base
14
- # Load documents from +source+ and return an array of document hashes.
15
- #
16
- # @param source [String] file path, URL, or other source identifier
17
- # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
18
- # @raise [NotImplementedError] when not overridden by a subclass
19
- # @api public
20
- def load(source)
21
- raise NotImplementedError, "#{self.class}#load is not implemented"
22
- end
23
- end
24
- end
25
- end
@@ -1,56 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- require "csv"
4
-
5
- module Phronomy
6
- module Loader
7
- # Loads a CSV file, converting each row into a separate document.
8
- #
9
- # By default the first row is treated as a header and column names are
10
- # available in the document metadata. The full row is serialised to
11
- # a human-readable "key: value" string for embedding.
12
- #
13
- # @example
14
- # loader = Phronomy::Loader::CsvLoader.new
15
- # docs = loader.load("products.csv")
16
- # # => [
17
- # # { text: "name: Widget\nprice: 9.99", metadata: { source: "...", row: 1, name: "Widget", price: "9.99" } },
18
- # # ...
19
- # # ]
20
- class CsvLoader < Base
21
- # @param headers [Boolean] treat the first row as headers (default: true)
22
- # @param text_column [String, nil] if set, use only this column as the document text
23
- # @api public
24
- def initialize(headers: true, text_column: nil)
25
- @headers = headers
26
- @text_column = text_column
27
- end
28
-
29
- # @param source [String] path to a CSV file
30
- # @return [Array<Hash>]
31
- # @raise [Errno::ENOENT] if the file does not exist
32
- # @api public
33
- def load(source)
34
- rows = CSV.read(source, headers: @headers, encoding: "UTF-8")
35
-
36
- if @headers
37
- rows.each_with_index.map do |row, idx|
38
- row_hash = row.to_h
39
- text = if @text_column
40
- row_hash[@text_column].to_s
41
- else
42
- row_hash.map { |k, v| "#{k}: #{v}" }.join("\n")
43
- end
44
- metadata = row_hash.transform_keys(&:to_sym).merge(source: source, row: idx + 1)
45
- {text: text, metadata: metadata}
46
- end
47
- else
48
- rows.each_with_index.map do |row, idx|
49
- text = row.join(", ")
50
- {text: text, metadata: {source: source, row: idx + 1}}
51
- end
52
- end
53
- end
54
- end
55
- end
56
- end
@@ -1,76 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- module Loader
5
- # Loads a Markdown file, optionally splitting on top-level headings.
6
- #
7
- # When +split_on_headings:+ is true (the default), each H1/H2 section
8
- # becomes a separate document so that embeddings capture section semantics
9
- # rather than the full file at once.
10
- #
11
- # @example Single document (heading split disabled)
12
- # loader = Phronomy::Loader::MarkdownLoader.new(split_on_headings: false)
13
- # docs = loader.load("README.md")
14
- # # => [{ text: "# Title\n...", metadata: { source: "README.md" } }]
15
- #
16
- # @example Split per heading (default)
17
- # loader = Phronomy::Loader::MarkdownLoader.new
18
- # docs = loader.load("guide.md")
19
- # # => [
20
- # # { text: "# Section 1\n...", metadata: { source: "guide.md", section: "Section 1" } },
21
- # # { text: "## Sub-section\n...", metadata: { source: "guide.md", section: "Sub-section" } },
22
- # # ]
23
- class MarkdownLoader < Base
24
- HEADING_RE = /^(\#{1,6})\s+(.+)$/
25
-
26
- # @param split_on_headings [Boolean] split on H1–H6 boundaries (default: true)
27
- # @api public
28
- def initialize(split_on_headings: true)
29
- @split_on_headings = split_on_headings
30
- end
31
-
32
- # @param source [String] path to a Markdown file
33
- # @return [Array<Hash>]
34
- # @raise [Errno::ENOENT] if the file does not exist
35
- # @api public
36
- def load(source)
37
- content = File.read(source, encoding: "UTF-8")
38
- return [{text: content, metadata: {source: source}}] unless @split_on_headings
39
-
40
- split_by_headings(content, source)
41
- end
42
-
43
- private
44
-
45
- def split_by_headings(content, source)
46
- sections = []
47
- current_lines = []
48
- current_heading = nil
49
-
50
- content.each_line do |line|
51
- if (m = HEADING_RE.match(line.chomp))
52
- flush_section(sections, current_lines, current_heading, source) if current_lines.any?
53
- current_heading = m[2].strip
54
- current_lines = [line]
55
- else
56
- current_lines << line
57
- end
58
- end
59
-
60
- flush_section(sections, current_lines, current_heading, source) if current_lines.any?
61
-
62
- # Fall back to single document if no headings were found
63
- sections.empty? ? [{text: content, metadata: {source: source}}] : sections
64
- end
65
-
66
- def flush_section(sections, lines, heading, source)
67
- text = lines.join
68
- return if text.strip.empty?
69
-
70
- metadata = {source: source}
71
- metadata[:section] = heading if heading
72
- sections << {text: text, metadata: metadata}
73
- end
74
- end
75
- end
76
- end
@@ -1,22 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- module Loader
5
- # Loads a plain-text file as a single document.
6
- #
7
- # @example
8
- # loader = Phronomy::Loader::PlainTextLoader.new
9
- # docs = loader.load("/path/to/file.txt")
10
- # # => [{ text: "...", metadata: { source: "/path/to/file.txt" } }]
11
- class PlainTextLoader < Base
12
- # @param source [String] absolute or relative path to a text file
13
- # @return [Array<Hash>] single-element array with the file contents
14
- # @raise [Errno::ENOENT] if the file does not exist
15
- # @api public
16
- def load(source)
17
- text = File.read(source, encoding: "UTF-8")
18
- [{text: text, metadata: {source: source}}]
19
- end
20
- end
21
- end
22
- end
@@ -1,13 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- # Document loader implementations for ingesting files into a RAG pipeline.
5
- #
6
- # Sub-classes are auto-loaded by Zeitwerk:
7
- # Phronomy::Loader::Base
8
- # Phronomy::Loader::PlainTextLoader
9
- # Phronomy::Loader::MarkdownLoader
10
- # Phronomy::Loader::CsvLoader
11
- module Loader
12
- end
13
- end
@@ -1,96 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- # A prompt template that substitutes {{variable}} placeholders in a string.
5
- #
6
- # @example Simple human template
7
- # t = Phronomy::PromptTemplate.new(template: "Translate to {{lang}}: {{text}}")
8
- # t.format(lang: "French", text: "Hello")
9
- # # => "Translate to French: Hello"
10
- #
11
- # @example With a system template
12
- # t = Phronomy::PromptTemplate.new(
13
- # template: "{{question}}",
14
- # system_template: "You are a {{role}} assistant."
15
- # )
16
- # t.format_system(role: "helpful")
17
- # # => "You are a helpful assistant."
18
- #
19
- # As a Runnable, #invoke accepts a Hash of variables and returns a Hash
20
- # with :prompt (and optionally :system) keys.
21
- class PromptTemplate
22
- include Phronomy::Runnable
23
-
24
- PLACEHOLDER = /\{\{(\w+)\}\}/
25
-
26
- attr_reader :template, :system_template
27
-
28
- # @param template [String] human message template with {{var}} placeholders
29
- # @param system_template [String, nil] optional system message template
30
- # @api public
31
- def initialize(template:, system_template: nil)
32
- @template = template
33
- @system_template = system_template
34
- end
35
-
36
- # Substitute all {{var}} placeholders in the human template.
37
- #
38
- # @param variables [Hash{Symbol => String}]
39
- # @return [String]
40
- # @api public
41
- def format(**variables)
42
- substitute(@template, variables)
43
- end
44
-
45
- # Substitute all {{var}} placeholders in the system template.
46
- # Returns nil when no system template was set.
47
- #
48
- # @param variables [Hash{Symbol => String}]
49
- # @return [String, nil]
50
- # @api public
51
- def format_system(**variables)
52
- @system_template && substitute(@system_template, variables)
53
- end
54
-
55
- # Runnable interface: accepts a Hash of variable values.
56
- # Returns { prompt: String, system: String|nil }.
57
- #
58
- # @param input [Hash{Symbol => String}]
59
- # @return [Hash]
60
- # @api public
61
- def invoke(input, config: {})
62
- vars = normalize_input(input)
63
- result = {prompt: format(**vars)}
64
- sys = format_system(**vars)
65
- result[:system] = sys if sys
66
- result
67
- end
68
-
69
- # Returns the list of placeholder names found in both templates.
70
- #
71
- # @return [Array<Symbol>]
72
- # @api public
73
- def variables
74
- names = @template.scan(PLACEHOLDER).flatten
75
- names += @system_template.scan(PLACEHOLDER).flatten if @system_template
76
- names.map(&:to_sym).uniq
77
- end
78
-
79
- private
80
-
81
- def substitute(text, variables)
82
- text.gsub(PLACEHOLDER) do |match|
83
- key = Regexp.last_match(1).to_sym
84
- variables.fetch(key) { raise KeyError, "Missing variable: {{#{key}}}" }
85
- end
86
- end
87
-
88
- def normalize_input(input)
89
- case input
90
- when Hash then input
91
- when String then {input: input}
92
- else raise ArgumentError, "PromptTemplate#invoke expects a Hash of variables, got #{input.class}"
93
- end
94
- end
95
- end
96
- end
@@ -1,47 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- module Splitter
5
- # Abstract base class for text splitters.
6
- #
7
- # A splitter takes a single document hash (or plain text) and returns an
8
- # array of smaller chunk documents:
9
- #
10
- # [{ text: String, metadata: Hash }, ...]
11
- #
12
- # Subclasses must implement {#split}.
13
- class Base
14
- # Split +document+ into an array of chunk documents.
15
- #
16
- # @param document [Hash, String]
17
- # Either a document hash (<tt>{ text: String, metadata: Hash }</tt>)
18
- # returned by a Loader, or a plain String.
19
- # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
20
- # @raise [NotImplementedError] when not overridden by a subclass
21
- # @api public
22
- def split(document)
23
- raise NotImplementedError, "#{self.class}#split is not implemented"
24
- end
25
-
26
- # Convenience method: split an array of documents.
27
- #
28
- # @param documents [Array<Hash, String>]
29
- # @return [Array<Hash>]
30
- # @api public
31
- def split_all(documents)
32
- documents.flat_map { |doc| split(doc) }
33
- end
34
-
35
- private
36
-
37
- # Normalise a document-or-string argument into {text:, metadata:}.
38
- def normalise(document)
39
- case document
40
- when Hash then {text: document[:text].to_s, metadata: document.fetch(:metadata, {})}
41
- when String then {text: document, metadata: {}}
42
- else raise ArgumentError, "document must be a Hash or String, got #{document.class}"
43
- end
44
- end
45
- end
46
- end
47
- end
@@ -1,51 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- module Splitter
5
- # Splits text into fixed-size character chunks with optional overlap.
6
- #
7
- # @example
8
- # splitter = Phronomy::Splitter::FixedSizeSplitter.new(chunk_size: 200, chunk_overlap: 20)
9
- # chunks = splitter.split({ text: long_text, metadata: { source: "doc.txt" } })
10
- # # => [
11
- # # { text: "...(200 chars)...", metadata: { source: "doc.txt", chunk: 0 } },
12
- # # { text: "...(200 chars, 20-char overlap)...", metadata: { source: "doc.txt", chunk: 1 } },
13
- # # ]
14
- class FixedSizeSplitter < Base
15
- # @param chunk_size [Integer] maximum characters per chunk (default: 1000)
16
- # @param chunk_overlap [Integer] characters to repeat at the start of each
17
- # subsequent chunk (default: 200); must be less than chunk_size
18
- # @api public
19
- def initialize(chunk_size: 1000, chunk_overlap: 200)
20
- raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
21
-
22
- @chunk_size = chunk_size
23
- @chunk_overlap = chunk_overlap
24
- end
25
-
26
- # @param document [Hash, String]
27
- # @return [Array<Hash>]
28
- # @api public
29
- def split(document)
30
- doc = normalise(document)
31
- text = doc[:text]
32
- base_metadata = doc[:metadata]
33
-
34
- chunks = []
35
- start = 0
36
- index = 0
37
-
38
- while start < text.length
39
- chunk_text = text[start, @chunk_size]
40
- chunks << {text: chunk_text, metadata: base_metadata.merge(chunk: index)}
41
- break if start + @chunk_size >= text.length
42
-
43
- start += @chunk_size - @chunk_overlap
44
- index += 1
45
- end
46
-
47
- chunks
48
- end
49
- end
50
- end
51
- end
@@ -1,105 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- module Splitter
5
- # Splits text recursively using a prioritised list of separator strings.
6
- #
7
- # The splitter tries each separator in order. When a separator produces
8
- # chunks that are still larger than +chunk_size+, it recurses with the
9
- # next separator in the list. This mirrors LangChain's
10
- # RecursiveCharacterTextSplitter behaviour.
11
- #
12
- # Default separators (in priority order):
13
- # 1. "\n\n" — paragraph breaks
14
- # 2. "\n" — line breaks
15
- # 3. ". " — sentence boundaries
16
- # 4. " " — word boundaries
17
- # 5. "" — character-level fallback
18
- #
19
- # @example
20
- # splitter = Phronomy::Splitter::RecursiveSplitter.new(chunk_size: 300, chunk_overlap: 30)
21
- # chunks = splitter.split({ text: long_markdown, metadata: { source: "guide.md" } })
22
- class RecursiveSplitter < Base
23
- DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""].freeze
24
-
25
- # @param chunk_size [Integer] maximum characters per chunk (default: 1000)
26
- # @param chunk_overlap [Integer] overlap characters (default: 200)
27
- # @param separators [Array<String>] separator list in priority order
28
- # @api public
29
- def initialize(chunk_size: 1000, chunk_overlap: 200, separators: DEFAULT_SEPARATORS)
30
- raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
31
-
32
- @chunk_size = chunk_size
33
- @chunk_overlap = chunk_overlap
34
- @separators = separators
35
- end
36
-
37
- # @param document [Hash, String]
38
- # @return [Array<Hash>]
39
- # @api public
40
- def split(document)
41
- doc = normalise(document)
42
- texts = recursive_split(doc[:text], @separators)
43
- merge_with_overlap(texts).each_with_index.map do |text, idx|
44
- {text: text, metadata: doc[:metadata].merge(chunk: idx)}
45
- end
46
- end
47
-
48
- private
49
-
50
- # Split +text+ using the first separator that yields non-trivial pieces,
51
- # then recurse on any piece that is still too large.
52
- def recursive_split(text, separators)
53
- return [text] if text.length <= @chunk_size || separators.empty?
54
-
55
- sep, *rest_seps = separators
56
-
57
- # Character-level fallback: just slice
58
- if sep == ""
59
- return FixedSizeSplitter
60
- .new(chunk_size: @chunk_size, chunk_overlap: @chunk_overlap)
61
- .split(text)
62
- .map { |c| c[:text] }
63
- end
64
-
65
- parts = text.split(sep)
66
-
67
- # If this separator doesn't split, try the next
68
- return recursive_split(text, rest_seps) if parts.length <= 1
69
-
70
- # Re-attach the separator to each part except the last so context is preserved
71
- parts_with_sep = parts.each_with_index.map do |part, i|
72
- (i < parts.length - 1) ? part + sep : part
73
- end
74
-
75
- parts_with_sep.flat_map do |part|
76
- if part.length > @chunk_size
77
- recursive_split(part, rest_seps)
78
- else
79
- [part]
80
- end
81
- end.reject { |t| t.strip.empty? }
82
- end
83
-
84
- # Merge small adjacent pieces and apply overlap between chunks.
85
- def merge_with_overlap(texts)
86
- merged = []
87
- current = +""
88
-
89
- texts.each do |text|
90
- if current.length + text.length <= @chunk_size
91
- current << text
92
- else
93
- merged << current.strip unless current.strip.empty?
94
- # Start next chunk with overlap from the end of current
95
- overlap_text = (current.length > @chunk_overlap) ? current[-@chunk_overlap..] : current
96
- current = overlap_text + text
97
- end
98
- end
99
-
100
- merged << current.strip unless current.strip.empty?
101
- merged
102
- end
103
- end
104
- end
105
- end
@@ -1,12 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- module Phronomy
4
- # Text splitter implementations for chunking documents before embedding.
5
- #
6
- # Sub-classes are auto-loaded by Zeitwerk:
7
- # Phronomy::Splitter::Base
8
- # Phronomy::Splitter::FixedSizeSplitter
9
- # Phronomy::Splitter::RecursiveSplitter
10
- module Splitter
11
- end
12
- end