phronomy 0.7.1 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +35 -45
  3. data/benchmark/baseline.json +1 -1
  4. data/benchmark/bench_agent_invoke.rb +1 -1
  5. data/benchmark/bench_context_assembler.rb +11 -3
  6. data/benchmark/bench_regression.rb +11 -11
  7. data/benchmark/bench_token_estimator.rb +5 -5
  8. data/benchmark/bench_tool_schema.rb +2 -2
  9. data/docs/decisions/011-build-context-as-single-llm-input-authority.md +224 -0
  10. data/lib/phronomy/agent/base.rb +268 -403
  11. data/lib/phronomy/agent/checkpoint.rb +118 -0
  12. data/lib/phronomy/agent/concerns/suspendable.rb +6 -6
  13. data/lib/phronomy/agent/context/capability/base.rb +689 -0
  14. data/lib/phronomy/agent/context/capability/scope_policy.rb +54 -0
  15. data/lib/phronomy/agent/context/instruction/prompt_template.rb +102 -0
  16. data/lib/phronomy/agent/context/knowledge/base.rb +58 -0
  17. data/lib/phronomy/agent/context/knowledge/entity_knowledge.rb +102 -0
  18. data/lib/phronomy/agent/context/knowledge/static_knowledge.rb +58 -0
  19. data/lib/phronomy/agent/fsm.rb +1 -1
  20. data/lib/phronomy/agent/invocation_pipeline.rb +108 -0
  21. data/lib/phronomy/agent/lifecycle/fsm_session.rb +251 -0
  22. data/lib/phronomy/agent/lifecycle/phase_machine_builder.rb +249 -0
  23. data/lib/phronomy/agent/react_agent.rb +43 -37
  24. data/lib/phronomy/agent/runner.rb +2 -2
  25. data/lib/phronomy/agent/shared_state.rb +2 -2
  26. data/lib/phronomy/agent/tool_executor.rb +108 -0
  27. data/lib/phronomy/concurrency/async_queue.rb +157 -0
  28. data/lib/phronomy/concurrency/blocking_adapter_pool.rb +443 -0
  29. data/lib/phronomy/concurrency/cancellation_scope.rb +125 -0
  30. data/lib/phronomy/concurrency/cancellation_token.rb +140 -0
  31. data/lib/phronomy/concurrency/concurrency_gate.rb +157 -0
  32. data/lib/phronomy/concurrency/deadline.rb +65 -0
  33. data/lib/phronomy/{runtime → concurrency}/gate_registry.rb +1 -2
  34. data/lib/phronomy/{runtime → concurrency}/pool_registry.rb +1 -1
  35. data/lib/phronomy/configuration.rb +0 -6
  36. data/lib/phronomy/context.rb +2 -8
  37. data/lib/phronomy/eval/runner.rb +4 -0
  38. data/lib/phronomy/eval/scorer/llm_judge.rb +12 -1
  39. data/lib/phronomy/event_loop.rb +7 -7
  40. data/lib/phronomy/invocation_context.rb +3 -3
  41. data/lib/phronomy/knowledge_source.rb +0 -5
  42. data/lib/phronomy/llm_adapter/ruby_llm.rb +17 -11
  43. data/lib/phronomy/llm_context_window/assembler.rb +191 -0
  44. data/lib/phronomy/{context → llm_context_window}/context_version_cache.rb +1 -1
  45. data/lib/phronomy/{context → llm_context_window}/token_budget.rb +7 -4
  46. data/lib/phronomy/{context → llm_context_window}/token_estimator.rb +3 -3
  47. data/lib/phronomy/{agent → multi_agent}/handoff.rb +6 -6
  48. data/lib/phronomy/{agent → multi_agent}/orchestrator.rb +7 -7
  49. data/lib/phronomy/{agent → multi_agent}/parallel_tool_chat.rb +4 -4
  50. data/lib/phronomy/{agent → multi_agent}/team_coordinator.rb +4 -4
  51. data/lib/phronomy/runtime/runtime_metrics.rb +0 -1
  52. data/lib/phronomy/runtime.rb +20 -6
  53. data/lib/phronomy/task_group.rb +1 -1
  54. data/lib/phronomy/tool.rb +3 -4
  55. data/lib/phronomy/{tool/agent_tool.rb → tools/agent.rb} +6 -6
  56. data/lib/phronomy/{tool/mcp_tool.rb → tools/mcp.rb} +9 -9
  57. data/lib/phronomy/tools/vector_search.rb +70 -0
  58. data/lib/phronomy/tracing/null_tracer.rb +3 -1
  59. data/lib/phronomy/vector_store/async_backend.rb +4 -4
  60. data/lib/phronomy/vector_store/base.rb +2 -2
  61. data/lib/phronomy/vector_store/embeddings/base.rb +41 -0
  62. data/lib/phronomy/vector_store/embeddings/ruby_llm_embeddings.rb +47 -0
  63. data/lib/phronomy/vector_store/in_memory.rb +12 -2
  64. data/lib/phronomy/vector_store/loader/base.rb +27 -0
  65. data/lib/phronomy/vector_store/loader/csv_loader.rb +58 -0
  66. data/lib/phronomy/vector_store/loader/markdown_loader.rb +78 -0
  67. data/lib/phronomy/vector_store/loader/plain_text_loader.rb +24 -0
  68. data/lib/phronomy/vector_store/pgvector.rb +2 -2
  69. data/lib/phronomy/vector_store/redis_search.rb +2 -2
  70. data/lib/phronomy/vector_store/splitter/base.rb +49 -0
  71. data/lib/phronomy/vector_store/splitter/fixed_size_splitter.rb +53 -0
  72. data/lib/phronomy/vector_store/splitter/recursive_splitter.rb +107 -0
  73. data/lib/phronomy/vector_store.rb +14 -2
  74. data/lib/phronomy/version.rb +1 -1
  75. data/lib/phronomy/workflow_context.rb +8 -0
  76. data/lib/phronomy/workflow_runner.rb +11 -131
  77. data/lib/phronomy.rb +2 -0
  78. data/scripts/api_snapshot.rb +11 -9
  79. metadata +44 -46
  80. data/lib/phronomy/async_queue.rb +0 -155
  81. data/lib/phronomy/blocking_adapter_pool.rb +0 -435
  82. data/lib/phronomy/cancellation_scope.rb +0 -123
  83. data/lib/phronomy/cancellation_token.rb +0 -133
  84. data/lib/phronomy/concurrency_gate.rb +0 -155
  85. data/lib/phronomy/context/assembler.rb +0 -143
  86. data/lib/phronomy/context/compaction_context.rb +0 -111
  87. data/lib/phronomy/context/trigger_context.rb +0 -39
  88. data/lib/phronomy/context/trim_context.rb +0 -75
  89. data/lib/phronomy/deadline.rb +0 -63
  90. data/lib/phronomy/embeddings/base.rb +0 -39
  91. data/lib/phronomy/embeddings/ruby_llm_embeddings.rb +0 -45
  92. data/lib/phronomy/embeddings.rb +0 -11
  93. data/lib/phronomy/fsm_session.rb +0 -247
  94. data/lib/phronomy/knowledge_source/base.rb +0 -54
  95. data/lib/phronomy/knowledge_source/entity_knowledge.rb +0 -96
  96. data/lib/phronomy/knowledge_source/rag_knowledge.rb +0 -57
  97. data/lib/phronomy/knowledge_source/static_knowledge.rb +0 -52
  98. data/lib/phronomy/loader/base.rb +0 -25
  99. data/lib/phronomy/loader/csv_loader.rb +0 -56
  100. data/lib/phronomy/loader/markdown_loader.rb +0 -76
  101. data/lib/phronomy/loader/plain_text_loader.rb +0 -22
  102. data/lib/phronomy/loader.rb +0 -13
  103. data/lib/phronomy/prompt_template.rb +0 -96
  104. data/lib/phronomy/splitter/base.rb +0 -47
  105. data/lib/phronomy/splitter/fixed_size_splitter.rb +0 -51
  106. data/lib/phronomy/splitter/recursive_splitter.rb +0 -105
  107. data/lib/phronomy/splitter.rb +0 -12
  108. data/lib/phronomy/tool/base.rb +0 -644
  109. data/lib/phronomy/tool/scope_policy.rb +0 -50
  110. data/lib/phronomy/tool_executor.rb +0 -106
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module Tools
5
+ # A Capability::Base subclass that wraps a {Phronomy::VectorStore::Base} and
6
+ # a {Phronomy::VectorStore::Embeddings::Base} adapter so that an agent can
7
+ # perform semantic search as a tool call.
8
+ #
9
+ # Do not instantiate this class directly. Use the factory method
10
+ # {.from_store} to produce a configured subclass, then pass it to your agent.
11
+ #
12
+ # @example
13
+ # store = Phronomy::VectorStore::InMemory.new
14
+ # emb = Phronomy::VectorStore::Embeddings::RubyLLMEmbeddings.new(model: "...")
15
+ # tool = Phronomy::Tools::VectorSearch.from_store(store, embeddings: emb,
16
+ # k: 3, tool_name: "search_docs",
17
+ # description: "Search the company knowledge base.")
18
+ # agent = MyAgent.new
19
+ # agent.tools tool
20
+ #
21
+ # @api public
22
class VectorSearch < Phronomy::Agent::Context::Capability::Base
  description "Search for relevant documents using semantic similarity."
  param :query, type: :string, desc: "The natural-language search query"

  # Produce a tool class bound to the given store and embeddings adapter.
  #
  # The returned anonymous subclass captures +store+, +embeddings+ and +k+
  # in closures, so its instances are built without constructor arguments.
  #
  # @param store [Phronomy::VectorStore::Base]
  # @param embeddings [Phronomy::VectorStore::Embeddings::Base]
  # @param k [Integer] number of results requested from the store (default 5)
  # @param tool_name [String] name exposed to the LLM
  # @param description [String, nil] optional description override
  # @return [Class] anonymous subclass of VectorSearch configured with the given store
  # @api public
  def self.from_store(store, embeddings:, k: 5, tool_name: "vector_search", description: nil)
    summary = description || "Search the vector store for documents similar to the query."

    Class.new(self).tap do |configured|
      configured.tool_name(tool_name)
      configured.description(summary)

      configured.define_method(:initialize) do
        @store = store
        @embeddings = embeddings
        @k = k
      end

      # Embeds the query, searches the store, and renders one numbered line
      # per hit. The line text is the first of metadata[:content],
      # metadata[:text], or the stringified metadata.
      configured.define_method(:execute) do |query:|
        vector = @embeddings.embed(query)
        hits = @store.search(query_embedding: vector, k: @k)

        if hits.empty?
          "No results found."
        else
          lines = hits.each_with_index.map do |hit, offset|
            body = hit.dig(:metadata, :content) || hit.dig(:metadata, :text) || hit[:metadata].to_s
            "[#{offset + 1}] (score: #{hit[:score].round(3)}) #{body}"
          end
          lines.join("\n")
        end
      end
    end
  end

  # Placeholder implementation; only subclasses built by {.from_store}
  # can actually run a search.
  #
  # @param query [String] the search query
  # @raise [NotImplementedError] always
  # @api public
  def execute(query:)
    raise NotImplementedError, "Use VectorSearch.from_store to create a configured instance"
  end
end
69
+ end
70
+ end
@@ -16,7 +16,9 @@ module Phronomy
16
16
  # Returns a minimal span object with the given name.
17
17
  def start_span(name, **) = SpanStruct.new(name)
18
18
 
19
- # Does nothing.
19
+ # Does nothing. Explicit nil is equivalent to an empty method body; the
20
+ # mutation "remove nil" is accepted as it does not change observable behaviour.
21
+ # mutant:disable
20
22
  def finish_span(span, **) = nil
21
23
  end
22
24
  end
@@ -36,7 +36,7 @@ module Phronomy
36
36
  # @param id [String]
37
37
  # @param embedding [Array<Float>]
38
38
  # @param metadata [Hash]
39
- # @param cancellation_token [Phronomy::CancellationToken, nil]
39
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
40
40
  # @param timeout [Numeric, nil]
41
41
  # @return [BlockingAdapterPool::PendingOperation]
42
42
  # @api public
@@ -56,7 +56,7 @@ module Phronomy
56
56
  #
57
57
  # @param query_embedding [Array<Float>]
58
58
  # @param k [Integer]
59
- # @param cancellation_token [Phronomy::CancellationToken, nil]
59
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
60
60
  # @param timeout [Numeric, nil]
61
61
  # @return [BlockingAdapterPool::PendingOperation]
62
62
  # @api public
@@ -75,7 +75,7 @@ module Phronomy
75
75
  # Override to use a native async driver.
76
76
  #
77
77
  # @param id [String]
78
- # @param cancellation_token [Phronomy::CancellationToken, nil]
78
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
79
79
  # @param timeout [Numeric, nil]
80
80
  # @return [BlockingAdapterPool::PendingOperation]
81
81
  # @api public
@@ -93,7 +93,7 @@ module Phronomy
93
93
  # Submits the clear call to {BlockingAdapterPool} by default.
94
94
  # Override to use a native async driver.
95
95
  #
96
- # @param cancellation_token [Phronomy::CancellationToken, nil]
96
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
97
97
  # @param timeout [Numeric, nil]
98
98
  # @return [BlockingAdapterPool::PendingOperation]
99
99
  # @api public
@@ -19,7 +19,7 @@ module Phronomy
19
19
  # @param id [String] unique document identifier
20
20
  # @param embedding [Array<Float>] vector embedding
21
21
  # @param metadata [Hash] arbitrary metadata (e.g. the original message object)
22
- # @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
22
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional; raises CancellationError when cancelled
23
23
  # @api public
24
24
  def add(id:, embedding:, metadata: {}, cancellation_token: nil)
25
25
  cancellation_token&.raise_if_cancelled!
@@ -30,7 +30,7 @@ module Phronomy
30
30
  #
31
31
  # @param query_embedding [Array<Float>]
32
32
  # @param k [Integer] number of results
33
- # @param cancellation_token [Phronomy::CancellationToken, nil] optional; raises CancellationError when cancelled
33
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional; raises CancellationError when cancelled
34
34
  # @return [Array<Hash>] each element: { id:, score:, metadata: }
35
35
  # @api public
36
36
  def search(query_embedding:, k: 5, cancellation_token: nil)
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Embeddings
6
+ # Abstract interface for embedding adapters.
7
+ #
8
+ # Concrete implementations must override {#embed} and return a vector
9
+ # as an +Array<Float>+.
10
class Base
  # Turn +text+ into its vector representation. Subclasses must override
  # this; the base implementation only honours the cancellation token and
  # then raises.
  #
  # @param text [String] the text to embed
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional;
  #   its +raise_if_cancelled!+ is invoked before anything else
  # @return [Array<Float>] the embedding vector
  # @raise [NotImplementedError] when not overridden by a subclass
  # @api public
  def embed(text, cancellation_token = nil)
    cancellation_token.raise_if_cancelled! unless cancellation_token.nil?

    message = "#{self.class}#embed is not implemented"
    raise NotImplementedError, message
  end

  # Schedule an {#embed} call on the runtime's blocking-I/O pool
  # (+Phronomy::Runtime.instance.blocking_io+) and return whatever its
  # +submit+ returns.
  #
  # @param text [String]
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  #   handed both to the pool and to {#embed}
  # @param timeout [Numeric, nil] seconds before the operation is abandoned
  # @return [BlockingAdapterPool::PendingOperation]
  # @api public
  def embed_async(text, cancellation_token = nil, timeout: nil)
    pool = Phronomy::Runtime.instance.blocking_io
    pool.submit(timeout: timeout, cancellation_token: cancellation_token) { embed(text, cancellation_token) }
  end
end
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Embeddings
6
+ # Embeddings adapter backed by RubyLLM.
7
+ #
8
+ # Delegates to +RubyLLM.embed+ and returns the resulting vector as an
9
+ # +Array<Float>+.
10
+ #
11
+ # @example Default model
12
+ # embeddings = Phronomy::VectorStore::Embeddings::RubyLLMEmbeddings.new
13
+ # vector = embeddings.embed("Hello, world!")
14
+ #
15
+ # @example Explicit model
16
+ # embeddings = Phronomy::VectorStore::Embeddings::RubyLLMEmbeddings.new(model: "text-embedding-3-small")
17
+ # vector = embeddings.embed("Hello, world!")
18
class RubyLLMEmbeddings < Base
  # @param model [String, nil] embedding model identifier; nil uses the RubyLLM default
  # @param provider [Symbol, nil] provider override (e.g. :openai); nil uses the RubyLLM default
  # @param assume_model_exists [Boolean] when true, skips RubyLLM model-registry validation
  #   (useful for locally hosted models not in the registry)
  # @api public
  def initialize(model: nil, provider: nil, assume_model_exists: false)
    @model, @provider, @assume_model_exists = model, provider, assume_model_exists
  end

  # Embed +text+ through +RubyLLM.embed+ and return the +vectors+ of the result.
  #
  # Only options that were actually configured are forwarded, so RubyLLM's
  # own defaults apply to everything left unset.
  #
  # @param text [String]
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil] optional;
  #   its +raise_if_cancelled!+ is invoked before the request is made
  # @return [Array<Float>]
  # @api public
  def embed(text, cancellation_token = nil)
    cancellation_token.raise_if_cancelled! unless cancellation_token.nil?

    options = {model: @model, provider: @provider}.select { |_name, value| value }
    options[:assume_model_exists] = true if @assume_model_exists

    RubyLLM.embed(text, **options).vectors
  end
end
45
+ end
46
+ end
47
+ end
@@ -25,7 +25,7 @@ module Phronomy
25
25
  # @param id [String]
26
26
  # @param embedding [Array<Float>]
27
27
  # @param metadata [Hash]
28
- # @param cancellation_token [Phronomy::CancellationToken, nil]
28
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
29
29
  # @api public
30
30
  def add(id:, embedding:, metadata: {}, cancellation_token: nil)
31
31
  cancellation_token&.raise_if_cancelled!
@@ -38,9 +38,14 @@ module Phronomy
38
38
 
39
39
  # @param query_embedding [Array<Float>]
40
40
  # @param k [Integer]
41
- # @param cancellation_token [Phronomy::CancellationToken, nil]
41
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
42
42
  # @return [Array<Hash>] sorted by descending score
43
43
  # @api public
44
+ # mutant:disable - genuine equivalent mutations: doc.fetch(:embedding) vs doc[:embedding] (key
45
+ # always present); {id:, score:, metadata: doc.fetch(:metadata)} shorthand+fetch vs []
46
+ # (key always present); -r.fetch(:score) vs -r[:score] (key always present); snapshot = @documents
47
+ # vs .dup is equivalent in single-threaded tests (GVL makes Hash#dup atomic, no behaviour
48
+ # difference under test isolation)
44
49
  def search(query_embedding:, k: 5, cancellation_token: nil)
45
50
  cancellation_token&.raise_if_cancelled!
46
51
  k = validate_k!(k)
@@ -77,6 +82,11 @@ module Phronomy
77
82
 
78
83
  private
79
84
 
85
+ # mutant:disable - empty-vector early-return condition variants (if false, if nil, if a.empty?,
86
+ # if b.empty?, if a.empty? && b.empty?, if a.empty? || false, if false || b.empty?,
87
+ # if nil || b.empty?, if nil && b.empty?) are genuine equivalents: dimension validation in
88
+ # #add and #search enforces same-size embeddings, so a.empty? iff b.empty?; when both are
89
+ # empty norm_a = sqrt(0) = 0 so the norm_a.zero? guard returns 0.0 anyway
80
90
  def cosine_similarity(a, b)
81
91
  return 0.0 if a.empty? || b.empty?
82
92
 
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Loader
6
+ # Abstract base class for document loaders.
7
+ #
8
+ # A loader converts an external source (file path, URL, etc.) into an
9
+ # Array of document hashes understood by the rest of the pipeline:
10
+ #
11
+ # [{ text: String, metadata: Hash }, ...]
12
+ #
13
+ # Subclasses must implement {#load}.
14
class Base
  # Read +source+ and turn it into document hashes. This base version is
  # a placeholder that always raises; concrete loaders override it.
  #
  # @param source [String] file path, URL, or other source identifier
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
  # @raise [NotImplementedError] when not overridden by a subclass
  # @api public
  def load(source)
    message = "#{self.class}#load is not implemented"
    raise NotImplementedError, message
  end
end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module Phronomy
6
+ module VectorStore
7
+ module Loader
8
+ # Loads a CSV file, converting each row into a separate document.
9
+ #
10
+ # By default the first row is treated as a header and column names are
11
+ # available in the document metadata. The full row is serialised to
12
+ # a human-readable "key: value" string for embedding.
13
+ #
14
+ # @example
15
+ # loader = Phronomy::VectorStore::Loader::CsvLoader.new
16
+ # docs = loader.load("products.csv")
17
+ # # => [
18
+ # # { text: "name: Widget\nprice: 9.99", metadata: { source: "...", row: 1, name: "Widget", price: "9.99" } },
19
+ # # ...
20
+ # # ]
21
class CsvLoader < Base
  # @param headers [Boolean] treat the first row as headers (default: true);
  #   the value is passed straight through to +CSV.read+
  # @param text_column [String, Symbol, nil] if set, use only this column as the document text
  # @api public
  def initialize(headers: true, text_column: nil)
    @headers = headers
    @text_column = text_column
  end

  # Read the CSV file at +source+ and build one document per data row.
  #
  # With headers, the text is either the +text_column+ cell or every
  # "header: value" pair joined by newlines, and the metadata carries the
  # symbolised columns plus +:source+ and a 1-based +:row+ (which override
  # columns of the same name). Without headers, the text is the cells
  # joined by ", " and the metadata holds only +:source+ and +:row+.
  #
  # @param source [String] path to a CSV file
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    rows = CSV.read(source, headers: @headers, encoding: "UTF-8")

    rows.each_with_index.map do |row, idx|
      if @headers
        headed_document(row, source, idx + 1)
      else
        plain_document(row, source, idx + 1)
      end
    end
  end

  private

  # Build the document for one row of a CSV that has a header line.
  def headed_document(row, source, row_number)
    # A row with more cells than the header line gets nil header keys.
    # Those cells have no column name, so they are left out; previously
    # they made transform_keys(&:to_sym) raise NoMethodError.
    cells = row.to_h.reject { |header, _value| header.nil? }

    text = if @text_column
      # Try the key exactly as given first, then its String form, so a
      # Symbol text_column also matches the usual String headers.
      cells.fetch(@text_column) { cells[@text_column.to_s] }.to_s
    else
      cells.map { |header, value| "#{header}: #{value}" }.join("\n")
    end

    {text: text, metadata: cells.transform_keys(&:to_sym).merge(source: source, row: row_number)}
  end

  # Build the document for one row of a CSV without a header line.
  def plain_document(row, source, row_number)
    {text: row.join(", "), metadata: {source: source, row: row_number}}
  end
end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Loader
6
+ # Loads a Markdown file, optionally splitting it into one document per heading.
7
+ #
8
+ # When +split_on_headings:+ is true (the default), each H1–H6 section
9
+ # becomes a separate document so that embeddings capture section semantics
10
+ # rather than the full file at once.
11
+ #
12
+ # @example Single document (heading split disabled)
13
+ # loader = Phronomy::VectorStore::Loader::MarkdownLoader.new(split_on_headings: false)
14
+ # docs = loader.load("README.md")
15
+ # # => [{ text: "# Title\n...", metadata: { source: "README.md" } }]
16
+ #
17
+ # @example Split per heading (default)
18
+ # loader = Phronomy::VectorStore::Loader::MarkdownLoader.new
19
+ # docs = loader.load("guide.md")
20
+ # # => [
21
+ # # { text: "# Section 1\n...", metadata: { source: "guide.md", section: "Section 1" } },
22
+ # # { text: "## Sub-section\n...", metadata: { source: "guide.md", section: "Sub-section" } },
23
+ # # ]
24
class MarkdownLoader < Base
  # ATX heading: one to six "#" characters, whitespace, then the title.
  HEADING_RE = /^(\#{1,6})\s+(.+)$/

  # Fence marker of a fenced code block: up to three spaces of indentation
  # followed by a run of at least three backticks or tildes.
  FENCE_RE = /\A {0,3}(`{3,}|~{3,})/

  # @param split_on_headings [Boolean] split on H1–H6 boundaries (default: true)
  # @api public
  def initialize(split_on_headings: true)
    @split_on_headings = split_on_headings
  end

  # Read the Markdown file at +source+ and return it either as a single
  # document or as one document per heading section.
  #
  # @param source [String] path to a Markdown file
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    content = File.read(source, encoding: "UTF-8")
    return [{text: content, metadata: {source: source}}] unless @split_on_headings

    split_by_headings(content, source)
  end

  private

  # Walk the file line by line and start a new section at every heading.
  # Lines inside a fenced code block are never treated as headings, so a
  # "# comment" in a shell or Ruby snippet no longer splits the document.
  def split_by_headings(content, source)
    sections = []
    current_lines = []
    current_heading = nil
    open_fence = nil

    content.each_line do |line|
      open_fence = fence_state_after(open_fence, line)
      heading = open_fence ? nil : HEADING_RE.match(line.chomp)

      if heading
        flush_section(sections, current_lines, current_heading, source) if current_lines.any?
        current_heading = heading[2].strip
        current_lines = [line]
      else
        current_lines << line
      end
    end

    flush_section(sections, current_lines, current_heading, source) if current_lines.any?

    # Fall back to a single document when nothing was collected
    # (for example an empty or whitespace-only file).
    sections.empty? ? [{text: content, metadata: {source: source}}] : sections
  end

  # Return the fence marker that is open after +line+ has been read, or nil
  # when no fenced code block is open. A fence closes only on a line that
  # uses the same character, is at least as long as the opening run, and
  # carries nothing but whitespace after the marker.
  def fence_state_after(open_fence, line)
    found = FENCE_RE.match(line)
    return open_fence unless found

    marker = found[1]
    return marker if open_fence.nil?

    closes = marker[0] == open_fence[0] &&
      marker.length >= open_fence.length &&
      found.post_match.strip.empty?
    closes ? nil : open_fence
  end

  # Append the collected lines as one section unless they are blank.
  # The heading title, when present, is stored under +:section+.
  def flush_section(sections, lines, heading, source)
    text = lines.join
    return if text.strip.empty?

    metadata = {source: source}
    metadata[:section] = heading if heading
    sections << {text: text, metadata: metadata}
  end
end
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Loader
6
+ # Loads a plain-text file as a single document.
7
+ #
8
+ # @example
9
+ # loader = Phronomy::VectorStore::Loader::PlainTextLoader.new
10
+ # docs = loader.load("/path/to/file.txt")
11
+ # # => [{ text: "...", metadata: { source: "/path/to/file.txt" } }]
12
class PlainTextLoader < Base
  # Read the whole file as UTF-8 and wrap it in one document whose metadata
  # records the path it came from.
  #
  # @param source [String] absolute or relative path to a text file
  # @return [Array<Hash>] single-element array with the file contents
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    [{text: File.read(source, encoding: "UTF-8"), metadata: {source: source}}]
  end
end
22
+ end
23
+ end
24
+ end
@@ -38,7 +38,7 @@ module Phronomy
38
38
  # @param id [String]
39
39
  # @param embedding [Array<Float>]
40
40
  # @param metadata [Hash]
41
- # @param cancellation_token [Phronomy::CancellationToken, nil]
41
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
42
42
  # @api public
43
43
  def add(id:, embedding:, metadata: {}, cancellation_token: nil)
44
44
  cancellation_token&.raise_if_cancelled!
@@ -52,7 +52,7 @@ module Phronomy
52
52
 
53
53
  # @param query_embedding [Array<Float>]
54
54
  # @param k [Integer]
55
- # @param cancellation_token [Phronomy::CancellationToken, nil]
55
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
56
56
  # @return [Array<Hash>] sorted by descending similarity score
57
57
  # @api public
58
58
  def search(query_embedding:, k: 5, cancellation_token: nil)
@@ -49,7 +49,7 @@ module Phronomy
49
49
  # @param id [String]
50
50
  # @param embedding [Array<Float>]
51
51
  # @param metadata [Hash]
52
- # @param cancellation_token [Phronomy::CancellationToken, nil]
52
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
53
53
  # @api public
54
54
  def add(id:, embedding:, metadata: {}, cancellation_token: nil)
55
55
  cancellation_token&.raise_if_cancelled!
@@ -68,7 +68,7 @@ module Phronomy
68
68
 
69
69
  # @param query_embedding [Array<Float>]
70
70
  # @param k [Integer]
71
- # @param cancellation_token [Phronomy::CancellationToken, nil]
71
+ # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
72
72
  # @return [Array<Hash>] sorted by descending similarity score
73
73
  # @api public
74
74
  def search(query_embedding:, k: 5, cancellation_token: nil)
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Splitter
6
+ # Abstract base class for text splitters.
7
+ #
8
+ # A splitter takes a single document hash (or plain text) and returns an
9
+ # array of smaller chunk documents:
10
+ #
11
+ # [{ text: String, metadata: Hash }, ...]
12
+ #
13
+ # Subclasses must implement {#split}.
14
class Base
  # Split +document+ into an array of chunk documents.
  #
  # @param document [Hash, String]
  #   Either a document hash (<tt>{ text: String, metadata: Hash }</tt>)
  #   returned by a Loader, or a plain String.
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
  # @raise [NotImplementedError] when not overridden by a subclass
  # @api public
  def split(document)
    raise NotImplementedError, "#{self.class}#split is not implemented"
  end

  # Convenience method: split every document and return all chunks in one
  # flat array.
  #
  # @param documents [Array<Hash, String>]
  # @return [Array<Hash>]
  # @api public
  def split_all(documents)
    documents.flat_map { |doc| split(doc) }
  end

  private

  # Normalise a document-or-string argument into {text:, metadata:}.
  #
  # The metadata is always a Hash: a missing or nil +:metadata+ entry
  # becomes an empty Hash, so callers can merge into it without a nil check.
  #
  # @raise [ArgumentError] when +document+ is neither a Hash nor a String
  def normalise(document)
    case document
    when Hash then {text: document[:text].to_s, metadata: document[:metadata] || {}}
    when String then {text: document, metadata: {}}
    else raise ArgumentError, "document must be a Hash or String, got #{document.class}"
    end
  end
end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Splitter
6
+ # Splits text into fixed-size character chunks with optional overlap.
7
+ #
8
+ # @example
9
+ # splitter = Phronomy::VectorStore::Splitter::FixedSizeSplitter.new(chunk_size: 200, chunk_overlap: 20)
10
+ # chunks = splitter.split({ text: long_text, metadata: { source: "doc.txt" } })
11
+ # # => [
12
+ # # { text: "...(200 chars)...", metadata: { source: "doc.txt", chunk: 0 } },
13
+ # # { text: "...(200 chars, 20-char overlap)...", metadata: { source: "doc.txt", chunk: 1 } },
14
+ # # ]
15
class FixedSizeSplitter < Base
  # @param chunk_size [Integer] maximum characters per chunk (default: 1000);
  #   must be positive
  # @param chunk_overlap [Integer] characters to repeat at the start of each
  #   subsequent chunk (default: 200); must be non-negative and less than chunk_size
  # @raise [ArgumentError] when the sizes are out of range
  # @api public
  def initialize(chunk_size: 1000, chunk_overlap: 200)
    # A non-positive size yields empty or nil chunks, and a negative overlap
    # makes the window jump over text, so both are rejected up front.
    raise ArgumentError, "chunk_size must be positive" if chunk_size <= 0
    raise ArgumentError, "chunk_overlap must not be negative" if chunk_overlap < 0
    raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size

    @chunk_size = chunk_size
    @chunk_overlap = chunk_overlap
  end

  # Cut the document text into windows of at most +chunk_size+ characters.
  # Each window starts +chunk_size - chunk_overlap+ characters after the
  # previous one. Every chunk copies the document metadata and adds its
  # 0-based position under +:chunk+. Empty text produces no chunks.
  #
  # @param document [Hash, String]
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
  # @api public
  def split(document)
    doc = normalise(document)
    text = doc[:text]
    base_metadata = doc[:metadata]
    stride = @chunk_size - @chunk_overlap

    chunks = []
    start = 0

    until start >= text.length
      chunks << {text: text[start, @chunk_size], metadata: base_metadata.merge(chunk: chunks.length)}
      # Stop once this window reaches the end of the text, so no trailing
      # chunk made only of overlap is emitted.
      break if start + @chunk_size >= text.length

      start += stride
    end

    chunks
  end
end
51
+ end
52
+ end
53
+ end