phronomy 0.8.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +31 -41
  3. data/benchmark/baseline.json +1 -1
  4. data/benchmark/bench_agent_invoke.rb +1 -1
  5. data/benchmark/bench_context_assembler.rb +9 -1
  6. data/benchmark/bench_regression.rb +8 -8
  7. data/benchmark/bench_tool_schema.rb +2 -2
  8. data/benchmark/bench_vector_store.rb +1 -1
  9. data/docs/decisions/011-build-context-as-single-llm-input-authority.md +224 -0
  10. data/lib/phronomy/agent/base.rb +253 -351
  11. data/lib/phronomy/agent/concerns/suspendable.rb +6 -6
  12. data/lib/phronomy/agent/context/capability/base.rb +689 -0
  13. data/lib/phronomy/agent/context/capability/scope_policy.rb +54 -0
  14. data/lib/phronomy/agent/context/knowledge/base.rb +58 -0
  15. data/lib/phronomy/agent/context/knowledge/entity_knowledge.rb +102 -0
  16. data/lib/phronomy/agent/context/knowledge/static_knowledge.rb +58 -0
  17. data/lib/phronomy/agent/invocation_pipeline.rb +10 -1
  18. data/lib/phronomy/agent/react_agent.rb +24 -23
  19. data/lib/phronomy/agent/shared_state.rb +2 -2
  20. data/lib/phronomy/agent/tool_executor.rb +1 -1
  21. data/lib/phronomy/concurrency/gate_registry.rb +0 -1
  22. data/lib/phronomy/configuration.rb +0 -6
  23. data/lib/phronomy/llm_context_window/assembler.rb +77 -44
  24. data/lib/phronomy/multi_agent/handoff.rb +4 -4
  25. data/lib/phronomy/multi_agent/orchestrator.rb +1 -1
  26. data/lib/phronomy/multi_agent/team_coordinator.rb +2 -2
  27. data/lib/phronomy/runtime/runtime_metrics.rb +0 -1
  28. data/lib/phronomy/runtime.rb +1 -2
  29. data/lib/phronomy/tool.rb +3 -4
  30. data/lib/phronomy/{tool/agent_tool.rb → tools/agent.rb} +6 -6
  31. data/lib/phronomy/{tool/mcp_tool.rb → tools/mcp.rb} +9 -9
  32. data/lib/phronomy/tools/vector_search.rb +70 -0
  33. data/lib/phronomy/vector_store/async_backend.rb +110 -0
  34. data/lib/phronomy/vector_store/base.rb +89 -0
  35. data/lib/phronomy/vector_store/embeddings/base.rb +41 -0
  36. data/lib/phronomy/vector_store/embeddings/ruby_llm_embeddings.rb +47 -0
  37. data/lib/phronomy/vector_store/in_memory.rb +103 -0
  38. data/lib/phronomy/vector_store/loader/base.rb +27 -0
  39. data/lib/phronomy/vector_store/loader/csv_loader.rb +58 -0
  40. data/lib/phronomy/vector_store/loader/markdown_loader.rb +78 -0
  41. data/lib/phronomy/vector_store/loader/plain_text_loader.rb +24 -0
  42. data/lib/phronomy/vector_store/pgvector.rb +127 -0
  43. data/lib/phronomy/vector_store/redis_search.rb +192 -0
  44. data/lib/phronomy/vector_store/splitter/base.rb +49 -0
  45. data/lib/phronomy/vector_store/splitter/fixed_size_splitter.rb +53 -0
  46. data/lib/phronomy/vector_store/splitter/recursive_splitter.rb +107 -0
  47. data/lib/phronomy/vector_store.rb +16 -4
  48. data/lib/phronomy/version.rb +1 -1
  49. data/lib/phronomy.rb +2 -1
  50. data/scripts/api_snapshot.rb +11 -9
  51. metadata +28 -32
  52. data/lib/phronomy/agent/context/conversation/compaction_context.rb +0 -117
  53. data/lib/phronomy/agent/context/conversation/trigger_context.rb +0 -43
  54. data/lib/phronomy/agent/context/conversation/trim_context.rb +0 -82
  55. data/lib/phronomy/agent/context/knowledge/embeddings/base.rb +0 -45
  56. data/lib/phronomy/agent/context/knowledge/embeddings/ruby_llm_embeddings.rb +0 -51
  57. data/lib/phronomy/agent/context/knowledge/loader/base.rb +0 -31
  58. data/lib/phronomy/agent/context/knowledge/loader/csv_loader.rb +0 -62
  59. data/lib/phronomy/agent/context/knowledge/loader/markdown_loader.rb +0 -82
  60. data/lib/phronomy/agent/context/knowledge/loader/plain_text_loader.rb +0 -28
  61. data/lib/phronomy/agent/context/knowledge/source/base.rb +0 -60
  62. data/lib/phronomy/agent/context/knowledge/source/entity_knowledge.rb +0 -102
  63. data/lib/phronomy/agent/context/knowledge/source/rag_knowledge.rb +0 -63
  64. data/lib/phronomy/agent/context/knowledge/source/static_knowledge.rb +0 -58
  65. data/lib/phronomy/agent/context/knowledge/splitter/base.rb +0 -53
  66. data/lib/phronomy/agent/context/knowledge/splitter/fixed_size_splitter.rb +0 -57
  67. data/lib/phronomy/agent/context/knowledge/splitter/recursive_splitter.rb +0 -111
  68. data/lib/phronomy/agent/context/knowledge/vector_store/async_backend.rb +0 -116
  69. data/lib/phronomy/agent/context/knowledge/vector_store/base.rb +0 -95
  70. data/lib/phronomy/agent/context/knowledge/vector_store/in_memory.rb +0 -109
  71. data/lib/phronomy/agent/context/knowledge/vector_store/pgvector.rb +0 -133
  72. data/lib/phronomy/agent/context/knowledge/vector_store/redis_search.rb +0 -198
  73. data/lib/phronomy/embeddings.rb +0 -11
  74. data/lib/phronomy/loader.rb +0 -13
  75. data/lib/phronomy/splitter.rb +0 -12
  76. data/lib/phronomy/tool/base.rb +0 -685
  77. data/lib/phronomy/tool/scope_policy.rb +0 -50
@@ -0,0 +1,58 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "csv"
4
+
5
+ module Phronomy
6
+ module VectorStore
7
+ module Loader
8
+ # Loads a CSV file, converting each row into a separate document.
9
+ #
10
+ # By default the first row is treated as a header and column names are
11
+ # available in the document metadata. The full row is serialised to
12
+ # a human-readable "key: value" string for embedding.
13
+ #
14
+ # @example
15
+ # loader = Phronomy::VectorStore::Loader::CsvLoader.new
16
+ # docs = loader.load("products.csv")
17
+ # # => [
18
+ # # { text: "name: Widget\nprice: 9.99", metadata: { source: "...", row: 1, name: "Widget", price: "9.99" } },
19
+ # # ...
20
+ # # ]
21
class CsvLoader < Base
  # @param headers [Boolean] treat the first row as headers (default: true)
  # @param text_column [String, Symbol, nil] if set, use only this column as
  #   the document text; only consulted when +headers+ is truthy
  # @api public
  def initialize(headers: true, text_column: nil)
    @headers = headers
    # CSV yields header names as Strings by default, so a Symbol column name
    # is normalised here instead of silently matching nothing on every row.
    @text_column = text_column&.to_s
  end

  # Reads the CSV file at +source+ and returns one document per data row.
  #
  # With headers, every named column is copied into the metadata under a
  # symbolised key; the reserved +:source+ and +:row+ keys are merged last
  # and therefore win over columns of the same name. Without headers, the
  # row's fields are joined with ", " and only +:source+ / +:row+ are set.
  # +:row+ is 1-based and counts the rows returned by CSV.read.
  #
  # @param source [String] path to a CSV file
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    rows = CSV.read(source, headers: @headers, encoding: "UTF-8")
    builder = @headers ? :document_from_named_row : :document_from_plain_row

    rows.each_with_index.map { |row, idx| send(builder, row, source, idx + 1) }
  end

  private

  # Builds the document for a row read in header mode.
  def document_from_named_row(row, source, row_number)
    # A row with more fields than the header line produces a nil header key.
    # Such fields have no name to report (and nil cannot be symbolised), so
    # they are left out of both the text and the metadata.
    fields = row.to_h.reject { |name, _value| name.nil? }

    text = if @text_column
      fields[@text_column].to_s
    else
      fields.map { |name, value| "#{name}: #{value}" }.join("\n")
    end

    metadata = fields.transform_keys(&:to_sym).merge(source: source, row: row_number)
    {text: text, metadata: metadata}
  end

  # Builds the document for a row read without headers (a plain Array).
  def document_from_plain_row(row, source, row_number)
    {text: row.join(", "), metadata: {source: source, row: row_number}}
  end
end
56
+ end
57
+ end
58
+ end
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Loader
6
+ # Loads a Markdown file, optionally splitting it into one document per heading.
7
+ #
8
+ # When +split_on_headings:+ is true (the default), each H1–H6 section
9
+ # becomes a separate document so that embeddings capture section semantics
10
+ # rather than the full file at once.
11
+ #
12
+ # @example Single document (heading split disabled)
13
+ # loader = Phronomy::VectorStore::Loader::MarkdownLoader.new(split_on_headings: false)
14
+ # docs = loader.load("README.md")
15
+ # # => [{ text: "# Title\n...", metadata: { source: "README.md" } }]
16
+ #
17
+ # @example Split per heading (default)
18
+ # loader = Phronomy::VectorStore::Loader::MarkdownLoader.new
19
+ # docs = loader.load("guide.md")
20
+ # # => [
21
+ # # { text: "# Section 1\n...", metadata: { source: "guide.md", section: "Section 1" } },
22
+ # # { text: "## Sub-section\n...", metadata: { source: "guide.md", section: "Sub-section" } },
23
+ # # ]
24
class MarkdownLoader < Base
  # ATX heading: one to six "#" characters, whitespace, then the title.
  HEADING_RE = /^(\#{1,6})\s+(.+)$/
  # Opening or closing marker of a fenced code block: three or more
  # backticks or tildes, indented by at most three spaces.
  FENCE_RE = /\A {0,3}(`{3,}|~{3,})/

  # @param split_on_headings [Boolean] split on H1–H6 boundaries (default: true)
  # @api public
  def initialize(split_on_headings: true)
    @split_on_headings = split_on_headings
  end

  # Reads the Markdown file at +source+ and returns its documents.
  #
  # @param source [String] path to a Markdown file
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>;
  #   a single element when heading splitting is disabled
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    content = File.read(source, encoding: "UTF-8")
    return split_by_headings(content, source) if @split_on_headings

    [{text: content, metadata: {source: source}}]
  end

  private

  # Walks +content+ line by line and starts a new section at every heading.
  # Text before the first heading becomes a section without a +:section+
  # key. Lines inside a fenced code block are never treated as headings, so
  # a "# comment" in a code sample does not split the surrounding section.
  def split_by_headings(content, source)
    sections = []
    current_lines = []
    current_heading = nil
    open_fence = nil

    content.each_line do |line|
      if open_fence
        open_fence = nil if closes_fence?(line, open_fence)
      elsif (fence = FENCE_RE.match(line))
        open_fence = fence[1]
      elsif (heading = HEADING_RE.match(line.chomp))
        flush_section(sections, current_lines, current_heading, source) if current_lines.any?
        current_heading = heading[2].strip
        current_lines = []
      end
      current_lines << line
    end

    flush_section(sections, current_lines, current_heading, source) if current_lines.any?

    # Fall back to a single document when nothing non-blank was collected.
    sections.empty? ? [{text: content, metadata: {source: source}}] : sections
  end

  # True when +line+ closes the fence opened by +open_fence+: same marker
  # character, at least as long, and nothing but whitespace after it.
  def closes_fence?(line, open_fence)
    marker = FENCE_RE.match(line)
    return false unless marker

    candidate = marker[1]
    candidate[0] == open_fence[0] &&
      candidate.length >= open_fence.length &&
      marker.post_match.strip.empty?
  end

  # Appends the accumulated lines as one section unless they are blank.
  def flush_section(sections, lines, heading, source)
    text = lines.join
    return if text.strip.empty?

    metadata = {source: source}
    metadata[:section] = heading if heading
    sections << {text: text, metadata: metadata}
  end
end
76
+ end
77
+ end
78
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Loader
6
+ # Loads a plain-text file as a single document.
7
+ #
8
+ # @example
9
+ # loader = Phronomy::VectorStore::Loader::PlainTextLoader.new
10
+ # docs = loader.load("/path/to/file.txt")
11
+ # # => [{ text: "...", metadata: { source: "/path/to/file.txt" } }]
12
class PlainTextLoader < Base
  # Reads the whole file at +source+ as UTF-8 and wraps it in one document
  # whose metadata records the path it was loaded from.
  #
  # @param source [String] absolute or relative path to a text file
  # @return [Array<Hash>] single-element array with the file contents
  # @raise [Errno::ENOENT] if the file does not exist
  # @api public
  def load(source)
    contents = File.read(source, encoding: "UTF-8")
    document = {text: contents, metadata: {source: source}}
    [document]
  end
end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,127 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Phronomy
6
+ module VectorStore
7
+ # PostgreSQL-backed vector store using the pgvector extension.
8
+ #
9
+ # Requires:
10
+ # - The +pgvector+ gem (add to your Gemfile)
11
+ # - An ActiveRecord model class with the following columns:
12
+ # id (string / uuid)
13
+ # embedding (vector — from the pgvector column type)
14
+ # metadata (text or jsonb — stores arbitrary metadata as JSON)
15
+ #
16
+ # @example Usage
17
+ # store = Phronomy::VectorStore::Pgvector.new(model_class: VectorDocument)
18
+ # store.add(id: "doc1", embedding: [0.1, 0.9], metadata: {text: "hello"})
19
+ # results = store.search(query_embedding: [0.1, 0.8], k: 5)
20
class Pgvector < Base
  # @param model_class [Class] ActiveRecord model with id/embedding/metadata columns
  # @param dimension [Integer, nil] expected embedding dimension for Phronomy-side
  #   pre-validation. When nil, dimension enforcement is delegated to the
  #   database schema; no pre-validation is performed by Phronomy.
  # @raise [LoadError] when the pgvector gem cannot be loaded
  # @api public
  def initialize(model_class:, dimension: nil)
    # Loaded lazily so the gem is only needed when this backend is used.
    begin
      require "pgvector"
    rescue LoadError
      raise LoadError,
        "pgvector gem is required for Phronomy::VectorStore::Pgvector. " \
        "Add `gem 'pgvector'` to your Gemfile."
    end
    @model_class = model_class
    @dimension = dimension
  end

  # Inserts the document, or replaces the row that already has this id.
  #
  # @param id [String]
  # @param embedding [Array<Float>]
  # @param metadata [Hash] stored as a JSON string
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  # @return [self]
  # @api public
  def add(id:, embedding:, metadata: {}, cancellation_token: nil)
    cancellation_token&.raise_if_cancelled!
    validate_embedding_dimension!(embedding, @dimension)
    @model_class.upsert(
      {id: id, embedding: safe_vector(embedding), metadata: metadata.to_json},
      unique_by: :id
    )
    self
  end

  # Returns the +k+ nearest rows by pgvector's +<=>+ distance, reporting
  # +1 - distance+ as the score.
  #
  # @param query_embedding [Array<Float>]
  # @param k [Integer]
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  # @return [Array<Hash>] sorted by descending similarity score
  # @api public
  def search(query_embedding:, k: 5, cancellation_token: nil)
    cancellation_token&.raise_if_cancelled!
    k_safe = validate_k!(k)
    validate_embedding_dimension!(query_embedding, @dimension)
    vec = safe_vector_literal(query_embedding)
    quoted_vec = "#{@model_class.connection.quote(vec)}::vector"
    distance = "embedding <=> #{quoted_vec}"

    @model_class
      .select("id, metadata, 1 - (#{distance}) AS score")
      .order(raw_sql(distance))
      .limit(k_safe)
      .map do |r|
        {
          id: r.id.to_s,
          score: r.score.to_f,
          metadata: parse_metadata(r.metadata)
        }
      end
  end

  # Deletes the row with the given id, if any.
  #
  # @param id [String]
  # @return [self]
  def remove(id:)
    @model_class.where(id: id).delete_all
    self
  end

  # Deletes every row of the backing table.
  #
  # @return [self]
  def clear
    @model_class.delete_all
    self
  end

  # Returns the number of documents in the backing table.
  def size
    @model_class.count
  end

  private

  # Marks +fragment+ as intentionally raw SQL. Active Record (6.0+) rejects
  # plain strings passed to +order+ unless they look like column references;
  # the distance expression is built only from Float()-validated numbers
  # quoted by the connection, so it is safe to mark. Falls back to the plain
  # string when Arel is not loaded (e.g. a non-ActiveRecord test double).
  def raw_sql(fragment)
    defined?(::Arel) ? ::Arel.sql(fragment) : fragment
  end

  # Parses a metadata value returned by the pg driver.
  # Handles NULL (nil), already-parsed Hash, and JSON string forms; anything
  # that is not valid JSON or does not decode to a Hash becomes {}.
  def parse_metadata(raw)
    return {} if raw.nil?
    return symbolize_hash_keys(raw) if raw.is_a?(Hash)

    parsed = JSON.parse(raw.to_s, symbolize_names: true)
    parsed.is_a?(Hash) ? parsed : {}
  rescue JSON::ParserError
    {}
  end

  # Recursively symbolizes keys of an already-parsed value. Hashes nested
  # inside arrays are converted too, so the result has the same shape as the
  # JSON.parse(symbolize_names: true) path in #parse_metadata.
  def symbolize_hash_keys(value)
    case value
    when Hash
      value.each_with_object({}) { |(k, v), h| h[k.to_sym] = symbolize_hash_keys(v) }
    when Array
      value.map { |item| symbolize_hash_keys(item) }
    else
      value
    end
  end

  # Validates that all elements are numeric and converts to a pgvector-
  # compatible literal string (e.g. "[1.0,0.5,-0.3]"). Float() raises for
  # values that cannot be converted.
  def safe_vector_literal(embedding)
    "[#{embedding.map { |v| Float(v) }.join(",")}]"
  end

  # Returns a validated vector for the upsert call.
  def safe_vector(embedding)
    safe_vector_literal(embedding)
  end
end
126
+ end
127
+ end
@@ -0,0 +1,192 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Phronomy
6
+ module VectorStore
7
+ # Redis-backed vector store using the RediSearch module (FT.* commands).
8
+ #
9
+ # Requires:
10
+ # - The +redis+ gem (add to your Gemfile)
11
+ # - A Redis server with the RediSearch (RedisSearch) module enabled
12
+ # (or Redis Stack which bundles RediSearch)
13
+ #
14
+ # Vectors are stored as FLOAT32 binary blobs in Redis Hash fields and
15
+ # searched with KNN (k-nearest-neighbour) queries against a FLAT (brute-force) index.
16
+ #
17
+ # @example Usage
18
+ # redis = Redis.new(url: "redis://localhost:6379")
19
+ # store = Phronomy::VectorStore::RedisSearch.new(redis: redis, dimension: 1536)
20
+ # store.add(id: "doc1", embedding: [0.1, 0.9], metadata: {text: "hello"})
21
+ # results = store.search(query_embedding: [0.1, 0.8], k: 5)
22
class RedisSearch < Base
  DOC_PREFIX = "phronomy_doc:"
  private_constant :DOC_PREFIX

  # @param redis [Redis] configured Redis client
  # @param index_name [String] RediSearch index name
  # @param dimension [Integer, nil] vector dimension; auto-detected on first add.
  #   When connecting to an **existing** RediSearch index, you MUST pass
  #   dimension: explicitly. Without it, a freshly constructed instance
  #   treats the index as uninitialized until #add is called, and #search
  #   silently returns [] in the meantime.
  # @raise [LoadError] when the redis gem cannot be loaded
  # @api public
  def initialize(redis:, index_name: "phronomy_vectors", dimension: nil)
    # Loaded lazily so the gem is only needed when this backend is used.
    begin
      require "redis"
    rescue LoadError
      raise LoadError,
        "redis gem is required for Phronomy::VectorStore::RedisSearch. " \
        "Add `gem 'redis'` to your Gemfile."
    end
    @redis = redis
    @index_name = index_name
    @dimension = dimension
    @index_created = false
    @mutex = Mutex.new
  end

  # Stores the document as a Redis Hash under DOC_PREFIX + id, creating the
  # index first if this instance has not done so yet.
  #
  # @param id [String]
  # @param embedding [Array<Float>]
  # @param metadata [Hash] stored as a JSON string
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  # @return [self]
  # @api public
  def add(id:, embedding:, metadata: {}, cancellation_token: nil)
    cancellation_token&.raise_if_cancelled!
    # Establish expected dimension on first add (not race-free for concurrent
    # first adds), then validate, then create/reuse the index.
    @dimension ||= embedding.size
    validate_embedding_dimension!(embedding, @dimension)
    ensure_index!(@dimension)
    @redis.call(
      "HSET", "#{DOC_PREFIX}#{id}",
      "embedding", pack_vector(embedding),
      "metadata", metadata.to_json
    )
    self
  end

  # Runs a KNN query for the +k+ documents closest to +query_embedding+.
  #
  # @param query_embedding [Array<Float>]
  # @param k [Integer]
  # @param cancellation_token [Phronomy::Concurrency::CancellationToken, nil]
  # @return [Array<Hash>] sorted by descending similarity score
  # @api public
  def search(query_embedding:, k: 5, cancellation_token: nil)
    cancellation_token&.raise_if_cancelled!
    # search never establishes dimension. If dimension is unknown and the
    # index has not been created yet, there are no documents to return.
    return [] if @dimension.nil? && !@index_created

    # Validate every argument before touching Redis so that a bad +k+ does
    # not create the index as a side effect.
    validate_embedding_dimension!(query_embedding, @dimension)
    k_safe = validate_k!(k)
    ensure_index!(@dimension)
    blob = pack_vector(query_embedding)

    raw = @redis.call(
      "FT.SEARCH", @index_name,
      "*=>[KNN #{k_safe} @embedding $BLOB AS score]",
      "PARAMS", 2, "BLOB", blob,
      "SORTBY", "score",
      "RETURN", 2, "score", "metadata",
      "DIALECT", 2
    )

    parse_results(raw)
  end

  # Deletes the Redis key that holds the document with the given id.
  #
  # @param id [String]
  # @return [self]
  def remove(id:)
    @redis.call("DEL", "#{DOC_PREFIX}#{id}")
    self
  end

  # Returns the number of documents indexed.
  # Queries FT.INFO when the index has been created; returns 0 otherwise,
  # and also returns 0 (best effort) when the query raises.
  def size
    return 0 unless @index_created

    raw = @redis.call("FT.INFO", @index_name)
    return 0 unless raw.is_a?(Array)

    idx = raw.index("num_docs")
    idx ? raw[idx + 1].to_i : 0
  rescue
    0
  end

  # Drops the index together with its documents ("DD") and marks it as not
  # created. A missing index is not an error.
  #
  # @return [self]
  def clear
    @mutex.synchronize do
      begin
        @redis.call("FT.DROPINDEX", @index_name, "DD")
      rescue => e
        # Matched case-insensitively: the exact capitalisation of this
        # server message is not something this code controls.
        raise unless e.message.to_s.match?(/unknown index name/i)
      end
      @index_created = false
    end
    self
  end

  private

  # Creates the index once per instance. An "Index already exists" reply is
  # treated as success so an index created earlier can be reused.
  def ensure_index!(dim)
    @mutex.synchronize do
      return if @index_created

      @dimension ||= dim
      begin
        @redis.call(
          "FT.CREATE", @index_name,
          "ON", "HASH",
          "PREFIX", 1, DOC_PREFIX,
          "SCHEMA",
          "embedding", "VECTOR", "FLAT", 6,
          "TYPE", "FLOAT32",
          "DIM", @dimension,
          "DISTANCE_METRIC", "COSINE",
          "metadata", "TEXT"
        )
      rescue => e
        raise unless e.message.to_s.include?("Index already exists")
      end
      @index_created = true
    end
  end

  # Pack a Float array as a FLOAT32 binary string for RediSearch.
  def pack_vector(embedding)
    embedding.map { |v| Float(v) }.pack("f*")
  end

  # Parse the raw FT.SEARCH response into the standard Hash format.
  #
  # Redis FT.SEARCH returns: [count, key1, [field, value, ...], key2, ...]
  # Entries without a field list or without a score are skipped.
  def parse_results(raw)
    return [] unless raw.is_a?(Array) && raw.size >= 2

    raw.drop(1).each_slice(2).each_with_object([]) do |(key, fields), results|
      next unless fields.is_a?(Array)

      field_hash = fields.each_slice(2).to_h
      score_str = field_hash["score"]
      next if score_str.nil?

      results << {
        id: key.to_s.delete_prefix(DOC_PREFIX),
        # RediSearch returns cosine distance (0=identical, 2=opposite);
        # convert to cosine similarity for consistency with other backends.
        score: 1.0 - score_str.to_f,
        metadata: parse_metadata(field_hash["metadata"])
      }
    end
  end

  # Decodes a stored metadata string. A missing, malformed or non-Hash value
  # becomes {} so that one bad document cannot abort the whole search.
  def parse_metadata(raw)
    return {} if raw.nil?

    parsed = JSON.parse(raw.to_s, symbolize_names: true)
    parsed.is_a?(Hash) ? parsed : {}
  rescue JSON::ParserError
    {}
  end
end
191
+ end
192
+ end
@@ -0,0 +1,49 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Splitter
6
+ # Abstract base class for text splitters.
7
+ #
8
+ # A splitter takes a single document hash (or plain text) and returns an
9
+ # array of smaller chunk documents:
10
+ #
11
+ # [{ text: String, metadata: Hash }, ...]
12
+ #
13
+ # Subclasses must implement {#split}.
14
class Base
  # Split +document+ into an array of chunk documents.
  #
  # @param document [Hash, String]
  #   Either a document hash (<tt>{ text: String, metadata: Hash }</tt>)
  #   returned by a Loader, or a plain String.
  # @return [Array<Hash>] array of <tt>{ text: String, metadata: Hash }</tt>
  # @raise [NotImplementedError] when not overridden by a subclass
  # @api public
  def split(document)
    raise NotImplementedError, "#{self.class}#split is not implemented"
  end

  # Convenience method: split an array of documents and return all chunks
  # in one flat array, in input order.
  #
  # @param documents [Array<Hash, String>]
  # @return [Array<Hash>]
  # @api public
  def split_all(documents)
    documents.flat_map { |doc| split(doc) }
  end

  private

  # Normalise a document-or-string argument into {text:, metadata:}.
  #
  # @raise [ArgumentError] when +document+ is neither a Hash nor a String
  def normalise(document)
    case document
    when Hash
      # A missing :metadata key and an explicit +metadata: nil+ both become
      # an empty Hash, so callers can always #merge into the result.
      {text: document[:text].to_s, metadata: document[:metadata] || {}}
    when String
      {text: document, metadata: {}}
    else
      raise ArgumentError, "document must be a Hash or String, got #{document.class}"
    end
  end
end
47
+ end
48
+ end
49
+ end
@@ -0,0 +1,53 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Splitter
6
+ # Splits text into fixed-size character chunks with optional overlap.
7
+ #
8
+ # @example
9
+ # splitter = Phronomy::VectorStore::Splitter::FixedSizeSplitter.new(chunk_size: 200, chunk_overlap: 20)
10
+ # chunks = splitter.split({ text: long_text, metadata: { source: "doc.txt" } })
11
+ # # => [
12
+ # # { text: "...(200 chars)...", metadata: { source: "doc.txt", chunk: 0 } },
13
+ # # { text: "...(200 chars, 20-char overlap)...", metadata: { source: "doc.txt", chunk: 1 } },
14
+ # # ]
15
class FixedSizeSplitter < Base
  # @param chunk_size [Integer] maximum characters per chunk (default: 1000)
  # @param chunk_overlap [Integer] characters to repeat at the start of each
  #   subsequent chunk (default: 200); must be non-negative and less than
  #   chunk_size
  # @raise [ArgumentError] when chunk_overlap is negative or not less than
  #   chunk_size
  # @api public
  def initialize(chunk_size: 1000, chunk_overlap: 200)
    raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
    # A negative overlap would make the stride larger than the chunk and
    # silently drop the text between consecutive chunks. Together with the
    # check above this also guarantees a positive chunk_size.
    raise ArgumentError, "chunk_overlap must not be negative" if chunk_overlap.negative?

    @chunk_size = chunk_size
    @chunk_overlap = chunk_overlap
  end

  # Cuts the document text into windows of at most +chunk_size+ characters,
  # each starting +chunk_size - chunk_overlap+ characters after the previous
  # one. Every chunk carries the document metadata plus a 0-based +:chunk+
  # index. Empty text yields no chunks.
  #
  # @param document [Hash, String]
  # @return [Array<Hash>]
  # @api public
  def split(document)
    doc = normalise(document)
    text = doc[:text]
    base_metadata = doc[:metadata]
    return [] if text.empty?

    stride = @chunk_size - @chunk_overlap
    # Keep adding window starts until a window reaches the end of the text.
    offsets = [0]
    offsets << offsets.last + stride while offsets.last + @chunk_size < text.length

    offsets.each_with_index.map do |offset, index|
      {text: text[offset, @chunk_size], metadata: base_metadata.merge(chunk: index)}
    end
  end
end
51
+ end
52
+ end
53
+ end
@@ -0,0 +1,107 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Phronomy
4
+ module VectorStore
5
+ module Splitter
6
+ # Splits text recursively using a prioritised list of separator strings.
7
+ #
8
+ # The splitter tries each separator in order. When a separator produces
9
+ # chunks that are still larger than +chunk_size+, it recurses with the
10
+ # next separator in the list. This mirrors LangChain's
11
+ # RecursiveCharacterTextSplitter behaviour.
12
+ #
13
+ # Default separators (in priority order):
14
+ # 1. "\n\n" — paragraph breaks
15
+ # 2. "\n" — line breaks
16
+ # 3. ". " — sentence boundaries
17
+ # 4. " " — word boundaries
18
+ # 5. "" — character-level fallback
19
+ #
20
+ # @example
21
+ # splitter = Phronomy::VectorStore::Splitter::RecursiveSplitter.new(chunk_size: 300, chunk_overlap: 30)
22
+ # chunks = splitter.split({ text: long_markdown, metadata: { source: "guide.md" } })
23
class RecursiveSplitter < Base
  DEFAULT_SEPARATORS = ["\n\n", "\n", ". ", " ", ""].freeze

  # @param chunk_size [Integer] maximum characters per chunk (default: 1000)
  # @param chunk_overlap [Integer] overlap characters (default: 200); must be
  #   non-negative and less than chunk_size
  # @param separators [Array<String>] separator list in priority order
  # @raise [ArgumentError] when chunk_overlap is negative or not less than
  #   chunk_size
  # @api public
  def initialize(chunk_size: 1000, chunk_overlap: 200, separators: DEFAULT_SEPARATORS)
    raise ArgumentError, "chunk_overlap must be less than chunk_size" if chunk_overlap >= chunk_size
    # Together with the check above this also guarantees a positive
    # chunk_size and a positive slicing stride in the character fallback.
    raise ArgumentError, "chunk_overlap must not be negative" if chunk_overlap.negative?

    @chunk_size = chunk_size
    @chunk_overlap = chunk_overlap
    @separators = separators
  end

  # Splits the document text into pieces along the configured separators,
  # merges adjacent pieces up to +chunk_size+ and returns the resulting
  # chunks, each carrying the document metadata plus a 0-based +:chunk+
  # index.
  #
  # @param document [Hash, String]
  # @return [Array<Hash>]
  # @api public
  def split(document)
    doc = normalise(document)
    texts = recursive_split(doc[:text], @separators)
    merge_with_overlap(texts).each_with_index.map do |text, idx|
      {text: text, metadata: doc[:metadata].merge(chunk: idx)}
    end
  end

  private

  # Split +text+ using the first separator that yields more than one piece,
  # then recurse with the remaining separators on any piece that is still
  # too large. Text is returned as-is once it fits or no separator is left.
  def recursive_split(text, separators)
    return [text] if text.length <= @chunk_size || separators.empty?

    sep, *rest_seps = separators

    # Character-level fallback: plain, non-overlapping slices. Overlap is
    # added once, by #merge_with_overlap; slicing at chunk_size minus
    # chunk_overlap leaves exactly the room that overlap needs.
    return text.scan(/.{1,#{@chunk_size - @chunk_overlap}}/m) if sep == ""

    parts = text.split(sep)

    # If this separator doesn't split, try the next
    return recursive_split(text, rest_seps) if parts.length <= 1

    last_index = parts.length - 1
    pieces = parts.each_with_index.flat_map do |part, i|
      # Re-attach the separator to each part except the last so context is preserved
      piece = (i < last_index) ? part + sep : part
      (piece.length > @chunk_size) ? recursive_split(piece, rest_seps) : [piece]
    end
    pieces.reject { |piece| piece.strip.empty? }
  end

  # Merge small adjacent pieces and apply overlap between chunks. Chunks are
  # stripped of surrounding whitespace; blank chunks are dropped.
  def merge_with_overlap(texts)
    merged = []
    current = +""

    texts.each do |text|
      if current.length + text.length <= @chunk_size
        current << text
        next
      end

      finished = current.strip
      merged << finished unless finished.empty?
      # Start next chunk with overlap from the end of current
      current = overlap_tail(current, text.length) + text
    end

    finished = current.strip
    merged << finished unless finished.empty?
    merged
  end

  # Returns the tail of +previous+ to repeat at the start of the next chunk:
  # at most chunk_overlap characters, and never more than still fits beside
  # an incoming piece of +incoming_length+ characters. A zero budget must
  # yield "" explicitly, because previous[-0..] is the whole string.
  def overlap_tail(previous, incoming_length)
    budget = [@chunk_overlap, @chunk_size - incoming_length, previous.length].min
    budget.positive? ? previous[-budget..] : ""
  end
end
105
+ end
106
+ end
107
+ end