htm 0.0.1 → 0.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. checksums.yaml +4 -4
  2. data/.aigcm_msg +1 -0
  3. data/.architecture/reviews/comprehensive-codebase-review.md +577 -0
  4. data/.claude/settings.local.json +92 -0
  5. data/.envrc +1 -0
  6. data/.irbrc +283 -80
  7. data/.tbls.yml +31 -0
  8. data/CHANGELOG.md +314 -16
  9. data/CLAUDE.md +603 -0
  10. data/README.md +76 -5
  11. data/Rakefile +5 -0
  12. data/SETUP.md +132 -101
  13. data/db/migrate/{20250101000001_enable_extensions.rb → 00001_enable_extensions.rb} +0 -1
  14. data/db/migrate/00002_create_robots.rb +11 -0
  15. data/db/migrate/00003_create_file_sources.rb +20 -0
  16. data/db/migrate/00004_create_nodes.rb +65 -0
  17. data/db/migrate/00005_create_tags.rb +13 -0
  18. data/db/migrate/00006_create_node_tags.rb +18 -0
  19. data/db/migrate/00007_create_robot_nodes.rb +26 -0
  20. data/db/migrate/00009_add_working_memory_to_robot_nodes.rb +12 -0
  21. data/db/schema.sql +390 -36
  22. data/docs/api/database.md +19 -232
  23. data/docs/api/embedding-service.md +1 -7
  24. data/docs/api/htm.md +305 -364
  25. data/docs/api/index.md +1 -7
  26. data/docs/api/long-term-memory.md +342 -590
  27. data/docs/api/yard/HTM/ActiveRecordConfig.md +23 -0
  28. data/docs/api/yard/HTM/AuthorizationError.md +11 -0
  29. data/docs/api/yard/HTM/CircuitBreaker.md +92 -0
  30. data/docs/api/yard/HTM/CircuitBreakerOpenError.md +34 -0
  31. data/docs/api/yard/HTM/Configuration.md +175 -0
  32. data/docs/api/yard/HTM/Database.md +99 -0
  33. data/docs/api/yard/HTM/DatabaseError.md +14 -0
  34. data/docs/api/yard/HTM/EmbeddingError.md +18 -0
  35. data/docs/api/yard/HTM/EmbeddingService.md +58 -0
  36. data/docs/api/yard/HTM/Error.md +11 -0
  37. data/docs/api/yard/HTM/JobAdapter.md +39 -0
  38. data/docs/api/yard/HTM/LongTermMemory.md +342 -0
  39. data/docs/api/yard/HTM/NotFoundError.md +17 -0
  40. data/docs/api/yard/HTM/Observability.md +107 -0
  41. data/docs/api/yard/HTM/QueryTimeoutError.md +19 -0
  42. data/docs/api/yard/HTM/Railtie.md +27 -0
  43. data/docs/api/yard/HTM/ResourceExhaustedError.md +13 -0
  44. data/docs/api/yard/HTM/TagError.md +18 -0
  45. data/docs/api/yard/HTM/TagService.md +67 -0
  46. data/docs/api/yard/HTM/Timeframe/Result.md +24 -0
  47. data/docs/api/yard/HTM/Timeframe.md +40 -0
  48. data/docs/api/yard/HTM/TimeframeExtractor/Result.md +24 -0
  49. data/docs/api/yard/HTM/TimeframeExtractor.md +45 -0
  50. data/docs/api/yard/HTM/ValidationError.md +20 -0
  51. data/docs/api/yard/HTM/WorkingMemory.md +131 -0
  52. data/docs/api/yard/HTM.md +80 -0
  53. data/docs/api/yard/index.csv +179 -0
  54. data/docs/api/yard-reference.md +51 -0
  55. data/docs/architecture/adrs/001-postgresql-timescaledb.md +1 -1
  56. data/docs/architecture/adrs/003-ollama-embeddings.md +1 -1
  57. data/docs/architecture/adrs/010-redis-working-memory-rejected.md +2 -27
  58. data/docs/architecture/adrs/index.md +2 -13
  59. data/docs/architecture/hive-mind.md +165 -166
  60. data/docs/architecture/index.md +2 -2
  61. data/docs/architecture/overview.md +5 -171
  62. data/docs/architecture/two-tier-memory.md +1 -35
  63. data/docs/assets/images/adr-010-current-architecture.svg +37 -0
  64. data/docs/assets/images/adr-010-proposed-architecture.svg +48 -0
  65. data/docs/assets/images/adr-dependency-tree.svg +93 -0
  66. data/docs/assets/images/class-hierarchy.svg +55 -0
  67. data/docs/assets/images/exception-hierarchy.svg +45 -0
  68. data/docs/assets/images/htm-architecture-overview.svg +83 -0
  69. data/docs/assets/images/htm-complete-memory-flow.svg +160 -0
  70. data/docs/assets/images/htm-context-assembly-flow.svg +148 -0
  71. data/docs/assets/images/htm-eviction-process.svg +141 -0
  72. data/docs/assets/images/htm-memory-addition-flow.svg +138 -0
  73. data/docs/assets/images/htm-memory-recall-flow.svg +152 -0
  74. data/docs/assets/images/htm-node-states.svg +123 -0
  75. data/docs/assets/images/project-structure.svg +78 -0
  76. data/docs/assets/images/test-directory-structure.svg +38 -0
  77. data/{dbdoc → docs/database}/README.md +127 -125
  78. data/docs/database/public.file_sources.md +42 -0
  79. data/docs/database/public.file_sources.svg +211 -0
  80. data/{dbdoc → docs/database}/public.node_tags.md +7 -8
  81. data/docs/database/public.node_tags.svg +239 -0
  82. data/{dbdoc → docs/database}/public.nodes.md +22 -17
  83. data/docs/database/public.nodes.svg +271 -0
  84. data/docs/database/public.robot_nodes.md +46 -0
  85. data/docs/database/public.robot_nodes.svg +243 -0
  86. data/{dbdoc → docs/database}/public.robots.md +2 -3
  87. data/docs/database/public.robots.svg +161 -0
  88. data/docs/database/public.tags.svg +139 -0
  89. data/{dbdoc → docs/database}/schema.json +941 -630
  90. data/docs/database/schema.svg +282 -0
  91. data/docs/development/index.md +1 -29
  92. data/docs/development/schema.md +134 -309
  93. data/docs/development/testing.md +1 -9
  94. data/docs/getting-started/index.md +47 -0
  95. data/docs/{installation.md → getting-started/installation.md} +2 -2
  96. data/docs/{quick-start.md → getting-started/quick-start.md} +5 -5
  97. data/docs/guides/adding-memories.md +295 -643
  98. data/docs/guides/recalling-memories.md +36 -1
  99. data/docs/guides/search-strategies.md +85 -51
  100. data/docs/images/htm-er-diagram.svg +156 -0
  101. data/docs/index.md +16 -31
  102. data/docs/multi_framework_support.md +4 -4
  103. data/examples/README.md +280 -0
  104. data/examples/basic_usage.rb +18 -16
  105. data/examples/cli_app/htm_cli.rb +146 -8
  106. data/examples/cli_app/temp.log +93 -0
  107. data/examples/custom_llm_configuration.rb +1 -2
  108. data/examples/example_app/app.rb +11 -14
  109. data/examples/file_loader_usage.rb +177 -0
  110. data/examples/robot_groups/lib/robot_group.rb +419 -0
  111. data/examples/robot_groups/lib/working_memory_channel.rb +140 -0
  112. data/examples/robot_groups/multi_process.rb +286 -0
  113. data/examples/robot_groups/robot_worker.rb +136 -0
  114. data/examples/robot_groups/same_process.rb +229 -0
  115. data/examples/sinatra_app/Gemfile +1 -0
  116. data/examples/sinatra_app/Gemfile.lock +166 -0
  117. data/examples/sinatra_app/app.rb +219 -24
  118. data/examples/timeframe_demo.rb +276 -0
  119. data/lib/htm/active_record_config.rb +10 -3
  120. data/lib/htm/circuit_breaker.rb +202 -0
  121. data/lib/htm/configuration.rb +313 -80
  122. data/lib/htm/database.rb +67 -36
  123. data/lib/htm/embedding_service.rb +39 -2
  124. data/lib/htm/errors.rb +131 -11
  125. data/lib/htm/{sinatra.rb → integrations/sinatra.rb} +87 -12
  126. data/lib/htm/job_adapter.rb +10 -3
  127. data/lib/htm/jobs/generate_embedding_job.rb +5 -4
  128. data/lib/htm/jobs/generate_tags_job.rb +4 -0
  129. data/lib/htm/loaders/markdown_loader.rb +263 -0
  130. data/lib/htm/loaders/paragraph_chunker.rb +112 -0
  131. data/lib/htm/long_term_memory.rb +601 -321
  132. data/lib/htm/models/file_source.rb +99 -0
  133. data/lib/htm/models/node.rb +116 -12
  134. data/lib/htm/models/robot.rb +53 -4
  135. data/lib/htm/models/robot_node.rb +51 -0
  136. data/lib/htm/models/tag.rb +302 -0
  137. data/lib/htm/observability.rb +395 -0
  138. data/lib/htm/tag_service.rb +60 -3
  139. data/lib/htm/tasks.rb +29 -0
  140. data/lib/htm/timeframe.rb +194 -0
  141. data/lib/htm/timeframe_extractor.rb +307 -0
  142. data/lib/htm/version.rb +1 -1
  143. data/lib/htm/working_memory.rb +165 -70
  144. data/lib/htm.rb +352 -133
  145. data/lib/tasks/doc.rake +300 -0
  146. data/lib/tasks/files.rake +299 -0
  147. data/lib/tasks/htm.rake +188 -2
  148. data/lib/tasks/jobs.rake +10 -12
  149. data/lib/tasks/tags.rake +194 -0
  150. data/mkdocs.yml +91 -9
  151. data/notes/ARCHITECTURE_REVIEW.md +1167 -0
  152. data/notes/IMPLEMENTATION_SUMMARY.md +606 -0
  153. data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +451 -0
  154. data/notes/next_steps.md +100 -0
  155. data/notes/plan.md +627 -0
  156. data/notes/tag_ontology_enhancement_ideas.md +222 -0
  157. data/notes/timescaledb_removal_summary.md +200 -0
  158. metadata +177 -37
  159. data/db/migrate/20250101000002_create_robots.rb +0 -14
  160. data/db/migrate/20250101000003_create_nodes.rb +0 -42
  161. data/db/migrate/20250101000005_create_tags.rb +0 -38
  162. data/db/migrate/20250101000007_add_node_vector_indexes.rb +0 -30
  163. data/dbdoc/public.node_tags.svg +0 -112
  164. data/dbdoc/public.nodes.svg +0 -118
  165. data/dbdoc/public.robots.svg +0 -90
  166. data/dbdoc/public.tags.svg +0 -60
  167. data/dbdoc/schema.svg +0 -154
  168. data/{dbdoc → docs/database}/public.node_stats.md +0 -0
  169. data/{dbdoc → docs/database}/public.node_stats.svg +0 -0
  170. data/{dbdoc → docs/database}/public.nodes_tags.md +0 -0
  171. data/{dbdoc → docs/database}/public.nodes_tags.svg +0 -0
  172. data/{dbdoc → docs/database}/public.ontology_structure.md +0 -0
  173. data/{dbdoc → docs/database}/public.ontology_structure.svg +0 -0
  174. data/{dbdoc → docs/database}/public.operations_log.md +0 -0
  175. data/{dbdoc → docs/database}/public.operations_log.svg +0 -0
  176. data/{dbdoc → docs/database}/public.relationships.md +0 -0
  177. data/{dbdoc → docs/database}/public.relationships.svg +0 -0
  178. data/{dbdoc → docs/database}/public.robot_activity.md +0 -0
  179. data/{dbdoc → docs/database}/public.robot_activity.svg +0 -0
  180. data/{dbdoc → docs/database}/public.schema_migrations.md +0 -0
  181. data/{dbdoc → docs/database}/public.schema_migrations.svg +0 -0
  182. data/{dbdoc → docs/database}/public.tags.md +3 -3
  183. data/{dbdoc → docs/database}/public.topic_relationships.md +0 -0
  184. data/{dbdoc → docs/database}/public.topic_relationships.svg +0 -0
@@ -0,0 +1,263 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'yaml'
4
+ require 'digest'
5
+
6
class HTM
  module Loaders
    # Markdown file loader
    #
    # Loads markdown files into HTM long-term memory with support for:
    # - YAML frontmatter parsing (prepended as YAML text to the first chunk
    #   and saved on the file source record)
    # - Paragraph-based chunking
    # - Re-sync on file changes (via mtime comparison)
    # - Duplicate detection via content_hash
    #
    # @example Load a single file
    #   loader = MarkdownLoader.new(htm)
    #   result = loader.load_file('/path/to/doc.md')
    #   # => { file_path: '/path/to/doc.md', chunks_created: 5, ... }
    #
    # @example Load a directory
    #   results = loader.load_directory('/path/to/docs', pattern: '**/*.md')
    #
    class MarkdownLoader
      FRONTMATTER_REGEX = /\A---\s*\n(.*?)\n---\s*\n/m
      MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB maximum file size

      # @param htm_instance [HTM] The HTM instance to use for storing nodes
      def initialize(htm_instance)
        @htm = htm_instance
        @chunker = ParagraphChunker.new
      end

      # Load a single markdown file into long-term memory
      #
      # @param path [String] Path to markdown file
      # @param force [Boolean] Force re-sync even if mtime unchanged
      # @return [Hash] Load result with keys:
      #   - :file_path [String] Absolute path to file
      #   - :file_source_id [Integer] ID of the source record (absent when skipped)
      #   - :chunks_created [Integer] Number of new chunks created
      #   - :chunks_updated [Integer] Number of existing chunks updated
      #   - :chunks_deleted [Integer] Number of chunks soft-deleted
      #   - :skipped [Boolean] True if file was unchanged and skipped
      # @raise [ArgumentError] If the path is missing, is not a regular file,
      #   or exceeds MAX_FILE_SIZE
      #
      def load_file(path, force: false)
        expanded_path = File.expand_path(path)

        raise ArgumentError, "File not found: #{path}" unless File.exist?(expanded_path)
        raise ArgumentError, "Not a file: #{path}" unless File.file?(expanded_path)

        # Validate file size before reading
        file_size = File.size(expanded_path)
        if file_size > MAX_FILE_SIZE
          raise ArgumentError, "File too large: #{path} (#{file_size} bytes). Maximum size is #{MAX_FILE_SIZE} bytes (10 MB)."
        end

        content = read_utf8_content(expanded_path, path)
        stat = File.stat(expanded_path)
        file_hash = Digest::SHA256.hexdigest(content)

        # Find or create source record
        source = HTM::Models::FileSource.find_or_initialize_by(file_path: expanded_path)

        # Check if sync needed (needs_sync? is defined on FileSource, outside this file)
        unless force || source.new_record? || source.needs_sync?(stat.mtime)
          return {
            file_path: expanded_path,
            chunks_created: 0,
            chunks_updated: 0,
            chunks_deleted: 0,
            skipped: true
          }
        end

        # Parse frontmatter and body, then chunk the body
        frontmatter, body = extract_frontmatter(content)
        chunks = @chunker.chunk(body)

        # Prepend frontmatter to first chunk if present.
        # YAML.dump already emits the leading "---\n" document marker.
        if frontmatter.any? && chunks.any?
          chunks[0] = "#{YAML.dump(frontmatter)}---\n\n#{chunks[0]}"
        end

        # Save source first (need ID for node association)
        source.save! if source.new_record?

        # Sync chunks to database
        result = sync_chunks(source, chunks)

        # Update source record
        source.update!(
          file_hash: file_hash,
          mtime: stat.mtime,
          file_size: stat.size,
          frontmatter: frontmatter,
          last_synced_at: Time.current
        )

        result.merge(
          file_path: expanded_path,
          file_source_id: source.id,
          skipped: false
        )
      end

      # Load all matching files from a directory
      #
      # A failure while loading one file does not abort the run; it is
      # reported as an entry with an :error key instead.
      #
      # @param path [String] Directory path
      # @param pattern [String] Glob pattern (default: '**/*.md')
      # @param force [Boolean] Force re-sync even if unchanged
      # @return [Array<Hash>] Results for each file
      # @raise [ArgumentError] If the path is missing or is not a directory
      #
      def load_directory(path, pattern: '**/*.md', force: false)
        expanded_path = File.expand_path(path)

        raise ArgumentError, "Directory not found: #{path}" unless File.exist?(expanded_path)
        raise ArgumentError, "Not a directory: #{path}" unless File.directory?(expanded_path)

        Dir.glob(File.join(expanded_path, pattern)).map do |file_path|
          load_file(file_path, force: force)
        rescue StandardError => e
          { file_path: file_path, error: e.message, skipped: false }
        end
      end

      private

      # Read a file as UTF-8, replacing invalid byte sequences
      #
      # File.read with an encoding option only tags the string; it does not
      # raise on invalid bytes. Validity is therefore checked explicitly and
      # each invalid sequence is replaced with '?'.
      #
      # @param expanded_path [String] Absolute path used for reading
      # @param display_path [String] Path as given by the caller (for the log message)
      # @return [String] Valid UTF-8 content
      #
      def read_utf8_content(expanded_path, display_path)
        content = File.read(expanded_path, encoding: 'UTF-8')
        return content if content.valid_encoding?

        HTM.logger.warn "File #{display_path} has non-UTF-8 encoding, some characters may be replaced"
        content.scrub('?')
      end

      # Extract YAML frontmatter from content
      #
      # The frontmatter block is always removed from the body when the
      # delimiters match, even if its YAML cannot be used. Frontmatter that
      # fails to parse, uses disallowed classes or aliases, or is not a
      # mapping yields an empty hash.
      #
      # @param content [String] File content
      # @return [Array(Hash, String)] Frontmatter hash (string keys) and body string
      #
      def extract_frontmatter(content)
        match = content.match(FRONTMATTER_REGEX)
        return [{}, content] unless match

        body = content[match.end(0)..]

        frontmatter =
          begin
            YAML.safe_load(match[1], permitted_classes: [Date, Time, Symbol])
          rescue Psych::Exception
            # Covers syntax errors as well as BadAlias / DisallowedClass
            nil
          end

        # Only a mapping is usable as frontmatter; convert symbol keys to strings
        frontmatter = frontmatter.is_a?(Hash) ? frontmatter.transform_keys(&:to_s) : {}

        [frontmatter, body]
      end

      # Sync chunks to database, handling updates and deletions
      #
      # Chunks are matched to existing nodes by content hash. Matching nodes
      # get their position refreshed and are restored if soft-deleted;
      # unmatched chunks become new nodes; unmatched live nodes are
      # soft-deleted.
      #
      # @param source [FileSource] The source record
      # @param chunks [Array<String>] New chunk contents
      # @return [Hash] Sync statistics (:chunks_created, :chunks_updated, :chunks_deleted)
      #
      def sync_chunks(source, chunks)
        created = 0
        updated = 0
        deleted = 0

        # Get existing nodes for this source (include soft-deleted for potential restore)
        existing_nodes =
          if source.persisted?
            HTM::Models::Node.unscoped.where(source_id: source.id).to_a
          else
            []
          end
        existing_by_hash = existing_nodes.index_by(&:content_hash)

        # Track which existing nodes we've matched
        matched_hashes = Set.new

        chunks.each_with_index do |chunk_content, position|
          chunk_hash = HTM::Models::Node.generate_content_hash(chunk_content)
          node = existing_by_hash[chunk_hash]

          if node
            # Chunk exists - update position if needed, restore if soft-deleted
            matched_hashes << chunk_hash

            changes = {}
            changes[:chunk_position] = position if node.chunk_position != position
            changes[:deleted_at] = nil if node.deleted_at.present?

            if changes.any?
              node.update!(changes)
              updated += 1
            end
          elsif create_chunk_node(source, chunk_content, position)
            # New chunk - a node was created (or an existing duplicate was found)
            created += 1
          end
        end

        # Soft-delete chunks that no longer exist in file
        existing_by_hash.each do |hash, node|
          next if matched_hashes.include?(hash)
          next if node.deleted_at.present? # Already deleted

          node.soft_delete!
          deleted += 1
        end

        { chunks_created: created, chunks_updated: updated, chunks_deleted: deleted }
      end

      # Create a node for a chunk
      #
      # @param source [FileSource] The source record
      # @param content [String] Chunk content
      # @param position [Integer] Position in file (0-indexed)
      # @return [Node, nil] The created node, the pre-existing node with the
      #   same content on a uniqueness conflict, or nil if that lookup finds nothing
      #
      def create_chunk_node(source, content, position)
        # Use remember to get proper embedding/tag processing
        node_id = @htm.remember(content)

        # Update with source reference
        node = HTM::Models::Node.find(node_id)
        node.update!(source_id: source.id, chunk_position: position)

        node
      rescue ActiveRecord::RecordNotUnique
        # Duplicate content exists (different source or no source).
        # Link it to this source only when it has no source yet.
        existing = HTM::Models::Node.find_by_content(content)
        if existing && existing.source_id.nil?
          existing.update!(source_id: source.id, chunk_position: position)
        end
        existing
      end
    end
  end
end
@@ -0,0 +1,112 @@
1
+ # frozen_string_literal: true
2
+
3
class HTM
  module Loaders
    # Paragraph-based text chunker
    #
    # Splits text into chunks based on paragraph boundaries (blank lines).
    # Preserves fenced code blocks as single chunks to avoid breaking syntax.
    # A chunk shorter than MIN_CHUNK_SIZE characters is merged into the chunk
    # that follows it.
    #
    # @example Basic usage
    #   chunker = ParagraphChunker.new
    #   chunks = chunker.chunk("First paragraph.\n\nSecond paragraph.")
    #   # => ["First paragraph.", "Second paragraph."]
    #
    # @example With code blocks
    #   text = "Introduction.\n\n```ruby\ndef foo\n bar\nend\n```\n\nConclusion"
    #   chunks = chunker.chunk(text)
    #   # => ["Introduction.", "```ruby\ndef foo\n bar\nend\n```", "Conclusion"]
    #
    class ParagraphChunker
      MIN_CHUNK_SIZE = 10 # Only merge very short fragments (single words)

      # Fenced code blocks delimited by ``` or ~~~
      CODE_FENCE_PATTERN = /```[\s\S]*?```|~~~[\s\S]*?~~~/m

      # Placeholder tokens produced by protect_code_blocks
      PLACEHOLDER_PATTERN = /<<<CODE_BLOCK_\d+>>>/

      # Split text into paragraph chunks
      #
      # @param text [String, nil] Text to chunk
      # @return [Array<String>] Array of paragraph chunks (empty for nil/blank input)
      #
      def chunk(text)
        return [] if text.nil? || text.strip.empty?

        # Normalize line endings
        normalized = text.gsub(/\r\n?/, "\n")

        # Protect code blocks from splitting
        protected_text, code_blocks = protect_code_blocks(normalized)

        # Split by blank lines (2+ newlines), then restore code blocks and clean up
        chunks = protected_text
                 .split(/\n\s*\n+/)
                 .map { |piece| restore_code_blocks(piece.strip, code_blocks) }
                 .reject(&:empty?)

        # Merge very short chunks with the chunk that follows
        merge_short_chunks(chunks)
      end

      private

      # Replace code blocks with placeholders to prevent splitting
      #
      # @param text [String] Text containing code blocks
      # @return [Array(String, Hash)] Protected text and placeholder => code block map
      #
      def protect_code_blocks(text)
        code_blocks = {}

        protected = text.gsub(CODE_FENCE_PATTERN) do |block|
          placeholder = "<<<CODE_BLOCK_#{code_blocks.size}>>>"
          code_blocks[placeholder] = block
          placeholder
        end

        [protected, code_blocks]
      end

      # Restore code blocks from placeholders
      #
      # Uses the block form of gsub in a single pass: the code is inserted
      # verbatim (a String replacement would interpret backslash sequences
      # such as \0, \1 or \\ inside the code), and restored code is never
      # re-scanned for further placeholders.
      #
      # @param text [String] Text with placeholders
      # @param code_blocks [Hash] Placeholder to code block mapping
      # @return [String] Text with code blocks restored
      #
      def restore_code_blocks(text, code_blocks)
        return text if code_blocks.empty?

        text.gsub(PLACEHOLDER_PATTERN) { |placeholder| code_blocks.fetch(placeholder, placeholder) }
      end

      # Merge chunks shorter than MIN_CHUNK_SIZE with the following chunk
      #
      # A short chunk absorbs its successors (joined by a blank line) until the
      # combined text reaches MIN_CHUNK_SIZE. A short chunk at the very end has
      # no successor and is kept as its own chunk.
      #
      # @param chunks [Array<String>] Original chunks
      # @return [Array<String>] Merged chunks
      #
      def merge_short_chunks(chunks)
        return chunks if chunks.size <= 1

        chunks.each_with_object([]) do |chunk, merged|
          if merged.any? && merged.last.length < MIN_CHUNK_SIZE
            merged[-1] = "#{merged.last}\n\n#{chunk}"
          else
            merged << chunk
          end
        end
      end
    end
  end
end