htm 0.0.2 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.aigcm_msg +1 -0
- data/.architecture/reviews/comprehensive-codebase-review.md +577 -0
- data/.claude/settings.local.json +95 -0
- data/.irbrc +283 -80
- data/.tbls.yml +2 -1
- data/CHANGELOG.md +327 -26
- data/CLAUDE.md +603 -0
- data/README.md +83 -12
- data/Rakefile +5 -0
- data/bin/htm_mcp.rb +527 -0
- data/db/migrate/{20250101000001_enable_extensions.rb → 00001_enable_extensions.rb} +0 -1
- data/db/migrate/00002_create_robots.rb +11 -0
- data/db/migrate/00003_create_file_sources.rb +20 -0
- data/db/migrate/00004_create_nodes.rb +65 -0
- data/db/migrate/00005_create_tags.rb +13 -0
- data/db/migrate/00006_create_node_tags.rb +18 -0
- data/db/migrate/00007_create_robot_nodes.rb +26 -0
- data/db/migrate/00009_add_working_memory_to_robot_nodes.rb +12 -0
- data/db/schema.sql +172 -1
- data/docs/api/database.md +1 -2
- data/docs/api/htm.md +197 -2
- data/docs/api/yard/HTM/ActiveRecordConfig.md +23 -0
- data/docs/api/yard/HTM/AuthorizationError.md +11 -0
- data/docs/api/yard/HTM/CircuitBreaker.md +92 -0
- data/docs/api/yard/HTM/CircuitBreakerOpenError.md +34 -0
- data/docs/api/yard/HTM/Configuration.md +175 -0
- data/docs/api/yard/HTM/Database.md +99 -0
- data/docs/api/yard/HTM/DatabaseError.md +14 -0
- data/docs/api/yard/HTM/EmbeddingError.md +18 -0
- data/docs/api/yard/HTM/EmbeddingService.md +58 -0
- data/docs/api/yard/HTM/Error.md +11 -0
- data/docs/api/yard/HTM/JobAdapter.md +39 -0
- data/docs/api/yard/HTM/LongTermMemory.md +342 -0
- data/docs/api/yard/HTM/NotFoundError.md +17 -0
- data/docs/api/yard/HTM/Observability.md +107 -0
- data/docs/api/yard/HTM/QueryTimeoutError.md +19 -0
- data/docs/api/yard/HTM/Railtie.md +27 -0
- data/docs/api/yard/HTM/ResourceExhaustedError.md +13 -0
- data/docs/api/yard/HTM/TagError.md +18 -0
- data/docs/api/yard/HTM/TagService.md +67 -0
- data/docs/api/yard/HTM/Timeframe/Result.md +24 -0
- data/docs/api/yard/HTM/Timeframe.md +40 -0
- data/docs/api/yard/HTM/TimeframeExtractor/Result.md +24 -0
- data/docs/api/yard/HTM/TimeframeExtractor.md +45 -0
- data/docs/api/yard/HTM/ValidationError.md +20 -0
- data/docs/api/yard/HTM/WorkingMemory.md +131 -0
- data/docs/api/yard/HTM.md +80 -0
- data/docs/api/yard/index.csv +179 -0
- data/docs/api/yard-reference.md +51 -0
- data/docs/database/README.md +128 -128
- data/docs/database/public.file_sources.md +42 -0
- data/docs/database/public.file_sources.svg +211 -0
- data/docs/database/public.node_tags.md +4 -4
- data/docs/database/public.node_tags.svg +212 -79
- data/docs/database/public.nodes.md +22 -12
- data/docs/database/public.nodes.svg +246 -127
- data/docs/database/public.robot_nodes.md +11 -9
- data/docs/database/public.robot_nodes.svg +220 -98
- data/docs/database/public.robots.md +2 -2
- data/docs/database/public.robots.svg +136 -81
- data/docs/database/public.tags.md +3 -3
- data/docs/database/public.tags.svg +118 -39
- data/docs/database/schema.json +850 -771
- data/docs/database/schema.svg +256 -197
- data/docs/development/schema.md +67 -2
- data/docs/guides/adding-memories.md +93 -7
- data/docs/guides/recalling-memories.md +36 -1
- data/examples/README.md +405 -0
- data/examples/cli_app/htm_cli.rb +65 -5
- data/examples/cli_app/temp.log +93 -0
- data/examples/file_loader_usage.rb +177 -0
- data/examples/mcp_client.rb +529 -0
- data/examples/robot_groups/lib/robot_group.rb +419 -0
- data/examples/robot_groups/lib/working_memory_channel.rb +140 -0
- data/examples/robot_groups/multi_process.rb +286 -0
- data/examples/robot_groups/robot_worker.rb +136 -0
- data/examples/robot_groups/same_process.rb +229 -0
- data/examples/timeframe_demo.rb +276 -0
- data/lib/htm/active_record_config.rb +1 -1
- data/lib/htm/circuit_breaker.rb +202 -0
- data/lib/htm/configuration.rb +59 -13
- data/lib/htm/database.rb +67 -36
- data/lib/htm/embedding_service.rb +39 -2
- data/lib/htm/errors.rb +131 -11
- data/lib/htm/jobs/generate_embedding_job.rb +5 -4
- data/lib/htm/jobs/generate_tags_job.rb +4 -0
- data/lib/htm/loaders/markdown_loader.rb +263 -0
- data/lib/htm/loaders/paragraph_chunker.rb +112 -0
- data/lib/htm/long_term_memory.rb +460 -343
- data/lib/htm/models/file_source.rb +99 -0
- data/lib/htm/models/node.rb +80 -5
- data/lib/htm/models/robot.rb +24 -1
- data/lib/htm/models/robot_node.rb +1 -0
- data/lib/htm/models/tag.rb +254 -4
- data/lib/htm/observability.rb +395 -0
- data/lib/htm/tag_service.rb +60 -3
- data/lib/htm/tasks.rb +26 -1
- data/lib/htm/timeframe.rb +194 -0
- data/lib/htm/timeframe_extractor.rb +307 -0
- data/lib/htm/version.rb +1 -1
- data/lib/htm/working_memory.rb +165 -70
- data/lib/htm.rb +328 -130
- data/lib/tasks/doc.rake +300 -0
- data/lib/tasks/files.rake +299 -0
- data/lib/tasks/htm.rake +158 -3
- data/lib/tasks/jobs.rake +3 -9
- data/lib/tasks/tags.rake +166 -6
- data/mkdocs.yml +36 -1
- data/notes/ARCHITECTURE_REVIEW.md +1167 -0
- data/notes/IMPLEMENTATION_SUMMARY.md +606 -0
- data/notes/MULTI_FRAMEWORK_IMPLEMENTATION.md +451 -0
- data/notes/next_steps.md +100 -0
- data/notes/plan.md +627 -0
- data/notes/tag_ontology_enhancement_ideas.md +222 -0
- data/notes/timescaledb_removal_summary.md +200 -0
- metadata +158 -17
- data/db/migrate/20250101000002_create_robots.rb +0 -14
- data/db/migrate/20250101000003_create_nodes.rb +0 -42
- data/db/migrate/20250101000005_create_tags.rb +0 -38
- data/db/migrate/20250101000007_add_node_vector_indexes.rb +0 -30
- data/db/migrate/20250125000001_add_content_hash_to_nodes.rb +0 -14
- data/db/migrate/20250125000002_create_robot_nodes.rb +0 -35
- data/db/migrate/20250125000003_remove_source_and_robot_id_from_nodes.rb +0 -28
- data/db/migrate/20250126000001_create_working_memories.rb +0 -19
- data/db/migrate/20250126000002_remove_unused_columns.rb +0 -12
- data/docs/database/public.working_memories.md +0 -40
- data/docs/database/public.working_memories.svg +0 -112
- data/lib/htm/models/working_memory_entry.rb +0 -88
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'date'
require 'digest'
require 'set'
require 'yaml'
|
|
5
|
+
|
|
6
|
+
class HTM
  module Loaders
    # Markdown file loader.
    #
    # Loads markdown files into HTM long-term memory with support for:
    # - YAML frontmatter parsing (prepended to the first chunk)
    # - Paragraph-based chunking (via ParagraphChunker)
    # - Re-sync on file changes (detected by mtime comparison)
    # - Duplicate detection via content_hash
    #
    # @example Load a single file
    #   loader = MarkdownLoader.new(htm)
    #   result = loader.load_file('/path/to/doc.md')
    #   # => { file_path: '/path/to/doc.md', chunks_created: 5, ... }
    #
    # @example Load a directory
    #   results = loader.load_directory('/path/to/docs', pattern: '**/*.md')
    #
    class MarkdownLoader
      # Matches a YAML frontmatter block delimited by `---` lines at the very
      # start of the file; capture group 1 is the YAML body.
      FRONTMATTER_REGEX = /\A---\s*\n(.*?)\n---\s*\n/m

      # Refuse to read files larger than this (10 MB).
      MAX_FILE_SIZE = 10 * 1024 * 1024

      # @param htm_instance [HTM] The HTM instance to use for storing nodes
      def initialize(htm_instance)
        @htm = htm_instance
        @chunker = ParagraphChunker.new
      end

      # Load a single markdown file into long-term memory.
      #
      # @param path [String] Path to markdown file
      # @param force [Boolean] Force re-sync even if mtime unchanged
      # @return [Hash] Load result with keys:
      #   - :file_path [String] Absolute path to file
      #   - :chunks_created [Integer] Number of new chunks created
      #   - :chunks_updated [Integer] Number of existing chunks updated
      #   - :chunks_deleted [Integer] Number of chunks soft-deleted
      #   - :skipped [Boolean] True if file was unchanged and skipped
      # @raise [ArgumentError] if the path is missing, not a regular file,
      #   or exceeds MAX_FILE_SIZE
      def load_file(path, force: false)
        expanded_path = File.expand_path(path)

        raise ArgumentError, "File not found: #{path}" unless File.exist?(expanded_path)
        raise ArgumentError, "Not a file: #{path}" unless File.file?(expanded_path)

        # Validate file size before reading the whole file into memory.
        file_size = File.size(expanded_path)
        if file_size > MAX_FILE_SIZE
          raise ArgumentError, "File too large: #{path} (#{file_size} bytes). Maximum size is #{MAX_FILE_SIZE} bytes (10 MB)."
        end

        content = read_content(expanded_path, path)
        stat = File.stat(expanded_path)
        file_hash = Digest::SHA256.hexdigest(content)

        # Find or create the source record for this file.
        source = HTM::Models::FileSource.find_or_initialize_by(file_path: expanded_path)

        # Skip unchanged files (by mtime) unless a re-sync is forced.
        unless force || source.new_record? || source.needs_sync?(stat.mtime)
          return {
            file_path: expanded_path,
            chunks_created: 0,
            chunks_updated: 0,
            chunks_deleted: 0,
            skipped: true
          }
        end

        frontmatter, body = extract_frontmatter(content)
        chunks = @chunker.chunk(body)

        # Prepend frontmatter to the first chunk so its metadata is stored
        # alongside the opening content.
        if frontmatter.any? && chunks.any?
          chunks[0] = "#{YAML.dump(frontmatter)}---\n\n#{chunks[0]}"
        end

        # Save source first (its ID is needed for node association).
        source.save! if source.new_record?

        result = sync_chunks(source, chunks)

        source.update!(
          file_hash: file_hash,
          mtime: stat.mtime,
          file_size: stat.size,
          frontmatter: frontmatter,
          last_synced_at: Time.current
        )

        result.merge(
          file_path: expanded_path,
          file_source_id: source.id,
          skipped: false
        )
      end

      # Load all matching files from a directory.
      #
      # @param path [String] Directory path
      # @param pattern [String] Glob pattern (default: '**/*.md')
      # @param force [Boolean] Force re-sync even if unchanged
      # @return [Array<Hash>] Results for each file; a file that raised is
      #   reported as { file_path:, error:, skipped: false }
      def load_directory(path, pattern: '**/*.md', force: false)
        expanded_path = File.expand_path(path)

        raise ArgumentError, "Directory not found: #{path}" unless File.exist?(expanded_path)
        raise ArgumentError, "Not a directory: #{path}" unless File.directory?(expanded_path)

        # Sort for deterministic processing order (Dir.glob order is
        # filesystem-dependent).
        files = Dir.glob(File.join(expanded_path, pattern)).sort

        files.map do |file_path|
          load_file(file_path, force: force)
        rescue StandardError => e
          { file_path: file_path, error: e.message, skipped: false }
        end
      end

      private

      # Read a file as UTF-8, scrubbing invalid byte sequences.
      #
      # File.read does not validate UTF-8, so invalid bytes would otherwise
      # surface later as ArgumentError during regex matching; `scrub` replaces
      # only the invalid sequences and keeps valid multibyte characters.
      #
      # @param expanded_path [String] Absolute path to read
      # @param path [String] Original path (used in the warning message)
      # @return [String] Valid UTF-8 content (invalid bytes replaced with '?')
      def read_content(expanded_path, path)
        content = File.read(expanded_path, encoding: 'UTF-8')
        unless content.valid_encoding?
          content = content.scrub('?')
          HTM.logger.warn "File #{path} has non-UTF-8 encoding, some characters may be replaced"
        end
        content
      end

      # Extract YAML frontmatter from content.
      #
      # @param content [String] File content
      # @return [Array(Hash, String)] Frontmatter hash (empty when absent or
      #   unparseable) and the remaining body string
      def extract_frontmatter(content)
        match = content.match(FRONTMATTER_REGEX)
        return [{}, content] unless match

        body = content[match.end(0)..]

        begin
          frontmatter = YAML.safe_load(match[1], permitted_classes: [Date, Time, Symbol]) || {}
          # Normalize symbol keys to strings (non-Hash frontmatter is kept as-is).
          frontmatter = frontmatter.transform_keys(&:to_s) if frontmatter.is_a?(Hash)
        rescue Psych::Exception
          # Covers syntax errors as well as disallowed aliases/classes, which
          # a SyntaxError-only rescue would let crash the whole file load.
          frontmatter = {}
        end

        [frontmatter, body]
      end

      # Sync chunks to the database, creating, updating, restoring and
      # soft-deleting nodes as needed.
      #
      # @param source [FileSource] The source record
      # @param chunks [Array<String>] New chunk contents
      # @return [Hash] Sync statistics (:chunks_created, :chunks_updated,
      #   :chunks_deleted)
      def sync_chunks(source, chunks)
        created = 0
        updated = 0
        deleted = 0

        # Existing nodes for this source, including soft-deleted ones so a
        # re-appearing chunk can be restored instead of duplicated.
        existing_nodes =
          if source.persisted?
            HTM::Models::Node.unscoped.where(source_id: source.id).to_a
          else
            []
          end
        existing_by_hash = existing_nodes.index_by(&:content_hash)

        # Hashes of chunks still present in the file.
        matched_hashes = Set.new

        chunks.each_with_index do |chunk_content, position|
          chunk_hash = HTM::Models::Node.generate_content_hash(chunk_content)

          if (node = existing_by_hash[chunk_hash])
            # Chunk already stored: fix its position and restore it if it had
            # been soft-deleted.
            matched_hashes << chunk_hash

            changes = {}
            changes[:chunk_position] = position if node.chunk_position != position
            changes[:deleted_at] = nil if node.deleted_at.present?

            if changes.any?
              node.update!(changes)
              updated += 1
            end
          else
            # New chunk - create node
            node = create_chunk_node(source, chunk_content, position)
            created += 1 if node
          end
        end

        # Soft-delete chunks that no longer exist in the file.
        existing_by_hash.each do |hash, node|
          next if matched_hashes.include?(hash)
          next if node.deleted_at.present? # already deleted

          node.soft_delete!
          deleted += 1
        end

        { chunks_created: created, chunks_updated: updated, chunks_deleted: deleted }
      end

      # Create a node for a chunk via HTM#remember (so embedding and tag
      # generation run), then associate it with the source file.
      #
      # @param source [FileSource] The source record
      # @param content [String] Chunk content
      # @param position [Integer] Position in file (0-indexed)
      # @return [Node, nil] The created (or re-linked) node, or nil when the
      #   duplicate could not be located
      def create_chunk_node(source, content, position)
        node_id = @htm.remember(content)

        node = HTM::Models::Node.find(node_id)
        node.update!(source_id: source.id, chunk_position: position)
        node
      rescue ActiveRecord::RecordNotUnique
        # Identical content already exists (under another source or none);
        # link it to this source only if it is currently unowned.
        existing = HTM::Models::Node.find_by_content(content)
        existing.update!(source_id: source.id, chunk_position: position) if existing && existing.source_id.nil?
        existing
      end
    end
  end
end
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
class HTM
  module Loaders
    # Paragraph-based text chunker.
    #
    # Splits text into chunks at paragraph boundaries (blank lines), while
    # keeping fenced code blocks (``` or ~~~) intact so their syntax is never
    # cut apart. Paragraphs shorter than MIN_CHUNK_SIZE characters are merged
    # into the paragraph that follows them.
    #
    # @example Basic usage
    #   chunker = ParagraphChunker.new
    #   chunker.chunk("First paragraph.\n\nSecond paragraph.")
    #   # => ["First paragraph.", "Second paragraph."]
    #
    class ParagraphChunker
      # Chunks shorter than this many characters are merged forward
      # (only very short fragments, e.g. single words).
      MIN_CHUNK_SIZE = 10

      # Split text into paragraph chunks.
      #
      # @param text [String, nil] Text to chunk
      # @return [Array<String>] Paragraph chunks (empty for nil/blank input)
      def chunk(text)
        return [] if text.nil? || text.strip.empty?

        # Normalize CRLF / lone-CR line endings to LF, then mask fenced code
        # blocks so the blank-line split cannot cut through them.
        masked, fence_map = protect_code_blocks(text.gsub(/\r\n?/, "\n"))

        paragraphs = masked
                     .split(/\n\s*\n+/)
                     .map { |part| restore_code_blocks(part.strip, fence_map) }
                     .reject(&:empty?)

        merge_short_chunks(paragraphs)
      end

      private

      # Swap fenced code blocks for unique placeholders.
      #
      # @param text [String] Text possibly containing fenced code blocks
      # @return [Array(String, Hash)] Masked text and placeholder => fence map
      def protect_code_blocks(text)
        fence_map = {}
        masked = text.gsub(/```[\s\S]*?```|~~~[\s\S]*?~~~/m) do |fence|
          # fence_map.size acts as the running counter for unique keys.
          key = "<<<CODE_BLOCK_#{fence_map.size}>>>"
          fence_map[key] = fence
          key
        end
        [masked, fence_map]
      end

      # Substitute placeholders back with their original code blocks.
      #
      # @param text [String] Text containing placeholders
      # @param fence_map [Hash] Placeholder => code block mapping
      # @return [String] Text with code blocks restored
      def restore_code_blocks(text, fence_map)
        fence_map.reduce(text) { |acc, (key, fence)| acc.gsub(key, fence) }
      end

      # Merge chunks shorter than MIN_CHUNK_SIZE into the following chunk.
      # A short chunk at the very end has nothing to merge into and is kept.
      #
      # @param chunks [Array<String>] Original chunks (all non-empty)
      # @return [Array<String>] Merged chunks
      def merge_short_chunks(chunks)
        return chunks if chunks.size <= 1

        chunks.each_with_object([]) do |chunk, merged|
          if merged.any? && merged.last.length < MIN_CHUNK_SIZE
            merged[-1] = "#{merged.last}\n\n#{chunk}"
          else
            merged << chunk
          end
        end
      end
    end
  end
end
|