ace-compressor 0.24.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,470 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
+ module Ace
6
+ module Compressor
7
+ module Organisms
8
+ # Orchestrates compact mode with a tiered section ladder. In agent mode the
9
+ # same ladder is used, but eligible narrative sections may be rewritten by
10
+ # AgentCompressor while rule-bearing sections stay deterministic.
11
+ class CompactCompressor
12
# Text is "dense" when it contains a digit or a whole-word normative keyword.
# Used by aggressive_compact (extra FACT records) and select_key_rows.
# The leading \b keeps words such as "commonly" from matching via "only".
DENSE_FACT_RE = /\d|\b(?:must|must not|never|required|should|shall|cannot|can't|do not|only)\b/i
# RULE/CONSTRAINT/FACT text that demands examples be reproduced as written.
MIMICRY_RE = /\b(?:exact output|required format|must match|mimic|verbatim|follow exactly)\b/i
# A table containing any row that matches is preserved in full.
SENSITIVE_TABLE_RE = /\b(?:must|must not|never|required|only|shall|cannot|do not|policy|constraint)\b/i
# Markdown header/body separator row. Accepts both the tight form
# ("|---|---|") and the spaced form ("| --- | :---: |").
TABLE_SEPARATOR_RE = /\A\|?\s*:?-{3,}:?\s*(?:\|\s*:?-{3,}:?\s*)*\|?\s*\z/
# Delimiter between rows inside an unstructured TABLE payload (pipes escaped).
TABLE_ROW_SEPARATOR_ESCAPED_RE = /\s+\\\|\\\|ROW\\\|\\\|\s+/
# Record prefixes treated as the payload of a preceding EXAMPLE record.
EXAMPLE_PAYLOAD_PREFIXES = ["CMD|", "FILES|", "TREE|", "CODE|"].freeze
# Tables with at most this many data rows are always preserved.
PRESERVE_TABLE_MAX_ROWS = 2
# Tables with at most this many data rows keep the schema plus key rows.
SCHEMA_KEY_ROWS_MAX_ROWS = 6
# Number of data rows retained by the "schema_plus_key_rows" strategy.
SCHEMA_KEY_ROWS_LIMIT = 2
# Number of data rows retained by the "summarize_with_loss" strategy.
SUMMARY_ROWS_LIMIT = 1
# A section containing any record with one of these prefixes is kept exact.
EXACT_SECTION_PREFIXES = ["RULE|", "CONSTRAINT|", "CMD|", "U|"].freeze
23
+
24
# ignored_paths is provided by an explicit method that delegates to the
# resolver, so only refused_sources needs a generated reader (declaring both
# triggers Ruby's "method redefined" warning).
attr_reader :refused_sources
25
+
26
# Sets up the collaborators used by compact/agent compression.
#
# paths          - one path or a list of paths; also handed to ExactCompressor,
#                  which is used as the source resolver.
# verbose        - forwarded to ExactCompressor.
# mode_label     - "compact" by default; "agent" switches compress_sources to
#                  the agent ladder.
# agent_rewriter - optional object responding to rewrite_section.
def initialize(paths, verbose: false, mode_label: "compact", agent_rewriter: nil)
  atoms = Ace::Compressor::Atoms

  @paths = Array(paths)
  @mode_label = mode_label
  @agent_rewriter = agent_rewriter
  @resolver = ExactCompressor.new(paths, verbose: verbose, mode_label: mode_label)
  @parser = atoms::MarkdownParser.new
  @transformer = atoms::CanonicalBlockTransformer
  @classifier = atoms::CompactPolicyClassifier.new
  @refused_sources = []
  @example_registry = {}
end
37
+
38
# Resolves the configured sources and compresses them in one step.
# Returns whatever compress_sources returns for the resolved list.
def call
  resolved = resolve_sources
  compress_sources(resolved)
end
41
+
42
# Delegates source resolution to the resolver built in the constructor.
def resolve_sources
  resolver = @resolver
  resolver.resolve_sources
end
45
+
46
# Paths the resolver reports as ignored (delegated to the resolver).
def ignored_paths
  resolver = @resolver
  resolver.ignored_paths
end
49
+
50
# Compresses each source file into context-pack lines and joins them with
# newlines.
#
# sources      - path or list of paths to read.
# source_paths - optional mapping from a path (expanded or as given) to the
#                label that should be displayed for it.
#
# Resets refused_sources and the example registry on every call.
# Raises Ace::Compressor::Error when a file is blank or yields no blocks.
def compress_sources(sources, source_paths: nil)
  pack = Ace::Compressor::Models::ContextPack
  @refused_sources = []
  @example_registry = {}
  output = [pack.header(@mode_label)]

  Array(sources).each do |source|
    label = source_label(display_source(source, source_paths))
    output << pack.file_line(label)

    text = File.read(source)
    if text.strip.empty?
      raise Ace::Compressor::Error, "Input file is empty. #{@mode_label.capitalize} mode requires content: #{source}"
    end

    blocks = @parser.call(text)
    if blocks.empty?
      raise Ace::Compressor::Error, "Input file is empty after frontmatter removal. #{@mode_label.capitalize} mode requires content: #{source}"
    end

    policy = @classifier.call(source: label, blocks: blocks)
    action = policy.fetch("action")
    doc_class = policy.fetch("class")
    output << pack.policy_line(doc_class: doc_class, action: action)
    transformed = @transformer.new(label).call(blocks)

    records =
      if @mode_label == "agent"
        agent_records(transformed, policy: policy, source_label: label)
      else
        deterministic_records(transformed, action: action, source_label: label, policy_class: doc_class)
      end
    output.concat(records)
  end

  output.join("\n")
end
78
+
79
+ private
80
+
81
# Produces the records for one source in non-agent mode, based on the
# classifier's action:
#   "refuse_compact"                   -> refusal records
#   "compact_with_exact_rule_sections" -> per-section compaction guarded by a
#                                         rule-preservation fidelity check
#   anything else                      -> compact_records with that action
def deterministic_records(transformed, action:, source_label:, policy_class:)
  if action == "refuse_compact"
    return refusal_for_source(source_label, policy_class, "compact_preflight", "rule-heavy source requires exact mode")
  end

  unless action == "compact_with_exact_rule_sections"
    return compact_records(transformed, action, source_label: source_label)
  end

  mixed = compact_with_exact_rule_sections(transformed, source_label: source_label)
  fidelity = mixed_fidelity(transformed, mixed)
  passed = fidelity.fetch(:status) == "pass"
  details = fidelity.fetch(:details)
  return refusal_for_source(source_label, policy_class, "exact_rule_sections", details) unless passed

  header = Ace::Compressor::Models::ContextPack.fidelity_line(
    source: source_label, status: "pass", check: "exact_rule_sections", details: details
  )
  [header, *mixed]
end
97
+
98
# Produces the records for one source in agent mode.
#
# Each section gets a deterministic baseline. When the section's strategy is
# agent-eligible and a rewriter is configured, the rewriter's lines replace
# the baseline only if the rewrite reports :ok and is smaller in bytes.
# If no section used the rewriter, the source is recorded as refused
# ("no_win") and the deterministic records are wrapped in fail/refusal/
# guidance lines.
def agent_records(transformed, policy:, source_label:)
  pack = Ace::Compressor::Models::ContextPack
  agent_used = false
  output = []

  split_sections(transformed).each do |section|
    strategy = section_strategy(section, policy_class: policy.fetch("class"))
    chosen = deterministic_section_records(section, strategy: strategy, source_label: source_label)

    if agent_eligible?(strategy) && @agent_rewriter
      rewrite = @agent_rewriter.rewrite_section(section, source_label: source_label)
      if rewrite[:ok] && section_improves?(rewrite[:lines], chosen)
        chosen = rewrite[:lines]
        agent_used = true
      end
    end

    output.concat(chosen)
  end

  if agent_used
    passed = pack.fidelity_line(source: source_label, status: "pass", check: "agent_value", details: "agent_sections_applied")
    return [passed, *output]
  end

  @refused_sources << {"source" => source_label, "reason" => "no_win", "failed_check" => "agent_value"}
  [
    pack.fidelity_line(source: source_label, status: "fail", check: "agent_value", details: "no_agent_section_beat_deterministic"),
    *output,
    pack.refusal_line(source: source_label, reason: "no_win", failed_check: "agent_value"),
    pack.guidance_line(source: source_label, retry_with: "--mode compact")
  ]
end
131
+
132
# Groups records into sections. Every "SEC|" record opens a new section and
# the records that follow it belong to that section.
#
# NOTE(review): records that appear before the first "SEC|" record are not
# returned in any section -- confirm that the transformer never emits such
# records, otherwise they are silently dropped here.
def split_sections(records)
  Array(records).each_with_object([]) do |record, sections|
    if record.start_with?("SEC|")
      sections << [record]
    elsif !sections.empty?
      sections.last << record
    end
  end
end
146
+
147
# Chooses how a section is compacted. Checked in order:
#   :exact         - any non-header record starts with an exact-section prefix
#   :hybrid        - any non-header record is a TABLE record
#   :lossy         - policy class is narrative-heavy or mixed
#   :hybrid        - policy class is overview/architecture/reference/guide/vision
#   :deterministic - everything else
def section_strategy(section_records, policy_class:)
  body = Array(section_records).reject { |record| record.start_with?("SEC|") }

  if body.any? { |record| record.start_with?(*EXACT_SECTION_PREFIXES) }
    :exact
  elsif body.any? { |record| record.start_with?("TABLE|") }
    :hybrid
  elsif %w[narrative-heavy mixed].include?(policy_class)
    :lossy
  elsif %w[overview architecture reference guide vision].include?(policy_class)
    :hybrid
  else
    :deterministic
  end
end
156
+
157
# True when the strategy allows the agent rewriter to replace the section,
# i.e. for :lossy and :hybrid sections.
def agent_eligible?(strategy)
  case strategy
  when :lossy, :hybrid then true
  else false
  end
end
160
+
161
# Deterministic output for one section: exact sections are returned
# untouched, lossy sections are compacted aggressively, and every other
# strategy is compacted conservatively.
def deterministic_section_records(section_records, strategy:, source_label:)
  case strategy
  when :exact
    section_records
  when :lossy
    compact_records(section_records, "aggressive_compact", source_label: source_label)
  else
    compact_records(section_records, "conservative_compact", source_label: source_label)
  end
end
167
+
168
# True when the candidate lines, joined with newlines, take strictly fewer
# bytes than the deterministic lines joined the same way.
def section_improves?(candidate_lines, deterministic_lines)
  candidate_bytes = Array(candidate_lines).join("\n").bytesize
  baseline_bytes = Array(deterministic_lines).join("\n").bytesize
  candidate_bytes < baseline_bytes
end
171
+
172
# Dispatches to the aggressive compactor for the "aggressive_compact" action
# and to the conservative compactor for every other action.
def compact_records(records, action, source_label:)
  return aggressive_compact(records, source_label: source_label) if action == "aggressive_compact"

  conservative_compact(records, source_label: source_label)
end
179
+
180
# Lossy compaction of a record list.
#
# Per section (records before the first "SEC|" form an implicit root section):
#   * only the first SUMMARY is kept, shortened via compact_summary_line;
#   * the first FACT is kept, later FACT records only when their text matches
#     DENSE_FACT_RE;
#   * RULE, CONSTRAINT, PROBLEMS, LIST, EXAMPLE, U, CMD, FILES, TREE, CODE and
#     TABLE records are always kept;
#   * any other record is dropped.
# A section header is emitted lazily, right before the first record kept for
# that section, so sections with nothing kept leave no header behind.
#
# The per-section flags are reset on every "SEC|" record rather than keyed by
# the header text, so two sections that share an identical header line are
# still treated as separate sections.
#
# Returns the kept records after post_process_structured_records.
def aggressive_compact(records, source_label:)
  pending_section = nil
  summary_kept = false
  fact_kept = false
  kept = []

  Array(records).each do |line|
    case line
    when /\ASEC\|/
      pending_section = line
      summary_kept = false
      fact_kept = false
    when /\ASUMMARY\|/
      next if summary_kept

      summary_kept = true
      flush_pending_section!(kept, pending_section)
      pending_section = nil
      kept << compact_summary_line(line)
    when /\A(?:RULE|CONSTRAINT|PROBLEMS|LIST|EXAMPLE|U|CMD|FILES|TREE|CODE|TABLE)\|/
      flush_pending_section!(kept, pending_section)
      pending_section = nil
      kept << line
    when /\AFACT\|/
      text = line.sub(/\AFACT\|/, "")
      next if fact_kept && !DENSE_FACT_RE.match?(text)

      fact_kept = true
      flush_pending_section!(kept, pending_section)
      pending_section = nil
      kept << line
    end
  end

  post_process_structured_records(kept, source_label: source_label)
end
215
+
216
# Compacts a source section by section using the "mixed" policy class, so
# sections carrying exact-prefix records are passed through untouched while
# the remaining sections are compacted.
def compact_with_exact_rule_sections(records, source_label:)
  per_section = split_sections(records).map do |section|
    chosen = section_strategy(section, policy_class: "mixed")
    deterministic_section_records(section, strategy: chosen, source_label: source_label)
  end
  per_section.flatten(1)
end
223
+
224
# Verifies that every RULE/CONSTRAINT record of the original is still present
# verbatim in the compacted records.
#
# Returns {status: "pass", ...} when nothing is missing, otherwise
# {status: "fail", details: "missing_rule_records=N"} where N counts each
# missing original record (duplicates included).
def mixed_fidelity(original_records, compacted_records)
  missing = rule_records(original_records) - rule_records(compacted_records)
  return {status: "pass", details: "all_rule_records_preserved"} if missing.empty?

  {status: "fail", details: "missing_rule_records=#{missing.size}"}
end
235
+
236
# Returns only the records whose prefix is "RULE|" or "CONSTRAINT|".
def rule_records(records)
  rule_prefixes = ["RULE|", "CONSTRAINT|"]
  Array(records).select do |record|
    rule_prefixes.any? { |prefix| record.start_with?(prefix) }
  end
end
239
+
240
# Records the source as refused and returns the three lines describing the
# refusal: a failed fidelity line, the refusal line, and guidance to retry
# with "--mode exact".
def refusal_for_source(source_label, reason, failed_check, details)
  pack = Ace::Compressor::Models::ContextPack
  entry = {"source" => source_label, "reason" => reason, "failed_check" => failed_check}
  @refused_sources << entry

  fidelity = pack.fidelity_line(source: source_label, status: "fail", check: failed_check, details: details)
  refusal = pack.refusal_line(source: source_label, reason: reason, failed_check: failed_check)
  guidance = pack.guidance_line(source: source_label, retry_with: "--mode exact")
  [fidelity, refusal, guidance]
end
248
+
249
# Conservative compaction: drops records that repeat an earlier record
# exactly (first occurrence wins, order preserved) and then applies the
# structured post-processing.
def conservative_compact(records, source_label:)
  unique_records = Array(records).uniq
  post_process_structured_records(unique_records, source_label: source_label)
end
258
+
259
# Second pass over compacted records that handles examples and tables.
#
# * TABLE records are replaced by the output of compact_table_records and
#   numbered in order of appearance.
# * An EXAMPLE whose tool was already registered (in this or an earlier
#   source of the same run) is replaced by a reference line plus a loss line,
#   and the example-payload records that directly follow it are dropped.
# * If a RULE/CONSTRAINT/FACT earlier in the same section matches MIMICRY_RE,
#   examples in that section are always kept in full.
# * Every "SEC|" record resets the per-section state.
def post_process_structured_records(records, source_label:)
  pack = Ace::Compressor::Models::ContextPack
  tables_seen = 0
  must_mimic = false
  dropping_payload = false
  output = []

  Array(records).each do |record|
    case record
    when /\ASEC\|/
      must_mimic = false
      dropping_payload = false
      output << record
    when /\A(?:RULE|CONSTRAINT|FACT)\|/
      must_mimic ||= record.match?(MIMICRY_RE)
      dropping_payload = false
      output << record
    when /\AEXAMPLE\|/
      dropping_payload = false
      tool = example_tool(record)

      if tool.empty?
        # Examples without a tool cannot be deduplicated.
        output << record
      elsif must_mimic
        register_example(tool, source_label)
        output << record
      elsif (first_seen = @example_registry[tool])
        origin = first_seen.fetch("source")
        output << pack.example_ref_line(tool: tool, source: source_label, original_source: origin, reason: "duplicate_example")
        output << pack.loss_line(kind: "example", target: tool, strategy: "reference", original: 1, retained: 0, unit: "examples", source: source_label, details: "collapsed_to=#{origin}")
        dropping_payload = true
      else
        register_example(tool, source_label)
        output << record
      end
    when /\ATABLE\|/
      dropping_payload = false
      tables_seen += 1
      output.concat(compact_table_records(record, source_label: source_label, table_index: tables_seen))
    else
      payload = example_payload_line?(record)
      next if dropping_payload && payload

      # The first non-payload record ends the collapsed example's payload run.
      dropping_payload = false unless payload
      output << record
    end
  end

  output
end
307
+
308
# Rewrites one TABLE record according to the selected table strategy.
#
# Returns the original line unchanged when no rows can be parsed from it.
# Otherwise returns a new table line holding the header rows plus the
# retained data rows, followed by a loss line when data rows were dropped.
def compact_table_records(line, source_label:, table_index:)
  pack = Ace::Compressor::Models::ContextPack
  rows = parse_table_rows(line)
  return [line] if rows.empty?

  header_rows, data_rows = split_table_rows(rows)
  table_id = table_record_id(source_label, table_index)
  strategy, kept_rows = select_table_strategy(rows, data_rows)

  records = [pack.table_line(header_rows + kept_rows, table_id: table_id, strategy: strategy)]
  return records unless data_rows.size > kept_rows.size

  records << pack.loss_line(
    kind: "table", target: table_id, strategy: strategy,
    original: data_rows.size, retained: kept_rows.size,
    unit: "rows", source: source_label, details: "data_rows_only"
  )
end
324
+
325
# Picks the table strategy and the data rows to retain.
#
# Returns a [strategy_name, retained_data_rows] pair:
#   "preserve"             - small tables, or tables with a sensitive row
#   "schema_plus_key_rows" - medium tables, keeps SCHEMA_KEY_ROWS_LIMIT rows
#   "summarize_with_loss"  - larger tables, keeps SUMMARY_ROWS_LIMIT rows
def select_table_strategy(all_rows, data_rows)
  row_count = data_rows.size

  return ["preserve", data_rows] if row_count <= PRESERVE_TABLE_MAX_ROWS || sensitive_table?(all_rows)

  if row_count <= SCHEMA_KEY_ROWS_MAX_ROWS
    return ["schema_plus_key_rows", select_key_rows(data_rows, limit: SCHEMA_KEY_ROWS_LIMIT)]
  end

  ["summarize_with_loss", select_key_rows(data_rows, limit: SUMMARY_ROWS_LIMIT)]
end
335
+
336
# Splits table rows into [header_rows, data_rows].
#
# When a separator row is found, the header part runs up to and including
# that row. Without a separator the first row alone is taken as the header.
def split_table_rows(rows)
  boundary = rows.index { |row| row.match?(TABLE_SEPARATOR_RE) }
  header_size = boundary ? boundary + 1 : 1
  [rows.take(header_size), rows.drop(header_size)]
end
344
+
345
# Extracts markdown-style row strings from a TABLE record.
#
# A payload containing "rows=" is treated as the structured key=value form;
# otherwise the payload is split on the escaped row separator and escaped
# pipes are restored. Returns [] for a blank payload.
def parse_table_rows(line)
  payload = line.delete_prefix("TABLE|")
  return [] if payload.strip.empty?

  if payload.include?("rows=")
    parse_structured_table_rows(payload)
  else
    raw_rows = payload.split(TABLE_ROW_SEPARATOR_ESCAPED_RE)
    raw_rows.map { |raw| raw.gsub("\\|", "|").strip }
  end
end
353
+
354
# Rebuilds markdown-style rows from a structured TABLE payload made of
# pipe-separated key=value fields, reading the "cols" and "rows" fields.
#
# Fields are split on unescaped pipes only, so an escaped pipe ("\|") inside
# a column name or cell stays part of its value and is unescaped afterwards
# (a plain split on "|" would truncate such values).
#
# Returns a header row and a "---" separator row when columns are present,
# followed by one row per decoded data row.
def parse_structured_table_rows(payload)
  fields = split_unescaped_pipes(payload).each_with_object({}) do |field, hash|
    key, value = field.split("=", 2)
    next if value.nil?

    hash[key] = value
  end

  columns = fields.fetch("cols", "").split(",").map { |cell| cell.gsub("\\|", "|").strip }.reject(&:empty?)
  data_rows = decode_structured_table_rows(fields.fetch("rows", ""))

  rows = []
  unless columns.empty?
    rows << "| #{columns.join(" | ")} |"
    rows << "|#{columns.map { "---" }.join("|")}|"
  end
  rows.concat(data_rows.map { |cells| "| #{cells.join(" | ")} |" })
end

# Splits text on pipes that are not preceded by an escaping backslash.
# Escape sequences are left untouched in the returned fields.
def split_unescaped_pipes(text)
  fields = [+""]
  escaped = false

  text.to_s.each_char do |char|
    if escaped
      fields.last << char
      escaped = false
    elsif char == "\\"
      fields.last << char
      escaped = true
    elsif char == "|"
      fields << +""
    else
      fields.last << char
    end
  end

  fields
end
370
+
371
# Decodes the structured "rows" value into an array of rows, each an array
# of stripped cell strings.
#
# ">" ends a cell, ";" ends a row, and a backslash makes the following
# character literal. A trailing row without a closing ";" is still emitted.
def decode_structured_table_rows(value)
  decoded = []
  row = []
  cell = +""
  escaping = false

  value.to_s.each_char do |char|
    if escaping
      cell << char
      escaping = false
      next
    end

    case char
    when "\\"
      escaping = true
    when ">"
      row << cell.strip
      cell = +""
    when ";"
      row << cell.strip
      decoded << row
      row = []
      cell = +""
    else
      cell << char
    end
  end

  # Flush whatever is left when the value does not end with ";".
  if !cell.empty? || !row.empty?
    row << cell.strip
    decoded << row
  end

  decoded
end
403
+
404
# Chooses up to +limit+ representative rows, returned in their original order.
#
# Candidates are the first row, the first row matching DENSE_FACT_RE (if any)
# and the last row; remaining slots are filled with the earliest rows not yet
# chosen. When there are more candidates than +limit+, the earliest win.
# Returns +rows+ itself when it already fits within the limit.
def select_key_rows(rows, limit:)
  return rows if rows.size <= limit

  picked = [0]
  dense_at = rows.index { |row| row.match?(DENSE_FACT_RE) }
  picked << dense_at if dense_at
  picked << rows.size - 1
  picked.uniq!

  rows.each_index do |index|
    break if picked.size >= limit

    picked << index unless picked.include?(index)
  end

  picked.sort.first(limit).map { |index| rows[index] }
end
418
+
419
# True when at least one row matches SENSITIVE_TABLE_RE.
def sensitive_table?(rows)
  Array(rows).each do |row|
    return true if row.match?(SENSITIVE_TABLE_RE)
  end
  false
end
422
+
423
# Builds a table id of the form "<slug>_t<index>", where the slug is the
# source's base name without extension, lower-cased, with every run of
# characters outside a-z/0-9 turned into a single underscore and edge
# underscores removed. Falls back to "source" when the slug is empty.
def table_record_id(source_label, table_index)
  label = source_label.to_s
  stem = File.basename(label, File.extname(label))
  slug = stem.downcase.gsub(/[^a-z0-9]+/, "_").gsub(/\A_+|_+\z/, "")
  slug = "source" if slug.empty?
  "#{slug}_t#{table_index}"
end
428
+
429
# True when the record starts with one of EXAMPLE_PAYLOAD_PREFIXES.
def example_payload_line?(line)
  EXAMPLE_PAYLOAD_PREFIXES.any? { |prefix| line.start_with?(prefix) }
end
432
+
433
# Extracts the tool name from an "EXAMPLE|tool=..." record, or "" when the
# record has no tool field in that position.
#
# The capture accepts backslash-escaped characters, so an escaped pipe inside
# the tool name no longer ends the field early; escaped pipes are unescaped
# in the returned value.
def example_tool(line)
  line[/\AEXAMPLE\|tool=((?:\\.|[^|])+)/, 1].to_s.gsub("\\|", "|")
end
436
+
437
# Remembers the first source in which an example for +tool+ was seen.
# Later registrations for the same tool leave the existing entry untouched.
# Returns the registry entry for the tool.
def register_example(tool, source_label)
  existing = @example_registry[tool]
  return existing if existing

  @example_registry[tool] = {"source" => source_label}
end
440
+
441
# Resolves the label to display for +source+.
#
# Without a mapping the source itself is returned. Otherwise the mapping is
# consulted first by expanded path, then by the path as given, and the source
# is the fallback when neither key yields a value.
def display_source(source, source_paths)
  return source unless source_paths

  [File.expand_path(source), source].each do |key|
    mapped = source_paths[key]
    return mapped if mapped
  end

  source
end
446
+
447
# Returns +source+ relative to the current working directory when it lies
# inside it; otherwise returns +source+ unchanged.
#
# A path counts as outside only when its relative form is ".." or starts with
# a "../" component, so in-project names that merely begin with two dots
# (e.g. "..cache/notes.md") are still relativized.
# Paths that cannot be related to the working directory (Pathname raises
# ArgumentError) are returned unchanged.
def source_label(source)
  relative = Pathname.new(source).relative_path_from(Pathname.new(Dir.pwd)).to_s
  outside = relative == ".." || relative.start_with?("../")
  outside ? source : relative
rescue ArgumentError
  source
end
456
+
457
# Appends the pending section header to +lines+ when there is one;
# does nothing for nil.
def flush_pending_section!(lines, pending_section)
  return if pending_section.nil?

  lines << pending_section
end
460
+
461
# Shortens a SUMMARY record to its first sentence (text up to the first
# ".", "!" or "?" that is followed by whitespace) and re-emits it as a
# summary line. Falls back to the full text when no first sentence remains
# after stripping.
def compact_summary_line(line)
  body = line.delete_prefix("SUMMARY|")
  lead = body.split(/(?<=[.!?])\s+/).first.to_s.strip
  chosen = if lead.empty?
    body
  else
    lead
  end
  Ace::Compressor::Models::ContextPack.summary_line(chosen)
end
467
+ end
468
+ end
469
+ end
470
+ end