ace-compressor 0.24.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.ace-defaults/compressor/config.yml +11 -0
- data/.ace-defaults/nav/protocols/tmpl-sources/ace-compressor.yml +10 -0
- data/CHANGELOG.md +357 -0
- data/README.md +46 -0
- data/Rakefile +15 -0
- data/exe/ace-compressor +13 -0
- data/handbook/templates/agent/minify-single-source.template.md +34 -0
- data/lib/ace/compressor/atoms/canonical_block_transformer.rb +341 -0
- data/lib/ace/compressor/atoms/compact_policy_classifier.rb +130 -0
- data/lib/ace/compressor/atoms/markdown_parser.rb +190 -0
- data/lib/ace/compressor/atoms/retention_reporter.rb +111 -0
- data/lib/ace/compressor/cli/commands/benchmark.rb +51 -0
- data/lib/ace/compressor/cli/commands/compress.rb +89 -0
- data/lib/ace/compressor/cli.rb +23 -0
- data/lib/ace/compressor/models/context_pack.rb +175 -0
- data/lib/ace/compressor/molecules/cache_store.rb +301 -0
- data/lib/ace/compressor/molecules/input_resolver.rb +98 -0
- data/lib/ace/compressor/organisms/agent_compressor.rb +325 -0
- data/lib/ace/compressor/organisms/benchmark_runner.rb +172 -0
- data/lib/ace/compressor/organisms/compact_compressor.rb +470 -0
- data/lib/ace/compressor/organisms/compression_runner.rb +315 -0
- data/lib/ace/compressor/organisms/exact_compressor.rb +187 -0
- data/lib/ace/compressor/version.rb +7 -0
- data/lib/ace/compressor.rb +109 -0
- metadata +156 -0
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ace
|
|
4
|
+
module Compressor
|
|
5
|
+
module Atoms
|
|
6
|
+
# Converts parsed Markdown block hashes (+:heading+, +:text+, +:list+,
# +:fenced_code+, +:table+, +:unresolved+) into ContextPack record lines.
#
# The transformer is stateful across one #call: it remembers the current
# section slug, the tool named by the latest "Example: ..." heading/text and
# the last text paragraph, and uses them to label subsequent records.
class CanonicalBlockTransformer
  RULE_RE = /\b(?:must|must not|never|required|required to|should|shall|shall not|cannot|can't|do not)\b/i
  CONSTRAINT_RE = /\b(?:constraint|no more than|at most|must not|never|cannot)\b/i
  RULE_SECTION_RE = /(rules?|guidelines?|policy|requirements?|constraints?|musts?)/i
  SUMMARY_SECTION_RE = /(overview|summary|vision|purpose|goal|why|motivation|introduction|intro)/i
  EXAMPLE_HEADING_RE = /\Aexample\s*:\s*(.+)\z/i
  PROBLEM_SECTION_RE = /(problems?|issues?|risks?|pitfalls?|drawbacks?)/i
  PROBLEM_CONTEXT_RE = /\b(?:suffer from|struggle with|problems?|issues?|risks?|pitfalls?|drawbacks?|pain points?)\b/i
  TREE_LINE_RE = /[│├└╰]--/
  FILE_PATH_RE = /\A(?:\.{1,2}\/)?[A-Za-z0-9._-]+(?:\/[A-Za-z0-9._-]+)*\z/
  SHELL_LANGS = %w[bash sh shell zsh fish cmd powershell ps1].freeze
  SHELL_CONTROL_RE = /\A(?:#|if\b|then\b|else\b|elif\b|fi\b|for\b|while\b|until\b|do\b|done\b|case\b|esac\b|function\b|\{|\})/
  CONTEXTPACK_PREFIXES = %w[
    H FILE POLICY FIDELITY REFUSAL GUIDANCE FALLBACK SEC SUMMARY FACT RULE CONSTRAINT
    PROBLEMS LIST EXAMPLE CMD FILES TREE CODE TABLE LOSS EXAMPLE_REF U
  ].freeze
  LIST_NARRATIVE_MIN_CHARS = 48
  LIST_NARRATIVE_MIN_WORDS = 6
  LIST_STOPWORDS = %w[a an and as at by for from in into is of on or that the this to via with within].freeze
  LIST_TOKEN_MAP = {
    "architecture" => "arch",
    "architectural" => "arch",
    "configuration" => "config",
    "documentation" => "docs",
    "generation" => "gen",
    "management" => "mgmt",
    "repository" => "repo",
    "repositories" => "repos",
    "development" => "dev",
    "integration" => "integr",
    "execution" => "exec",
    "reporting" => "reports",
    "organization" => "org",
    "organizations" => "orgs",
    "capabilities" => "caps",
    "capability" => "cap",
    "foundation" => "base",
    "tracking" => "track",
    "powered" => "pwr",
    "detected" => "detect",
    "matching" => "match"
  }.freeze

  # @param source [String, nil] path or name of the document being
  #   transformed; only its basename is used, as the last-resort tree label.
  def initialize(source)
    @source = source
    @current_section = nil
    @example_tool = nil
    @last_text = nil
  end

  # Transforms blocks in document order.
  #
  # @param blocks [Array<Hash>, nil] parsed blocks; entries that are not a
  #   Hash or that lack +:type+ are skipped, unknown types are ignored.
  # @return [Array] the records produced by the ContextPack builders (plus
  #   raw lines of nested ContextPack code blocks), with nils removed.
  def call(blocks)
    lines = []

    Array(blocks).each do |block|
      next unless block.is_a?(Hash) && block[:type]

      case block[:type]
      when :heading
        lines << section_line(block)
      when :text
        lines << text_record(block[:text])
      when :list
        lines.concat list_lines(block)
      when :fenced_code
        lines.concat fenced_code_lines(block)
      when :table
        lines << table_line(block)
      when :unresolved
        lines << unresolved_line(block)
      end
    end

    # Builders signal "nothing to emit" with nil (empty heading/text/table).
    lines.compact
  end

  private

  # Record builder; resolved lazily on each call rather than at load time.
  def pack
    Ace::Compressor::Models::ContextPack
  end

  # Emits the record for a heading and updates the section/example state.
  # Returns nil when the heading has no text left after normalization.
  def section_line(block)
    heading = normalize_heading_text(normalize_inline(block[:text].to_s))
    return nil if heading.empty?

    # Any heading starts a fresh context for problem-list detection.
    @last_text = nil

    match = heading.match(EXAMPLE_HEADING_RE)
    if match
      @example_tool = heading_tool_slug(match[1].to_s)
      # NOTE(review): the text path uses ContextPack.example_line while this
      # path emits a literal; confirm both produce the same record format.
      "EXAMPLE|tool=#{@example_tool}"
    else
      @example_tool = nil
      @current_section = heading_slug(heading)
      pack.section_line(@current_section)
    end
  end

  # Emits the record for a paragraph: an example marker, or a
  # summary/rule/constraint/fact line. Returns nil for empty text.
  def text_record(raw_text)
    text = normalize_inline(raw_text.to_s)
    return nil if text.empty?

    @last_text = text

    example_match = text.match(EXAMPLE_HEADING_RE)
    if example_match
      @example_tool = heading_tool_slug(example_match[1].to_s)
      return pack.example_line(@example_tool)
    end

    case text_kind(text)
    when :summary then pack.summary_line(text)
    when :constraint then pack.constraint_line(text)
    when :rule then pack.rule_line(text)
    else pack.fact_line(text)
    end
  end

  # Classifies a normalized paragraph. RULE_RE is tested before
  # CONSTRAINT_RE, so text containing "must not", "never" or "cannot" is
  # reported as a rule; only the remaining CONSTRAINT_RE phrases reach
  # :constraint.
  def text_kind(text)
    return :summary if text_summary?
    return :rule if RULE_RE.match?(text)
    return :constraint if CONSTRAINT_RE.match?(text)
    return :rule if RULE_SECTION_RE.match?(@current_section.to_s)

    :fact
  end

  # Emits a single PROBLEMS or LIST record for the list, or nothing when
  # every item slugs to an empty string.
  def list_lines(block)
    items = Array(block[:items]).map { |item| list_item_slug(item.to_s) }.reject(&:empty?)
    return [] if items.empty?
    return [pack.problems_line(items)] if problem_list_context?

    list_key = @current_section.to_s.empty? ? "items" : @current_section
    [pack.list_line(list_key, items)]
  end

  # Emits records for a fenced code block. Checks run in priority order:
  # nested ContextPack payload, file list, tree, shell script, shell
  # commands, generic code.
  def fenced_code_lines(block)
    code_lines = block[:content].to_s.lines.map(&:strip).reject(&:empty?)
    return [] if code_lines.empty?

    nested_lines = nested_contextpack_lines(code_lines)
    return nested_lines if nested_lines

    language = block[:language].to_s.strip.downcase
    if file_list_block?(code_lines)
      [pack.files_line(file_label, code_lines)]
    elsif tree_block?(code_lines)
      [pack.tree_line(tree_label, code_lines.join(" "))]
    elsif shell_script_block?(language, code_lines)
      [pack.code_line(language.empty? ? "bash" : language, code_lines.join(" "))]
    elsif shell_command_block?(language, code_lines)
      code_lines.map { |line| pack.cmd_line(line) }
    else
      [pack.code_line(language.empty? ? "code" : language, code_lines.join(" "))]
    end
  end

  # Emits a TABLE record, or nil when the block has no rows.
  def table_line(block)
    rows = Array(block[:rows])
    return nil if rows.empty?

    pack.table_line(rows)
  end

  # Emits the record for content the parser could not resolve.
  def unresolved_line(block)
    pack.unresolved_line(block[:kind].to_s, block[:text].to_s)
  end

  # True when the current section slug looks like an overview/summary.
  def text_summary?
    return false if @current_section.to_s.empty?

    SUMMARY_SECTION_RE.match?(@current_section.to_s)
  end

  # True for a short block of shell commands: a declared shell language,
  # or no language and every line starting with a command-like word.
  def shell_command_block?(language, lines)
    return false if shell_script_block?(language, lines)
    return true if SHELL_LANGS.include?(language)
    return false unless language.empty?

    lines.all? { |line| line.match?(/\A[a-zA-Z0-9_.\/-]+(?:\s+.+)?\z/) }
  end

  # True for shell content that should stay one CODE record: more than
  # three lines, or any comment/control-flow line.
  def shell_script_block?(language, lines)
    return false if lines.empty?

    shell_language = SHELL_LANGS.include?(language)
    inferred_shell = language.empty? && lines.all? { |line| shellish_line?(line) }
    return false unless shell_language || inferred_shell

    lines.size > 3 || lines.any? { |line| shell_script_line?(line) }
  end

  # True when any line contains tree-drawing connectors.
  def tree_block?(lines)
    lines.any? { |line| TREE_LINE_RE.match?(line) || line.match?(/\A[|` ]*[├└]──/) }
  end

  # True when every line is a bare relative file path.
  def file_list_block?(lines)
    return false if lines.empty?

    lines.all? { |line| FILE_PATH_RE.match?(line) }
  end

  # Returns the code block's lines unchanged (minus any ContextPack header)
  # when all of them already are ContextPack records; nil otherwise.
  def nested_contextpack_lines(lines)
    payload = Array(lines).reject { |line| contextpack_header_line?(line) }
    return nil if payload.empty?
    return nil unless payload.all? { |line| contextpack_line?(line) }

    payload
  end

  # A ContextPack record is "<PREFIX>|<payload>". The separator is
  # required so that a plain code line consisting of a bare prefix word
  # (e.g. "H" or "TABLE") is not mistaken for a record and emitted raw.
  def contextpack_line?(line)
    prefix, payload = line.to_s.split("|", 2)
    !payload.nil? && CONTEXTPACK_PREFIXES.include?(prefix)
  end

  def contextpack_header_line?(line)
    line.to_s.start_with?("H|ContextPack/")
  end

  # Label for TREE records: example tool, else current section, else the
  # source basename without extension, else "tree" (nil/blank source).
  def tree_label
    return @example_tool unless @example_tool.to_s.empty?
    return @current_section unless @current_section.to_s.empty?

    label = File.basename(@source.to_s).sub(/\.[^.]+\z/, "")
    label.empty? ? "tree" : label
  end

  # Label for FILES records: example tool, else current section, else "files".
  def file_label
    return @example_tool unless @example_tool.to_s.empty?
    return @current_section unless @current_section.to_s.empty?

    "files"
  end

  def section_contains_problems?(section)
    PROBLEM_SECTION_RE.match?(section.to_s.tr("_", " "))
  end

  # A list is a problem list when the section name or the paragraph right
  # before it talks about problems/issues/risks.
  def problem_list_context?
    section_contains_problems?(@current_section) || PROBLEM_CONTEXT_RE.match?(@last_text.to_s)
  end

  # snake_case slug for section names and short list items.
  def heading_slug(text)
    slugify(text, separator: "_", fallback: "section")
  end

  # kebab-case slug for tool names taken from "Example: <tool>".
  def heading_tool_slug(text)
    slugify(text, separator: "-", fallback: "tool")
  end

  # Shared slug pipeline: drop heading decoration, lowercase, remove quotes,
  # collapse every run of non-alphanumerics into one separator and trim it
  # from both ends. Falls back to +fallback+ when nothing is left.
  def slugify(text, separator:, fallback:)
    slug = normalize_heading_text(text)
      .downcase
      .gsub(/["']/, "")
      .gsub(/\P{Alnum}+/, separator)
      .delete_prefix(separator)
      .delete_suffix(separator)
    slug.empty? ? fallback : slug
  end

  # Short items get a plain slug; sentence-like items get a compacted one.
  # Inline markup is stripped first in both cases so link targets do not
  # leak into the slug (compact_phrase_slug normalizes internally).
  def list_item_slug(text)
    return heading_slug(normalize_inline(text.to_s)) unless narrative_list_item?(text)

    compact_phrase_slug(text)
  end

  # An item is narrative when it is longer than LIST_NARRATIVE_MIN_CHARS
  # characters or has at least LIST_NARRATIVE_MIN_WORDS words.
  def narrative_list_item?(text)
    raw = normalize_inline(text.to_s)
    raw.length > LIST_NARRATIVE_MIN_CHARS || raw.split(/\s+/).length >= LIST_NARRATIVE_MIN_WORDS
  end

  # Compacts a sentence into a slug: drops stopwords, abbreviates known
  # long words and removes immediate repetitions.
  def compact_phrase_slug(text)
    tokens = normalize_heading_text(normalize_inline(text.to_s))
      .downcase
      .gsub(/["']/, "")
      .scan(/\p{Alnum}+/) # Unicode-aware, consistent with slugify
      .filter_map do |token|
        next if LIST_STOPWORDS.include?(token)

        LIST_TOKEN_MAP.fetch(token, token)
      end

    # Abbreviation can create neighbours like "arch arch"; keep one.
    deduped = tokens.each_with_object([]) do |token, result|
      result << token unless result.last == token
    end

    value = deduped.join("_")
    value.empty? ? heading_slug(text) : value
  end

  # Loose check that a line starts with something command-like.
  def shellish_line?(line)
    line.match?(/\A[a-zA-Z0-9_.\/\-$"'`#\[\]()=:;]+(?:\s+.+)?\z/)
  end

  # True for comment lines and shell control-flow keywords/braces.
  def shell_script_line?(line)
    line.match?(SHELL_CONTROL_RE)
  end

  # Removes leading "#" markers, a leading ordinal ("1.", "2)") and leading
  # pictographs from heading-like text.
  def normalize_heading_text(text)
    text
      .to_s
      .strip
      .sub(/\A[#\s]+/, "")
      .sub(/\A\d+(?:[.)])?\s*/, "")
      .gsub(/^\p{Extended_Pictographic}+\s*/, "")
  end

  # Strips inline Markdown (links keep their label; emoji, emphasis,
  # backticks and blockquote markers are removed) and collapses whitespace.
  def normalize_inline(text)
    text
      .gsub(/\[([^\]]+)\]\([^)]+\)/, "\\1")
      .gsub(emoji_re, "")
      .gsub(/\*{1,3}([^*]+)\*{1,3}/, "\\1")
      .gsub(/`([^`]+)`/, "\\1")
      .gsub(/^(?:\s*>+\s*)+/, "")
      .gsub(/\s+/, " ")
      .strip
  end

  def emoji_re
    /[\u{1F300}-\u{1FAFF}\u{2600}-\u{26FF}\u{2700}-\u{27BF}]+/u
  end
end
|
|
339
|
+
end
|
|
340
|
+
end
|
|
341
|
+
end
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ace
|
|
4
|
+
module Compressor
|
|
5
|
+
module Atoms
|
|
6
|
+
# Classifies a source into compact-mode policy categories so the caller
# can choose the safest compression strategy for that document.
#
# The decision is based on counted signals only: narrative/rule headings,
# modal "rule" wording in paragraphs and list items, the number of text and
# list blocks, and whether the source path looks like an explanatory doc.
class CompactPolicyClassifier
  # Narrative file-name hints commonly used for explanatory docs.
  NARRATIVE_FILE_HINT_RE = /(?:^|\/)(?:readme|vision|guide|guides|architecture)(?:\.|\/|$)/i
  # Headings that signal descriptive/explanatory prose.
  NARRATIVE_HEADING_RE = /\b(?:overview|vision|introduction|core principles|why|how it works|purpose|motivation|summary|guide)\b/i
  # Headings that indicate normative policy or constraints.
  RULE_HEADING_RE = /\b(?:decision|impact|policy|rule|rules|requirement|requirements|constraint|constraints)\b/i
  # Modal language that usually indicates must-follow rules.
  RULE_TEXT_RE = /\b(?:must|must not|never|required|requires|should|shall|cannot|can't|do not|only)\b/i
  # Compression action chosen for each document class.
  ACTION_BY_CLASS = {
    "narrative-heavy" => "aggressive_compact",
    "mixed" => "compact_with_exact_rule_sections",
    "rule-heavy" => "refuse_compact"
  }.freeze
  # Action used for any class without an entry above (i.e. "unknown").
  DEFAULT_ACTION = "conservative_compact"

  # @param source [#to_s] path or label of the document; matched against
  #   NARRATIVE_FILE_HINT_RE.
  # @param blocks [Array<Hash>, nil] parsed blocks with +:type+, +:text+
  #   and (for lists) +:items+; entries that are not Hashes are ignored.
  # @return [Hash{String=>String}] +"class"+ (one of "rule-heavy", "mixed",
  #   "narrative-heavy", "unknown") and the matching +"action"+.
  def call(source:, blocks:)
    doc_class = classify(signal_stats(source, blocks))
    {
      "class" => doc_class,
      "action" => action_for(doc_class)
    }
  end

  private

  # Precedence matters: rule-heavy wins over mixed, mixed over narrative.
  def classify(stats)
    return "rule-heavy" if rule_heavy?(stats)
    return "mixed" if mixed?(stats)
    return "narrative-heavy" if narrative_heavy?(stats)

    "unknown"
  end

  def action_for(doc_class)
    ACTION_BY_CLASS.fetch(doc_class, DEFAULT_ACTION)
  end

  # Counts every signal the predicates below rely on.
  def signal_stats(source, blocks)
    # Skip nil/non-Hash entries instead of raising on block[:type]; the
    # transformer that consumes the same blocks skips them as well.
    block_list = Array(blocks).select { |block| block.is_a?(Hash) }

    rule_heading_hits = block_list.count { |block| rule_heading?(block) }
    rule_text_hits = block_list.count { |block| rule_text?(block) }
    rule_item_hits = block_list.sum { |block| rule_list_hits(block) }

    {
      heading_hits: block_list.count { |block| narrative_heading?(block) },
      text_blocks: block_list.count { |block| block[:type] == :text },
      list_blocks: block_list.count { |block| block[:type] == :list },
      file_hint: source.to_s.match?(NARRATIVE_FILE_HINT_RE),
      rule_heading_hits: rule_heading_hits,
      rule_signal_count: rule_heading_hits + rule_text_hits + rule_item_hits
    }
  end

  # Prose-dominated document: at least two paragraphs, no more lists than
  # paragraphs, and either a narrative file name or two narrative headings.
  def narrative_heavy?(stats)
    text_blocks = stats.fetch(:text_blocks)
    return false unless text_blocks >= 2 && text_blocks >= stats.fetch(:list_blocks)

    stats.fetch(:file_hint) || stats.fetch(:heading_hits) >= 2
  end

  # Document with some rule signals (at least two) that is not rule-heavy
  # and still has narrative content.
  def mixed?(stats)
    return false if stats.fetch(:rule_signal_count) < 2
    return false if rule_heavy?(stats)
    # A narrative-named file without any rule heading stays narrative even
    # if its prose uses modal wording.
    return false if stats.fetch(:file_hint) && stats.fetch(:rule_heading_hits).zero?

    narrative_heavy?(stats) || stats.fetch(:heading_hits) >= 1 || stats.fetch(:text_blocks) >= 2
  end

  # Many rule signals with little narrative context, or several rule
  # signals under at least two rule headings.
  def rule_heavy?(stats)
    rule_signal_count = stats.fetch(:rule_signal_count)
    narrative_signals = stats.fetch(:heading_hits) + (stats.fetch(:file_hint) ? 1 : 0)

    return true if rule_signal_count >= 6 && narrative_signals <= 1

    rule_signal_count >= 5 && stats.fetch(:rule_heading_hits) >= 2
  end

  def narrative_heading?(block)
    block[:type] == :heading && NARRATIVE_HEADING_RE.match?(block[:text].to_s)
  end

  def rule_heading?(block)
    block[:type] == :heading && RULE_HEADING_RE.match?(block[:text].to_s)
  end

  def rule_text?(block)
    block[:type] == :text && RULE_TEXT_RE.match?(block[:text].to_s)
  end

  # Number of items in a list block that use rule wording (0 for non-lists).
  def rule_list_hits(block)
    return 0 unless block[:type] == :list

    Array(block[:items]).count { |item| RULE_TEXT_RE.match?(item.to_s) }
  end
end
|
|
128
|
+
end
|
|
129
|
+
end
|
|
130
|
+
end
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Ace
|
|
4
|
+
module Compressor
|
|
5
|
+
module Atoms
|
|
6
|
+
# Line-oriented Markdown parser that splits a document into flat block
# hashes: +:heading+, +:text+, +:list+, +:fenced_code+, +:table+ and
# +:unresolved+ (image-only lines). YAML frontmatter, horizontal rules and
# bare blockquote markers are dropped. Every line is stripped before
# classification, so nesting/indentation is not represented.
class MarkdownParser
  HEADING_RE = /\A(#+)\s+(.+)\z/
  BULLET_LIST_RE = /\A(?:\s*)[-*+]\s+(.+)\z/
  NUMBERED_LIST_RE = /\A(?:\s*)\d+\.\s+(.+)\z/
  IMAGE_ONLY_RE = /\A!\[[^\]]*\]\([^)]+\)\z/
  TABLE_SEPARATOR_RE = /\A\|?[-\s:|]+\|?\z/
  FENCE_START_RE = /\A```/
  # Opening fence: three or more backticks plus an info string that itself
  # contains no backtick (so "```x``` inline" is not treated as a fence).
  FENCE_OPEN_RE = /\A(`{3,})([^`]*)\z/

  # @param text [#to_s] Markdown source.
  # @return [Array<Hash>] blocks in document order; empty for blank input.
  def call(text)
    body = strip_frontmatter(text.to_s)
    return [] if body.strip.empty?

    blocks = []
    paragraph_lines = []
    lines = body.lines
    index = 0

    while index < lines.length
      stripped = lines[index].strip

      # Checks run in priority order; every non-paragraph branch first
      # closes the paragraph that was being collected.
      if stripped.empty? || layout_separator?(stripped) || blockquote_marker?(stripped)
        flush_paragraph(blocks, paragraph_lines)
        index += 1
      elsif (fence = stripped.match(FENCE_OPEN_RE))
        flush_paragraph(blocks, paragraph_lines)
        block, index = read_fenced_code(lines, index, fence)
        blocks << block
      elsif image_only_line?(stripped)
        flush_paragraph(blocks, paragraph_lines)
        blocks << {type: :unresolved, text: stripped, kind: "image-only"}
        index += 1
      elsif table_start?(lines, index)
        flush_paragraph(blocks, paragraph_lines)
        block, index = read_table(lines, index)
        blocks << block
      elsif list_start?(stripped)
        flush_paragraph(blocks, paragraph_lines)
        block, index = read_list(lines, index)
        blocks << block
      elsif (heading = heading_block(stripped))
        flush_paragraph(blocks, paragraph_lines)
        blocks << heading
        index += 1
      else
        paragraph_lines << stripped
        index += 1
      end
    end

    flush_paragraph(blocks, paragraph_lines)
    blocks
  end

  private

  # Removes a leading "---" ... "---" frontmatter section. The text is
  # returned unchanged when it does not start with "---" or the section is
  # never closed.
  def strip_frontmatter(text)
    return text unless text.start_with?("---\n", "---\r\n")

    lines = text.lines
    closing = (1...lines.length).find { |i| lines[i].strip == "---" }
    return text if closing.nil?

    lines[(closing + 1)..].join
  end

  # Emits the collected paragraph lines as one :text block and resets them.
  def flush_paragraph(blocks, lines)
    return if lines.empty?

    blocks << {type: :text, text: lines.join(" ")}
    lines.clear
  end

  # Reads a fenced code block whose opening fence is at +index+.
  # The block ends at the first line made only of backticks that is at
  # least as long as the opening fence, so a longer outer fence can wrap
  # shorter inner fences. An unclosed fence runs to the end of the input.
  # Returns the block and the index of the first line after it.
  def read_fenced_code(lines, index, fence)
    closing_re = /\A`{#{fence[1].length},}\z/
    content = []
    cursor = index + 1

    while cursor < lines.length && !lines[cursor].strip.match?(closing_re)
      content << lines[cursor]
      cursor += 1
    end
    cursor += 1 if cursor < lines.length # step over the closing fence

    [{type: :fenced_code, language: fence[2].strip, content: content.join}, cursor]
  end

  # Reads consecutive non-blank lines containing "|" as table rows
  # (header and separator rows included). Returns the block and next index.
  def read_table(lines, index)
    rows = []
    cursor = index

    while cursor < lines.length
      candidate = lines[cursor]
      break if candidate.strip.empty? || !candidate.include?("|")

      rows << candidate.strip
      cursor += 1
    end

    [{type: :table, rows: rows}, cursor]
  end

  # Reads consecutive list-item lines. The list counts as ordered as soon
  # as one item uses a numeric marker. Returns the block and next index.
  def read_list(lines, index)
    items = []
    ordered = false
    cursor = index

    while cursor < lines.length
      item_line = lines[cursor].strip
      break unless list_start?(item_line)

      ordered ||= ordered_list_line?(item_line)
      items << strip_list_marker(item_line)
      cursor += 1
    end

    [{type: :list, ordered: ordered, items: items}, cursor]
  end

  # Builds a :heading block for "#".."######" headings; nil otherwise
  # (seven or more "#" fall through to paragraph text).
  def heading_block(line)
    match = line.match(HEADING_RE)
    return nil unless match && match[1].length <= 6

    {type: :heading, level: match[1].length, text: match[2].strip}
  end

  # Horizontal rule made of three or more "-", "*" or "_" characters.
  def layout_separator?(line)
    line.match?(/\A(?:[-*_]){3,}\z/)
  end

  def image_only_line?(line)
    line.match?(IMAGE_ONLY_RE)
  end

  # A line consisting only of ">" markers (an empty blockquote line).
  def blockquote_marker?(line)
    line.match?(/\A>+\z/)
  end

  def list_start?(line)
    line.match?(BULLET_LIST_RE) || line.match?(NUMBERED_LIST_RE)
  end

  def ordered_list_line?(line)
    line.match?(NUMBERED_LIST_RE)
  end

  # Removes exactly one leading list marker. Stripping only the marker that
  # made the line a list item keeps content such as "1. First step" intact
  # in a bullet like "- 1. First step".
  def strip_list_marker(line)
    match = line.match(BULLET_LIST_RE) || line.match(NUMBERED_LIST_RE)
    (match ? match[1] : line).strip
  end

  # A table starts on a line containing "|" that is directly followed by a
  # separator row.
  def table_start?(lines, index)
    current = lines[index]&.strip.to_s
    return false unless current.include?("|")

    lines[index + 1]&.strip.to_s.match?(TABLE_SEPARATOR_RE)
  end
end
|
|
188
|
+
end
|
|
189
|
+
end
|
|
190
|
+
end
|