inkmark 0.1.0-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +21 -0
- data/NOTICE +16 -0
- data/README.md +1166 -0
- data/lib/inkmark/3.3/inkmark.so +0 -0
- data/lib/inkmark/3.4/inkmark.so +0 -0
- data/lib/inkmark/4.0/inkmark.so +0 -0
- data/lib/inkmark/event.rb +342 -0
- data/lib/inkmark/native.rb +8 -0
- data/lib/inkmark/options.rb +698 -0
- data/lib/inkmark/toc.rb +40 -0
- data/lib/inkmark/version.rb +6 -0
- data/lib/inkmark.rb +711 -0
- data/sig/inkmark.rbs +219 -0
- metadata +178 -0
data/lib/inkmark.rb
ADDED
|
@@ -0,0 +1,711 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "date"
require "yaml"
|
|
4
|
+
|
|
5
|
+
# Inkmark is a very fast, feature-rich, AI-first CommonMark/GFM
|
|
6
|
+
# markdown renderer backed by the Rust pulldown-cmark parser.
|
|
7
|
+
#
|
|
8
|
+
# Default behavior: GFM extensions (tables, strikethrough, tasklists,
|
|
9
|
+
# footnotes) are enabled; raw HTML is suppressed. Override via options.
|
|
10
|
+
#
|
|
11
|
+
# ### Presets
|
|
12
|
+
#
|
|
13
|
+
# Four named bundles of options cover the common profiles:
|
|
14
|
+
#
|
|
15
|
+
# - +:gfm+ (the default): CommonMark + core GFM only.
|
|
16
|
+
# - +:commonmark+: strict CommonMark, no GFM.
|
|
17
|
+
# - +:recommended+: opinionated bundle for modern web content (smart
|
|
18
|
+
# punctuation, auto heading IDs, lazy images, autolinks + nofollow,
|
|
19
|
+
# URL scheme allowlists, emoji shortcodes, syntax highlighting,
|
|
20
|
+
# frontmatter).
|
|
21
|
+
# - +:trusted+: +:recommended+ plus raw-HTML pass-through. **Use only
|
|
22
|
+
# for fully trusted content.**
|
|
23
|
+
#
|
|
24
|
+
# See {Inkmark::Options::PRESETS}.
|
|
25
|
+
#
|
|
26
|
+
# ### Raw HTML safety
|
|
27
|
+
#
|
|
28
|
+
# Raw HTML is suppressed by default; every +<tag>+ in the source is
|
|
29
|
+
# escaped to text. Enable pass-through with +raw_html: true+ or the
|
|
30
|
+
# +:trusted+ preset **only for trusted input**. Inkmark does not
|
|
31
|
+
# sanitize raw HTML beyond the narrow GFM tagfilter; sanitize before rendering
|
|
32
|
+
# user-influenced content.
|
|
33
|
+
#
|
|
34
|
+
# @example Class-method shortcut
|
|
35
|
+
# Inkmark.to_html("**hello**")
|
|
36
|
+
# #=> "<p><strong>hello</strong></p>\n"
|
|
37
|
+
#
|
|
38
|
+
# @example Instance form with options
|
|
39
|
+
# g = Inkmark.new("# hi", options: { tables: false })
|
|
40
|
+
# g.to_html
|
|
41
|
+
#
|
|
42
|
+
# @example Mutable options after construction
|
|
43
|
+
# g = Inkmark.new("# hi")
|
|
44
|
+
# g.options.tables = false
|
|
45
|
+
# g.to_html
|
|
46
|
+
#
|
|
47
|
+
# @example Recommended profile
|
|
48
|
+
# Inkmark.to_html(md, options: { preset: :recommended })
|
|
49
|
+
class Inkmark
  # Common ancestor for Inkmark-specific runtime failures.
  class Error < StandardError
  end
end
|
|
53
|
+
|
|
54
|
+
require_relative "inkmark/version"
|
|
55
|
+
require_relative "inkmark/options"
|
|
56
|
+
require_relative "inkmark/event"
|
|
57
|
+
require_relative "inkmark/toc"
|
|
58
|
+
require_relative "inkmark/native"
|
|
59
|
+
|
|
60
|
+
class Inkmark
|
|
61
|
+
class << self
|
|
62
|
+
# Render +source+ markdown to HTML in one call.
|
|
63
|
+
#
|
|
64
|
+
# This is a class-method fast path that skips Inkmark instance and
|
|
65
|
+
# Options copy allocation for the common one-shot render pattern.
|
|
66
|
+
# When the caller passes +options: nil+ (the default), we reuse the
|
|
67
|
+
# cached frozen hash that {Inkmark::Options#to_native_hash_frozen}
|
|
68
|
+
# returns; the cache lives on the Options instance itself and is
|
|
69
|
+
# invalidated by the Options mutation methods, so
|
|
70
|
+
# +Inkmark.default_options.tables = false+ followed by
|
|
71
|
+
# +Inkmark.to_html(src)+ picks up the new value without stale-cache
|
|
72
|
+
# bugs.
|
|
73
|
+
#
|
|
74
|
+
# **Raw HTML safety.** +raw_html: false+ (the default) escapes
|
|
75
|
+
# every raw HTML tag in the source—safe for untrusted input.
|
|
76
|
+
# Enable +raw_html: true+ (or +preset: :trusted+) only for
|
|
77
|
+
# content you fully trust, and run the output through a dedicated
|
|
78
|
+
# HTML sanitizer before displaying it.
|
|
79
|
+
#
|
|
80
|
+
# @param source [String, nil] the markdown source to render
|
|
81
|
+
# @param options [Hash, Inkmark::Options, nil] rendering options; merged
|
|
82
|
+
# over {default_options} when a Hash is supplied. Accepts
|
|
83
|
+
# +preset: :name+ (see {Inkmark::Options::PRESETS}).
|
|
84
|
+
# @return [String] the rendered HTML
|
|
85
|
+
# @raise [TypeError] if +options+ is not a Hash, Inkmark::Options, or nil
|
|
86
|
+
# @example
|
|
87
|
+
# Inkmark.to_html("**bold**") #=> "<p><strong>bold</strong></p>\n"
|
|
88
|
+
# @example With a preset
|
|
89
|
+
# Inkmark.to_html(md, options: { preset: :recommended })
|
|
90
|
+
def to_html(source, options: nil)
  markdown = source.to_s
  # Nothing to render: skip option resolution and the native call entirely.
  return "" if markdown.empty?

  native_opts = resolve_frozen_options(options)
  _native_to_html(markdown, native_opts)
end
|
|
95
|
+
|
|
96
|
+
# Render +source+ markdown through the filter pipeline and serialize back
|
|
97
|
+
# to Markdown text.
|
|
98
|
+
#
|
|
99
|
+
# The same event-level filters as {to_html} are applied (emoji expansion,
|
|
100
|
+
# allowlists, autolink, etc.), then the event stream is serialized back to
|
|
101
|
+
# Markdown using pulldown-cmark-to-cmark. Use this as a preprocessing step
|
|
102
|
+
# in pipelines that consume Markdown: LLM prompts, secondary renderers,
|
|
103
|
+
# content storage.
|
|
104
|
+
#
|
|
105
|
+
# HTML-emitting filters (+syntax_highlight+, +images: { lazy: true }+,
|
|
106
|
+
# +links: { nofollow: true }+) embed raw HTML verbatim in the
|
|
107
|
+
# Markdown output when enabled. That is valid CommonMark but may
|
|
108
|
+
# break downstream consumers.
|
|
109
|
+
# See the "Markdown-to-Markdown pipeline" section in the README.
|
|
110
|
+
#
|
|
111
|
+
# @param source [String, nil] the markdown source to process
|
|
112
|
+
# @param options [Hash, Inkmark::Options, nil] rendering options
|
|
113
|
+
# @return [String] the filtered Markdown
|
|
114
|
+
def to_markdown(source, options: nil)
  markdown = source.to_s
  if markdown.empty?
    # Empty (or nil) input short-circuits before any option resolution.
    ""
  else
    _native_to_markdown(markdown, resolve_frozen_options(options))
  end
end
|
|
119
|
+
|
|
120
|
+
# Chunk +source+ by heading into an Array of section Hashes. Each
|
|
121
|
+
# section's +:content+ is filter-applied Markdown (emoji expanded,
|
|
122
|
+
# autolinks resolved, allowlists applied). Designed for feeding
|
|
123
|
+
# RAG / embedding pipelines that want pre-HTML chunks with clean
|
|
124
|
+
# content.
|
|
125
|
+
#
|
|
126
|
+
# Sections are hierarchical: a +##+ section's +:content+ includes
|
|
127
|
+
# any nested +###+ subsections, which also appear as their own
|
|
128
|
+
# entries. Content before the first heading (if any) is emitted
|
|
129
|
+
# as a preamble entry with +heading: nil+ and +level: 0+.
|
|
130
|
+
#
|
|
131
|
+
# Filter the returned array with plain +Enumerable+—by heading,
|
|
132
|
+
# level, id, or any other field. See the "Section extraction" section in
|
|
133
|
+
# the README for recipes.
|
|
134
|
+
#
|
|
135
|
+
# **HTML-emitting filters** (+syntax_highlight+, +images: { lazy: true }+,
|
|
136
|
+
# +links: { nofollow: true }+) embed raw HTML into +:content+ when
|
|
137
|
+
# enabled. For RAG pipelines you almost always want these off so
|
|
138
|
+
# chunks stay pure Markdown.
|
|
139
|
+
#
|
|
140
|
+
# @param source [String, nil] the markdown source
|
|
141
|
+
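# @param options [Hash, Inkmark::Options, nil] rendering options
# @param truncate [Hash, nil] optional per-section truncation spec with
#   +:chars+, +:words+, +:at+, +:marker+ keys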
|
|
142
|
+
# @return [Array<Hash>] section records
|
|
143
|
+
# @example Fetch one section
|
|
144
|
+
# Inkmark.chunks_by_heading(readme).find { |s| s[:heading] == "Installation" }
|
|
145
|
+
# @example Filter by heading pattern
|
|
146
|
+
# Inkmark.chunks_by_heading(readme).select { |s| s[:heading]&.match?(/install/i) }
|
|
147
|
+
# @example RAG chunking
|
|
148
|
+
# Inkmark.chunks_by_heading(readme).each do |s|
|
|
149
|
+
# embed_and_store("#{s[:heading]}\n\n#{s[:content]}") if s[:heading]
|
|
150
|
+
# end
|
|
151
|
+
def chunks_by_heading(source, options: nil, truncate: nil)
  markdown = source.to_s
  return [] if markdown.empty?

  # The resolved hash is mutated below, so it must be the mutable variant.
  native_opts = resolve_mutable_options(options)
  if truncate
    native_opts[:truncate] = normalize_truncate_params(truncate)
  end
  _native_chunks_by_heading(markdown, native_opts)
end
|
|
159
|
+
|
|
160
|
+
# Split +source+ into sliding-window chunks bounded by a character
|
|
161
|
+
# and/or word budget. Adjacent chunks can share trailing context
|
|
162
|
+
# via +overlap+, which preserves continuity for embedding models.
|
|
163
|
+
# Unlike {chunks_by_heading}, this ignores document structure and
|
|
164
|
+
# walks the filter-applied Markdown sequentially — useful for
|
|
165
|
+
# heading-free or heading-uneven documents.
|
|
166
|
+
#
|
|
167
|
+
# @param source [String, nil] the markdown source
|
|
168
|
+
# @param chars [Integer, nil] max characters per chunk
|
|
169
|
+
# @param words [Integer, nil] max Unicode words per chunk; at
|
|
170
|
+
# least one of +chars+/+words+ must be set
|
|
171
|
+
# @param overlap [Integer] chars carried from the end of the
|
|
172
|
+
# previous chunk into the start of the next. Defaults to 0.
|
|
173
|
+
# Must be less than +chars+ when +chars+ is set.
|
|
174
|
+
# @param at [Symbol] +:block+ (valid-Markdown cut, oversized
|
|
175
|
+
# blocks emit as their own chunk) or +:word+ (word-boundary
|
|
176
|
+
# cut, may split open constructs).
|
|
177
|
+
# @param options [Hash, Inkmark::Options, nil] rendering options
|
|
178
|
+
# @return [Array<Hash>] each +{index:, content:}+, plus
|
|
179
|
+
# +:character_count+/+:word_count+ when +statistics: true+
|
|
180
|
+
# @raise [ArgumentError] on invalid parameter combinations
|
|
181
|
+
# @example
|
|
182
|
+
# Inkmark.chunks_by_size(readme, chars: 500, overlap: 50)
|
|
183
|
+
def chunks_by_size(source, chars: nil, words: nil, overlap: 0, at: :block, options: nil)
  markdown = source.to_s
  return [] if markdown.empty?

  # Options are resolved before the window params are validated, so an
  # invalid +options+ value is reported first.
  native_opts = resolve_mutable_options(options)
  window = normalize_window_params(chars: chars, words: words, overlap: overlap, at: at)
  native_opts[:__window] = window
  _native_chunks_by_size(markdown, native_opts)
end
|
|
193
|
+
|
|
194
|
+
# Truncate a Markdown document to fit a char and/or word budget.
|
|
195
|
+
# Returns filter-applied Markdown cut at either the last block
|
|
196
|
+
# boundary that fits (+at: :block+) or the last Unicode word
|
|
197
|
+
# boundary that fits (+at: :word+).
|
|
198
|
+
#
|
|
199
|
+
# Designed as a preprocessing step for LLM context-window budgeting
|
|
200
|
+
# and RAG chunk normalization. The marker (default +"…"+) is
|
|
201
|
+
# appended only when truncation actually occurred and counts toward
|
|
202
|
+
# the budget, so +chars: 4000+ always yields output ≤ 4000
|
|
203
|
+
# codepoints.
|
|
204
|
+
#
|
|
205
|
+
# @param source [String, nil] the markdown source
|
|
206
|
+
# @param chars [Integer, nil] maximum codepoint count; at least
|
|
207
|
+
# one of +chars+/+words+ must be set
|
|
208
|
+
# @param words [Integer, nil] maximum Unicode word count
|
|
209
|
+
# @param at [Symbol] +:block+ (valid-Markdown cut) or +:word+
|
|
210
|
+
# (word-boundary cut; may split open constructs)
|
|
211
|
+
# @param marker [String, nil] appended when truncation occurs.
|
|
212
|
+
# Pass +nil+ to suppress. Defaults to +"…"+ (U+2026).
|
|
213
|
+
# @param options [Hash, Inkmark::Options, nil] rendering options
|
|
214
|
+
# @return [String] truncated Markdown, or the source unchanged
|
|
215
|
+
# when it already fits
|
|
216
|
+
# @raise [ArgumentError] if neither chars nor words is set,
|
|
217
|
+
# +at+ is not +:block+/+:word+, or the marker exceeds the budget
|
|
218
|
+
def truncate_markdown(source, chars: nil, words: nil, at: :block, marker: "…", options: nil)
  markdown = source.to_s
  return "" if markdown.empty?

  # Budget validation runs before option resolution.
  budget = normalize_truncate_params(chars: chars, words: words, at: at, marker: marker)
  native_opts = resolve_frozen_options(options)
  _native_truncate_markdown(markdown, budget, native_opts)
end
|
|
227
|
+
|
|
228
|
+
# Render +source+ through the filter pipeline and serialize to plain
|
|
229
|
+
# text. Markdown syntax (emphasis, headings, list bullets, fences)
|
|
230
|
+
# is stripped; inline content is preserved. Links become
|
|
231
|
+
# +"text (url)"+; images become +"alt (src)"+; tables are
|
|
232
|
+
# tab-separated; code blocks keep their raw body.
|
|
233
|
+
#
|
|
234
|
+
# Designed as a preprocessor for embedding models, token counting,
|
|
235
|
+
# LLM input, and any downstream consumer that treats Markdown
|
|
236
|
+
# syntax as noise.
|
|
237
|
+
#
|
|
238
|
+
# @param source [String, nil] the markdown source
|
|
239
|
+
# @param options [Hash, Inkmark::Options, nil] rendering options
|
|
240
|
+
# @return [String] plain-text output
|
|
241
|
+
def to_plain_text(source, options: nil)
  markdown = source.to_s
  # Empty (or nil) input yields an empty string without touching the native side.
  return "" if markdown.empty?

  native_opts = resolve_frozen_options(options)
  _native_to_plain_text(markdown, native_opts)
end
|
|
246
|
+
|
|
247
|
+
# Normalize and validate truncation params coming from either the
|
|
248
|
+
# {.truncate_markdown} kwargs or the {.chunks_by_heading}
|
|
249
|
+
# +truncate:+ kwarg. Accepts a Hash with +:chars+/+:words+/+:at+/
|
|
250
|
+
# +:marker+ keys, or positional kwargs (collected by the caller
|
|
251
|
+
# into a Hash). Returns a Hash ready to hand to the native side.
|
|
252
|
+
#
|
|
253
|
+
# @api private
|
|
254
|
+
def normalize_truncate_params(params)
  # Accept anything hash-like (responds to #to_hash), then insist on a real Hash.
  params = params.to_hash if params.respond_to?(:to_hash)
  unless params.is_a?(Hash)
    raise TypeError, "truncate must be a Hash, got #{params.class}"
  end

  unknown = params.keys - %i[chars words at marker]
  unless unknown.empty?
    raise ArgumentError, "unknown truncate key(s): #{unknown.inspect}; " \
      "expected :chars, :words, :at, :marker"
  end

  chars = params[:chars]
  words = params[:words]
  at = params.fetch(:at, :block)
  marker = params.fetch(:marker, "…")

  if chars.nil? && words.nil?
    raise ArgumentError, "truncate requires at least one of :chars or :words"
  end

  # Compare against nil explicitly: a plain truthiness test would let +false+
  # skip type validation and be forwarded to the native side.
  unless chars.nil? || chars.is_a?(Integer)
    raise ArgumentError, ":chars must be an Integer, got #{chars.class}"
  end
  unless words.nil? || words.is_a?(Integer)
    raise ArgumentError, ":words must be an Integer, got #{words.class}"
  end

  # Budgets must be positive, matching normalize_window_params. Checking this
  # before the marker comparison also keeps a negative :chars from being
  # reported as a marker-length problem.
  if chars && chars <= 0
    raise ArgumentError, ":chars must be positive, got #{chars}"
  end
  if words && words <= 0
    raise ArgumentError, ":words must be positive, got #{words}"
  end

  unless %i[block word].include?(at)
    raise ArgumentError, ":at must be :block or :word, got #{at.inspect}"
  end
  unless marker.nil? || marker.is_a?(String)
    raise ArgumentError, ":marker must be a String or nil, got #{marker.class}"
  end
  # The marker must leave room for at least one character of content.
  if marker && chars && marker.length >= chars
    raise ArgumentError, ":marker (#{marker.length} chars) must be shorter than :chars budget (#{chars})"
  end

  # :at is stringified for the native side.
  {chars: chars, words: words, at: at.to_s, marker: marker}
end
|
|
294
|
+
|
|
295
|
+
# Validate sliding-window chunking params. Keeps {.chunks_by_size}
|
|
296
|
+
# tight by raising on obvious misconfiguration rather than silent
|
|
297
|
+
# clamping — invalid overlap or missing budget is almost always a
|
|
298
|
+
# swapped-arg bug.
|
|
299
|
+
#
|
|
300
|
+
# @api private
|
|
301
|
+
def normalize_window_params(chars:, words:, overlap:, at:)
  if chars.nil? && words.nil?
    raise ArgumentError, "chunks_by_size requires at least one of :chars or :words"
  end

  # Compare against nil explicitly: a plain truthiness test would let +false+
  # skip every check below and be forwarded to the native side.
  unless chars.nil? || chars.is_a?(Integer)
    raise ArgumentError, ":chars must be an Integer, got #{chars.class}"
  end
  unless words.nil? || words.is_a?(Integer)
    raise ArgumentError, ":words must be an Integer, got #{words.class}"
  end

  if chars && chars <= 0
    raise ArgumentError, ":chars must be positive, got #{chars}"
  end
  if words && words <= 0
    raise ArgumentError, ":words must be positive, got #{words}"
  end

  unless overlap.is_a?(Integer)
    raise ArgumentError, ":overlap must be an Integer, got #{overlap.class}"
  end
  if overlap < 0
    raise ArgumentError, ":overlap must be non-negative, got #{overlap}"
  end
  # Overlap is only bounded against the character budget; with a words-only
  # budget no upper bound is enforced here.
  if chars && overlap >= chars
    raise ArgumentError, ":overlap (#{overlap}) must be less than :chars budget (#{chars})"
  end

  unless %i[block word].include?(at)
    raise ArgumentError, ":at must be :block or :word, got #{at.inspect}"
  end

  # :at is stringified for the native side.
  {chars: chars, words: words, overlap: overlap, at: at.to_s}
end
|
|
332
|
+
|
|
333
|
+
# Return the CSS stylesheet for syntax-highlighted code blocks.
|
|
334
|
+
# Pair this with +syntax_highlight: true+ in the rendering options.
|
|
335
|
+
#
|
|
336
|
+
# @param theme [String, nil] syntect theme name; defaults to
|
|
337
|
+
# "base16-ocean.dark". Call {highlight_themes} for available names.
|
|
338
|
+
# @return [String] CSS text suitable for a +<style>+ tag or +.css+ file
|
|
339
|
+
# @raise [ArgumentError] if the theme name is not recognized
|
|
340
|
+
# @example
|
|
341
|
+
# Inkmark.highlight_css
|
|
342
|
+
# Inkmark.highlight_css(theme: "InspiredGitHub")
|
|
343
|
+
def highlight_css(theme: nil)
  # The theme name is handed to the native side untouched, nil included.
  requested_theme = theme
  _syntax_css(requested_theme)
end
|
|
346
|
+
|
|
347
|
+
# Return an array of available syntax-highlighting theme names.
|
|
348
|
+
# Memoized—the theme list is fixed at compile time.
|
|
349
|
+
#
|
|
350
|
+
# @return [Array<String>]
|
|
351
|
+
def highlight_themes
  # Serve the memoized list when one has already been built.
  return @highlight_themes if @highlight_themes

  @highlight_themes = _syntax_themes.freeze
end
|
|
354
|
+
|
|
355
|
+
# The class-level default options used when no per-instance options are given.
|
|
356
|
+
#
|
|
357
|
+
# @return [Inkmark::Options]
|
|
358
|
+
def default_options
  # Created lazily on first access and kept for the life of the class.
  return @default_options if @default_options

  @default_options = Inkmark::Options.new
end
|
|
361
|
+
|
|
362
|
+
# Replace the class-level default options.
|
|
363
|
+
#
|
|
364
|
+
# @param value [Hash, Inkmark::Options] new defaults; a Hash is converted to
|
|
365
|
+
# Inkmark::Options; an Inkmark::Options is duped
|
|
366
|
+
# @return [Inkmark::Options] the stored options object
|
|
367
|
+
# @raise [TypeError] if +value+ is not a Hash or Inkmark::Options
|
|
368
|
+
def default_options=(value)
  replacement =
    case value
    when Inkmark::Options
      # Store a copy so later mutation of the caller's object is not shared.
      value.dup
    when Hash
      Inkmark::Options.new(value)
    else
      raise TypeError, "default_options must be a Hash or Inkmark::Options, got #{value.class}"
    end
  @default_options = replacement
end
|
|
376
|
+
|
|
377
|
+
private
|
|
378
|
+
|
|
379
|
+
# Resolve +options+ to a frozen flat Rust-facing hash for the
|
|
380
|
+
# read-only FFI paths (to_html, to_markdown, to_plain_text,
|
|
381
|
+
# truncate_markdown). When no options are supplied and no class-
|
|
382
|
+
# level default_options has been set, return nil so the Rust side
|
|
383
|
+
# skips hash-key lookups entirely and uses its hardcoded defaults—
|
|
384
|
+
# the absolute fast path for one-shot renders.
|
|
385
|
+
def resolve_frozen_options(options)
  # No per-call options and no class-level defaults ever created: return nil
  # instead of building a hash.
  untouched_defaults = options.nil? && @default_options.nil?
  return nil if untouched_defaults

  case options
  when nil
    default_options.to_native_hash_frozen
  when Inkmark::Options
    options.to_native_hash_frozen
  when Hash
    Inkmark::Options.native_hash_from(options)
  else
    raise TypeError, "options must be a Hash or Inkmark::Options, got #{options.class}"
  end
end
|
|
394
|
+
|
|
395
|
+
# Resolve +options+ to a mutable flat hash for FFI paths that
|
|
396
|
+
# splice in per-call params ({chunks_by_heading}'s +:truncate+,
|
|
397
|
+
# {chunks_by_size}'s +:__window+). Always builds or dups a hash—
|
|
398
|
+
# the nil fast path doesn't apply because the caller will mutate
|
|
399
|
+
# the result.
|
|
400
|
+
def resolve_mutable_options(options)
  resolved =
    case options
    when nil
      default_options.to_native_hash_frozen
    when Inkmark::Options
      options.to_native_hash_frozen
    when Hash
      Inkmark::Options.native_hash_from(options)
    else
      raise TypeError, "options must be a Hash or Inkmark::Options, got #{options.class}"
    end
  # Callers add per-call keys, so always hand back a copy.
  resolved.dup
end
|
|
408
|
+
end
|
|
409
|
+
|
|
410
|
+
# Create a new renderer for +source+.
|
|
411
|
+
#
|
|
412
|
+
# @param source [String, nil] markdown source; +nil+ is treated as an
|
|
413
|
+
# empty string
|
|
414
|
+
# @param options [Hash, Inkmark::Options, nil] rendering options; falls back
|
|
415
|
+
# to a dup of {Inkmark.default_options} when nil
|
|
416
|
+
# @raise [TypeError] if +options+ is not a Hash, Inkmark::Options, or nil
|
|
417
|
+
def initialize(source = nil, options: nil)
  # Starts as nil (not an empty Hash); #on creates the Hash on first
  # registration and #to_html branches on whether it is set.
  @handlers = nil
  self.source = source
  self.options = options
end
|
|
422
|
+
|
|
423
|
+
# @!attribute [r] source
|
|
424
|
+
# The markdown source string that will be rendered. Always a String
|
|
425
|
+
# (never nil); a nil assignment is stored as an empty string.
|
|
426
|
+
# @return [String]
|
|
427
|
+
#
|
|
428
|
+
# @!attribute [r] options
|
|
429
|
+
# The rendering options for this instance.
|
|
430
|
+
# @return [Inkmark::Options]
|
|
431
|
+
attr_reader :source, :options
|
|
432
|
+
|
|
433
|
+
# Coerce the renderer to a String by returning the stored source.
|
|
434
|
+
# Mirrors the wrapper idiom used by +Pathname+, +URI+, etc.: the
|
|
435
|
+
# stringified form of the wrapper is its carried value. Explicit
|
|
436
|
+
# renderings (HTML, Markdown, plain text) are available via
|
|
437
|
+
# {#to_html}, {#to_markdown}, {#to_plain_text}, and
|
|
438
|
+
# {#chunks_by_heading}.
|
|
439
|
+
#
|
|
440
|
+
# @return [String] the stored source, unchanged
|
|
441
|
+
def to_s
  # Stringifying the wrapper yields the markdown it carries, never rendered output.
  @source
end
|
|
444
|
+
|
|
445
|
+
# Set the markdown source.
|
|
446
|
+
#
|
|
447
|
+
# @param value [String, nil] markdown text; nil and non-Strings are coerced
|
|
448
|
+
# via +#to_s+
|
|
449
|
+
# @return [String] the stored source
|
|
450
|
+
def source=(value)
  @source = value.to_s

  # Data collected by an earlier render belongs to the previous source, so
  # drop it. Without this, #toc, #statistics, #extracts and #frontmatter keep
  # returning results for the old document.
  @toc_value = @statistics_data = @extracts_data = @frontmatter_raw = nil
  # #frontmatter memoizes via defined?(@frontmatter), so the variable has to
  # be removed rather than set to nil for it to recompute.
  remove_instance_variable(:@frontmatter) if instance_variable_defined?(:@frontmatter)

  @source
end
|
|
453
|
+
|
|
454
|
+
# Set rendering options.
|
|
455
|
+
#
|
|
456
|
+
# @param value [Hash, Inkmark::Options, nil] new options; nil resets to a dup
|
|
457
|
+
# of {Inkmark.default_options}
|
|
458
|
+
# @return [Inkmark::Options] the stored options object
|
|
459
|
+
# @raise [TypeError] if +value+ is not a Hash, Inkmark::Options, or nil
|
|
460
|
+
def options=(value)
  @options =
    case value
    when nil then Inkmark.default_options.dup
    when Inkmark::Options then value.dup
    when Hash then Inkmark::Options.new(value)
    else raise TypeError, "options must be a Hash or Inkmark::Options, got #{value.class}"
    end

  # Data collected by an earlier render was produced under the previous
  # options, so drop it. This runs only after the new options were accepted;
  # a TypeError above leaves the existing state untouched.
  @toc_value = @statistics_data = @extracts_data = @frontmatter_raw = nil
  # #frontmatter memoizes via defined?(@frontmatter), so the variable has to
  # be removed rather than set to nil for it to recompute.
  remove_instance_variable(:@frontmatter) if instance_variable_defined?(:@frontmatter)

  @options
end
|
|
469
|
+
|
|
470
|
+
# Register a handler block for a document element kind.
|
|
471
|
+
#
|
|
472
|
+
# The block receives an {Inkmark::Event} object when an element of +kind+ is
|
|
473
|
+
# encountered. Handlers fire post-order—children before parents—so
|
|
474
|
+
# container elements (tables, blockquotes, lists) see their children
|
|
475
|
+
# populated when the handler runs.
|
|
476
|
+
#
|
|
477
|
+
# Multiple handlers for the same kind are supported and fire in
|
|
478
|
+
# registration order. Returns +self+ for chaining.
|
|
479
|
+
#
|
|
480
|
+
# Trigger handlers by calling {#to_html} (render + transform) or
|
|
481
|
+
# {#walk} (analysis only, no HTML output).
|
|
482
|
+
#
|
|
483
|
+
# @param kind [Symbol] element kind—e.g. +:heading+, +:image+, +:link+
|
|
484
|
+
# @yieldparam event [Inkmark::Event]
|
|
485
|
+
# @return [self]
|
|
486
|
+
# @example Rewrite image sources to a CDN
|
|
487
|
+
# md.on(:image) { |img| img.dest = cdn(img.dest) }
|
|
488
|
+
# @example Replace mermaid code blocks
|
|
489
|
+
# md.on(:code_block) { |c| c.html = Mermaid.render(c.source) if c.lang == "mermaid" }
|
|
490
|
+
def on(kind, &block)
  # Without a block, nil would be stored as a handler (nil is not callable)
  # and #to_html would still switch to its handler path, so fail at
  # registration time instead.
  raise ArgumentError, "on(#{kind.inspect}) requires a block" unless block

  key = kind.to_sym
  @handlers ||= {}
  # Appending keeps multiple handlers for one kind in registration order.
  (@handlers[key] ||= []) << block
  self
end
|
|
495
|
+
|
|
496
|
+
# Walk the document, firing all registered handlers, without producing
|
|
497
|
+
# HTML output. Use this for analysis—collecting headings, extracting
|
|
498
|
+
# links, building a TOC—when you don't need to render.
|
|
499
|
+
#
|
|
500
|
+
# Returns +self+.
|
|
501
|
+
#
|
|
502
|
+
# @return [self]
|
|
503
|
+
# @example Collect all links
|
|
504
|
+
# links = []
|
|
505
|
+
# md.on(:link) { |l| links << { href: l.dest, text: l.text } }
|
|
506
|
+
# md.walk
|
|
507
|
+
def walk
  unless @source.empty?
    # An instance with no registered handlers passes an empty Hash.
    registered = @handlers || {}
    Inkmark._native_walk(@source, @options.to_native_hash_frozen, registered)
  end
  self
end
|
|
512
|
+
|
|
513
|
+
# Render the stored source to HTML using the stored options.
|
|
514
|
+
#
|
|
515
|
+
# When +statistics: true+ or +toc: true+ is set, the render uses a
|
|
516
|
+
# single-pass entry point that also collects stats and TOC data as
|
|
517
|
+
# side-effects (stored in instance variables from the native result). Call
|
|
518
|
+
# {#statistics} or {#toc} after +to_html+ to read the collected data.
|
|
519
|
+
#
|
|
520
|
+
# @return [String] rendered HTML, or an empty string when source is empty
|
|
521
|
+
def to_html
  return "" if @source.empty?

  # Registered handlers take priority over every other render path; this
  # path does not populate the TOC/statistics/extracts/frontmatter data.
  if @handlers
    return Inkmark._native_render_with_handlers(@source, @options.to_native_hash_frozen, @handlers)
  end

  needs_side_data = @options[:statistics] || @options[:toc] || @options[:frontmatter] || extract_requested?
  return Inkmark._native_to_html(@source, @options.to_native_hash_frozen) unless needs_side_data

  # Full render: the result Hash carries the HTML plus the collected side data.
  rendered = Inkmark._native_render_full(@source, @options.to_native_hash_frozen)
  toc_markdown = rendered[:toc]
  toc_html = rendered[:toc_html]
  @toc_value =
    if toc_markdown || toc_html
      Inkmark::Toc.new(markdown: toc_markdown || "", html: toc_html || "")
    end
  @statistics_data = rendered[:statistics]
  @extracts_data = rendered[:extracts]
  @frontmatter_raw = rendered[:frontmatter]
  rendered[:html]
end
|
|
538
|
+
|
|
539
|
+
# Apply the filter pipeline and serialize back to Markdown text.
|
|
540
|
+
#
|
|
541
|
+
# Runs the same event-level filters as {#to_html} (controlled by the same
|
|
542
|
+
# options object), then serializes the event stream to Markdown. Useful as a
|
|
543
|
+
# preprocessing step in LLM or multi-renderer pipelines.
|
|
544
|
+
#
|
|
545
|
+
# HTML-emitting filters (+syntax_highlight+, +images: { lazy: true }+,
|
|
546
|
+
# +links: { nofollow: true }+) embed raw HTML in the output when enabled—see
|
|
547
|
+
# the "Markdown-to-Markdown pipeline" section in the README for guidance on
|
|
548
|
+
# which filters to enable.
|
|
549
|
+
#
|
|
550
|
+
# @return [String] filtered Markdown, or an empty string when source is empty
|
|
551
|
+
def to_markdown
  # An empty source never reaches the native serializer.
  @source.empty? ? "" : Inkmark._native_to_markdown(@source, @options.to_native_hash_frozen)
end
|
|
555
|
+
|
|
556
|
+
# Serialize the parsed document to plain text. Runs the same event-
|
|
557
|
+
# level filters as {#to_html} (controlled by the same options object).
|
|
558
|
+
# See {.to_plain_text} for output format details.
|
|
559
|
+
#
|
|
560
|
+
# @return [String] plain-text output, or an empty string when source is empty
|
|
561
|
+
def to_plain_text
  # An empty source never reaches the native serializer.
  @source.empty? ? "" : Inkmark._native_to_plain_text(@source, @options.to_native_hash_frozen)
end
|
|
565
|
+
|
|
566
|
+
# Chunk the document by heading into an Array of section Hashes, with
|
|
567
|
+
# filter-applied Markdown content. See {.chunks_by_heading} for the
|
|
568
|
+
# output shape.
|
|
569
|
+
#
|
|
570
|
+
# @param truncate [Hash, nil] optional per-section truncation spec;
|
|
571
|
+
# same shape as kwargs to {#truncate_markdown} (+:chars+, +:words+,
|
|
572
|
+
# +:at+, +:marker+). Applied to every section's +:content+; counts
|
|
573
|
+
# (if +statistics: true+) are recomputed on the truncated content.
|
|
574
|
+
# @return [Array<Hash>] section records
|
|
575
|
+
def chunks_by_heading(truncate: nil)
  return [] if @source.empty?

  # Dup the frozen native hash so the per-call :truncate key can be added.
  native_opts = @options.to_native_hash_frozen.dup
  if truncate
    native_opts[:truncate] = Inkmark.normalize_truncate_params(truncate)
  end
  Inkmark._native_chunks_by_heading(@source, native_opts)
end
|
|
581
|
+
|
|
582
|
+
# Split the stored document into sliding-window chunks. See
|
|
583
|
+
# {.chunks_by_size} for the full parameter contract.
|
|
584
|
+
#
|
|
585
|
+
# @return [Array<Hash>] each +{index:, content:}+, with counts
|
|
586
|
+
# when +statistics: true+
|
|
587
|
+
def chunks_by_size(chars: nil, words: nil, overlap: 0, at: :block)
  return [] if @source.empty?

  # Dup the frozen native hash so the per-call :__window key can be added.
  native_opts = @options.to_native_hash_frozen.dup
  window = Inkmark.normalize_window_params(chars: chars, words: words, overlap: overlap, at: at)
  native_opts[:__window] = window
  Inkmark._native_chunks_by_size(@source, native_opts)
end
|
|
595
|
+
|
|
596
|
+
# Truncate the stored document. See {.truncate_markdown} for the full
|
|
597
|
+
# parameter contract.
|
|
598
|
+
#
|
|
599
|
+
# @return [String] truncated Markdown, or the source unchanged when
|
|
600
|
+
# it already fits
|
|
601
|
+
def truncate_markdown(chars: nil, words: nil, at: :block, marker: "…")
  return "" if @source.empty?

  # Budget validation is shared with the class-level entry point.
  budget = Inkmark.normalize_truncate_params(chars: chars, words: words, at: at, marker: marker)
  native_opts = @options.to_native_hash_frozen
  Inkmark._native_truncate_markdown(@source, budget, native_opts)
end
|
|
608
|
+
|
|
609
|
+
# Return the table of contents as an {Inkmark::Toc} value object,
|
|
610
|
+
# exposing +#to_markdown+ / +#to_html+ / +#to_s+ (markdown). Returns
|
|
611
|
+
# +nil+ when no TOC was requested (neither +toc+, +statistics+, nor
|
|
612
|
+
# +extract: { headings: true }+ is set).
|
|
613
|
+
#
|
|
614
|
+
# Collected during {#to_html} as a side-effect of the single-pass
|
|
615
|
+
# render. If +to_html+ hasn't been called yet, calling this triggers
|
|
616
|
+
# it.
|
|
617
|
+
#
|
|
618
|
+
# @return [Inkmark::Toc, nil]
|
|
619
|
+
# @example
|
|
620
|
+
# g.toc.to_markdown # "- [Intro](#intro)\n..."
|
|
621
|
+
# g.toc.to_html # "<ul><li>..."
|
|
622
|
+
# puts g.toc # prints markdown form (via to_s)
|
|
623
|
+
def toc
  return nil unless toc_surface_requested?

  # Render only when no TOC value has been collected yet.
  collected = defined?(@toc_value) && @toc_value
  to_html unless collected
  @toc_value
end
|
|
628
|
+
|
|
629
|
+
# Return the collected document statistics as a Hash, or +nil+ when
|
|
630
|
+
# neither +statistics+ nor +toc+ is enabled.
|
|
631
|
+
#
|
|
632
|
+
# When +statistics: true+, the full hash includes language detection,
|
|
633
|
+
# character/word counts, code block count, and image/link arrays.
|
|
634
|
+
# When only +toc: true+, a lightweight hash with +heading_count+ is
|
|
635
|
+
# returned.
|
|
636
|
+
#
|
|
637
|
+
# Collected during {#to_html}. Calling this before +to_html+ triggers
|
|
638
|
+
# the render.
|
|
639
|
+
#
|
|
640
|
+
# @return [Hash, nil]
|
|
641
|
+
def statistics
  wanted = @options[:statistics] || @options[:toc]
  return nil unless wanted

  # Render only when nothing has been collected yet.
  @statistics_data || to_html
  @statistics_data
end
|
|
646
|
+
|
|
647
|
+
# Return structured extracts for the element kinds requested via
|
|
648
|
+
# +extract: { ... }+, or +nil+ when no kinds were requested.
|
|
649
|
+
#
|
|
650
|
+
# The returned Hash is keyed by the same symbols you passed in
|
|
651
|
+
# (+:images+, +:links+, +:code_blocks+, +:headings+,
|
|
652
|
+
# +:footnote_definitions+); each value is an Array of record Hashes
|
|
653
|
+
# including a +:byte_range+ Range for slicing the original source.
|
|
654
|
+
#
|
|
655
|
+
# +toc: true+ auto-enables +extract[:headings]+—the heading walk is
|
|
656
|
+
# shared, so you get the structured view for free.
|
|
657
|
+
#
|
|
658
|
+
# Collected during {#to_html} as a side-effect of the single-pass
|
|
659
|
+
# render. Calling this before +to_html+ triggers the render.
|
|
660
|
+
#
|
|
661
|
+
# @return [Hash, nil]
|
|
662
|
+
# @example
|
|
663
|
+
# md = Inkmark.new(source, options: { extract: { images: true } })
|
|
664
|
+
# md.extracts[:images]
|
|
665
|
+
# #=> [{ src: "cat.png", alt: "cat", title: "", byte_range: 12...28 }]
|
|
666
|
+
def extracts
  return nil unless extract_requested?

  # Render only when nothing has been collected yet.
  @extracts_data || to_html
  @extracts_data
end
|
|
671
|
+
|
|
672
|
+
# Return the parsed frontmatter as a Hash, or +nil+ when the document
|
|
673
|
+
# has no frontmatter block or the +frontmatter+ option is not enabled.
|
|
674
|
+
#
|
|
675
|
+
# The raw YAML text is extracted by Rust during the event walk;
|
|
676
|
+
# parsing uses Ruby's stdlib +YAML.safe_load+ so all standard YAML
|
|
677
|
+
# types (strings, numbers, arrays, nested hashes) are supported.
|
|
678
|
+
#
|
|
679
|
+
# @return [Hash, nil] parsed frontmatter or nil
|
|
680
|
+
# @example
|
|
681
|
+
# md = Inkmark.new("---\ntitle: Hello\n---\n\n# Content",
|
|
682
|
+
# options: { frontmatter: true })
|
|
683
|
+
# md.frontmatter #=> { "title" => "Hello" }
|
|
684
|
+
def frontmatter
  # Memoized on definedness, not truthiness, so a nil result is cached too.
  return @frontmatter if defined?(@frontmatter)
  return @frontmatter = nil unless @options[:frontmatter]

  # The raw YAML text is collected by #to_html; render once if it is missing.
  to_html unless @frontmatter_raw
  @frontmatter =
    if @frontmatter_raw
      # safe_load rejects Date/Time by default, so an unquoted
      # `date: 2024-01-15` would raise Psych::DisallowedClass. Permit these
      # two value types; everything else stays restricted.
      YAML.safe_load(@frontmatter_raw, permitted_classes: [Date, Time])
    end
end
|
|
690
|
+
|
|
691
|
+
private
|
|
692
|
+
|
|
693
|
+
# True when any request triggers the TOC walk—`toc: true`,
|
|
694
|
+
# `statistics: true`, or `extract: { headings: true }`. Used by
|
|
695
|
+
# {#toc} to decide whether to surface its
|
|
696
|
+
# computed value to the caller.
|
|
697
|
+
def toc_surface_requested?
  if @options[:toc] || @options[:statistics]
    true
  else
    # Only a literal +true+ under extract[:headings] counts here.
    extract_cfg = @options[:extract]
    extract_cfg.is_a?(Hash) && extract_cfg[:headings] == true
  end
end
|
|
702
|
+
|
|
703
|
+
# True when the user explicitly asked for any extract kind, OR when
|
|
704
|
+
# `toc: true` implicitly pulls headings into extracts. Matches the
|
|
705
|
+
# mutual trigger implemented on the Rust side.
|
|
706
|
+
def extract_requested?
  if @options[:toc]
    true
  else
    # Any truthy value under any extract key counts as a request.
    extract_cfg = @options[:extract]
    extract_cfg.is_a?(Hash) && extract_cfg.each_value.any?
  end
end
|
|
711
|
+
end
|