inkmark 0.1.0-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/inkmark.rb ADDED
@@ -0,0 +1,711 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+
5
+ # Inkmark is a very fast, feature-rich, AI-first CommonMark/GFM
6
+ # markdown renderer backed by the Rust pulldown-cmark parser.
7
+ #
8
+ # Default behavior: GFM extensions (tables, strikethrough, tasklists,
9
+ # footnotes) are enabled; raw HTML is suppressed. Override via options.
10
+ #
11
+ # ### Presets
12
+ #
13
+ # Four named bundles of options cover the common profiles:
14
+ #
15
+ # - +:gfm+ (the default): CommonMark + core GFM only.
16
+ # - +:commonmark+: strict CommonMark, no GFM.
17
+ # - +:recommended+: opinionated bundle for modern web content (smart
18
+ # punctuation, auto heading IDs, lazy images, autolinks + nofollow,
19
+ # URL scheme allowlists, emoji shortcodes, syntax highlighting,
20
+ # frontmatter).
21
+ # - +:trusted+: +:recommended+ plus raw-HTML pass-through. **Use only
22
+ # for fully trusted content.**
23
+ #
24
+ # See {Inkmark::Options::PRESETS}.
25
+ #
26
+ # ### Raw HTML safety
27
+ #
28
+ # Raw HTML is suppressed by default; every +<tag>+ in the source is
29
+ # escaped to text. Enable pass-through with +raw_html: true+ or the
30
+ # +:trusted+ preset **only for trusted input**. Inkmark does not
31
+ # sanitize raw HTML beyond the narrow GFM tagfilter; sanitize before rendering
32
+ # user-influenced content.
33
+ #
34
+ # @example Class-method shortcut
35
+ # Inkmark.to_html("**hello**")
36
+ # #=> "<p><strong>hello</strong></p>\n"
37
+ #
38
+ # @example Instance form with options
39
+ # g = Inkmark.new("# hi", options: { tables: false })
40
+ # g.to_html
41
+ #
42
+ # @example Mutable options after construction
43
+ # g = Inkmark.new("# hi")
44
+ # g.options.tables = false
45
+ # g.to_html
46
+ #
47
+ # @example Recommended profile
48
+ # Inkmark.to_html(md, options: { preset: :recommended })
49
class Inkmark
  # Root of the Inkmark exception hierarchy. Rescue this class to catch
  # Inkmark-specific runtime failures without swallowing unrelated errors.
  class Error < StandardError
  end
end
53
+
54
+ require_relative "inkmark/version"
55
+ require_relative "inkmark/options"
56
+ require_relative "inkmark/event"
57
+ require_relative "inkmark/toc"
58
+ require_relative "inkmark/native"
59
+
60
+ class Inkmark
61
+ class << self
62
+ # Render +source+ markdown to HTML in one call.
63
+ #
64
+ # This is a class-method fast path that skips Inkmark instance and
65
+ # Options copy allocation for the common one-shot render pattern.
66
+ # When the caller passes +options: nil+ (the default), we reuse the
67
+ # cached frozen hash that {Inkmark::Options#to_native_hash_frozen}
68
+ # returns; the cache lives on the Options instance itself and is
69
+ # invalidated by the Options mutation methods, so
70
+ # +Inkmark.default_options.tables = false+ followed by
71
+ # +Inkmark.to_html(src)+ picks up the new value without stale-cache
72
+ # bugs.
73
+ #
74
+ # **Raw HTML safety.** +raw_html: false+ (the default) escapes
75
+ # every raw HTML tag in the source—safe for untrusted input.
76
+ # Enable +raw_html: true+ (or +preset: :trusted+) only for
77
+ # content you fully trust, and run the output through a dedicated
78
+ # HTML sanitizer before displaying it.
79
+ #
80
+ # @param source [String, nil] the markdown source to render
81
+ # @param options [Hash, Inkmark::Options, nil] rendering options; merged
82
+ # over {default_options} when a Hash is supplied. Accepts
83
+ # +preset: :name+ (see {Inkmark::Options::PRESETS}).
84
+ # @return [String] the rendered HTML
85
+ # @raise [TypeError] if +options+ is not a Hash, Inkmark::Options, or nil
86
+ # @example
87
+ # Inkmark.to_html("**bold**") #=> "<p><strong>bold</strong></p>\n"
88
+ # @example With a preset
89
+ # Inkmark.to_html(md, options: { preset: :recommended })
90
def to_html(source, options: nil)
  # nil and other non-String inputs are coerced with #to_s.
  markdown = source.to_s
  # Empty input is answered here and never reaches the native renderer.
  if markdown.empty?
    ""
  else
    _native_to_html(markdown, resolve_frozen_options(options))
  end
end
95
+
96
+ # Render +source+ markdown through the filter pipeline and serialize back
97
+ # to Markdown text.
98
+ #
99
+ # The same event-level filters as {to_html} are applied (emoji expansion,
100
+ # allowlists, autolink, etc.), then the event stream is serialized back to
101
+ # Markdown using pulldown-cmark-to-cmark. Use this as a preprocessing step
102
+ # in pipelines that consume Markdown: LLM prompts, secondary renderers,
103
+ # content storage.
104
+ #
105
+ # HTML-emitting filters (+syntax_highlight+, +images: { lazy: true }+,
106
+ # +links: { nofollow: true }+) embed raw HTML verbatim in the
107
+ # Markdown output when enabled. That is valid CommonMark but may
108
+ # break downstream consumers.
109
+ # See the "Markdown-to-Markdown pipeline" section in the README.
110
+ #
111
+ # @param source [String, nil] the markdown source to process
112
+ # @param options [Hash, Inkmark::Options, nil] rendering options
113
+ # @return [String] the filtered Markdown
114
def to_markdown(source, options: nil)
  # nil and other non-String inputs are coerced with #to_s.
  markdown = source.to_s
  # Nothing to filter or serialize for empty input; skip the native call.
  if markdown.empty?
    ""
  else
    _native_to_markdown(markdown, resolve_frozen_options(options))
  end
end
119
+
120
+ # Chunk +source+ by heading into an Array of section Hashes. Each
121
+ # section's +:content+ is filter-applied Markdown (emoji expanded,
122
+ # autolinks resolved, allowlists applied). Designed for feeding
123
+ # RAG / embedding pipelines that want pre-HTML chunks with clean
124
+ # content.
125
+ #
126
+ # Sections are hierarchical: a +##+ section's +:content+ includes
127
+ # any nested +###+ subsections, which also appear as their own
128
+ # entries. Content before the first heading (if any) is emitted
129
+ # as a preamble entry with +heading: nil+ and +level: 0+.
130
+ #
131
+ # Filter the returned array with plain +Enumerable+—by heading,
132
+ # level, id, or any other field. See the "Section extraction" section
133
+ # in the README for recipes.
134
+ #
135
+ # **HTML-emitting filters** (+syntax_highlight+, +images: { lazy: true }+,
136
+ # +links: { nofollow: true }+) embed raw HTML into +:content+ when
137
+ # enabled. For RAG pipelines you almost always want these off so
138
+ # chunks stay pure Markdown.
139
+ #
140
+ # @param source [String, nil] the markdown source
141
+ # @param options [Hash, Inkmark::Options, nil] rendering options
142
+ # @return [Array<Hash>] section records
143
+ # @example Fetch one section
144
+ # Inkmark.chunks_by_heading(readme).find { |s| s[:heading] == "Installation" }
145
+ # @example Filter by heading pattern
146
+ # Inkmark.chunks_by_heading(readme).select { |s| s[:heading]&.match?(/install/i) }
147
+ # @example RAG chunking
148
+ # Inkmark.chunks_by_heading(readme).each do |s|
149
+ # embed_and_store("#{s[:heading]}\n\n#{s[:content]}") if s[:heading]
150
+ # end
151
def chunks_by_heading(source, options: nil, truncate: nil)
  text = source.to_s
  return [] if text.empty?

  # +truncate+ is an optional per-section truncation spec. When given it
  # is validated by normalize_truncate_params and spliced into a private
  # copy of the resolved options under the :truncate key.
  native_opts = resolve_mutable_options(options).tap do |opts|
    opts[:truncate] = normalize_truncate_params(truncate) if truncate
  end
  _native_chunks_by_heading(text, native_opts)
end
159
+
160
+ # Split +source+ into sliding-window chunks bounded by a character
161
+ # and/or word budget. Adjacent chunks can share trailing context
162
+ # via +overlap+, which preserves continuity for embedding models.
163
+ # Unlike {chunks_by_heading}, this ignores document structure and
164
+ # walks the filter-applied Markdown sequentially — useful for
165
+ # heading-free or heading-uneven documents.
166
+ #
167
+ # @param source [String, nil] the markdown source
168
+ # @param chars [Integer, nil] max characters per chunk
169
+ # @param words [Integer, nil] max Unicode words per chunk; at
170
+ # least one of +chars+/+words+ must be set
171
+ # @param overlap [Integer] chars carried from the end of the
172
+ # previous chunk into the start of the next. Defaults to 0.
173
+ # Must be less than +chars+ when +chars+ is set.
174
+ # @param at [Symbol] +:block+ (valid-Markdown cut, oversized
175
+ # blocks emit as their own chunk) or +:word+ (word-boundary
176
+ # cut, may split open constructs).
177
+ # @param options [Hash, Inkmark::Options, nil] rendering options
178
+ # @return [Array<Hash>] each +{index:, content:}+, plus
179
+ # +:character_count+/+:word_count+ when +statistics: true+
180
+ # @raise [ArgumentError] on invalid parameter combinations
181
+ # @example
182
+ # Inkmark.chunks_by_size(readme, chars: 500, overlap: 50)
183
def chunks_by_size(source, chars: nil, words: nil, overlap: 0, at: :block, options: nil)
  text = source.to_s
  # Empty input returns before any parameter validation takes place.
  return [] if text.empty?

  # Options are resolved first, then the window parameters are validated
  # and attached to the private copy under the :__window key.
  native_opts = resolve_mutable_options(options)
  window = normalize_window_params(chars: chars, words: words, overlap: overlap, at: at)
  native_opts[:__window] = window
  _native_chunks_by_size(text, native_opts)
end
193
+
194
+ # Truncate a Markdown document to fit a char and/or word budget.
195
+ # Returns filter-applied Markdown cut at either the last block
196
+ # boundary that fits (+at: :block+) or the last Unicode word
197
+ # boundary that fits (+at: :word+).
198
+ #
199
+ # Designed as a preprocessing step for LLM context-window budgeting
200
+ # and RAG chunk normalization. The marker (default +"…"+) is
201
+ # appended only when truncation actually occurred and counts toward
202
+ # the budget, so +chars: 4000+ always yields output ≤ 4000
203
+ # codepoints.
204
+ #
205
+ # @param source [String, nil] the markdown source
206
+ # @param chars [Integer, nil] maximum codepoint count; at least
207
+ # one of +chars+/+words+ must be set
208
+ # @param words [Integer, nil] maximum Unicode word count
209
+ # @param at [Symbol] +:block+ (valid-Markdown cut) or +:word+
210
+ # (word-boundary cut; may split open constructs)
211
+ # @param marker [String, nil] appended when truncation occurs.
212
+ # Pass +nil+ to suppress. Defaults to +"…"+ (U+2026).
213
+ # @param options [Hash, Inkmark::Options, nil] rendering options
214
+ # @return [String] truncated Markdown, or the source unchanged
215
+ # when it already fits
216
+ # @raise [ArgumentError] if neither chars nor words is set,
217
+ # +at+ is not +:block+/+:word+, or the marker exceeds the budget
218
def truncate_markdown(source, chars: nil, words: nil, at: :block, marker: "…", options: nil)
  text = source.to_s
  # Empty input returns before any parameter validation takes place.
  return "" if text.empty?

  # The budget is validated before the rendering options are resolved.
  budget = normalize_truncate_params(chars: chars, words: words, at: at, marker: marker)
  _native_truncate_markdown(text, budget, resolve_frozen_options(options))
end
227
+
228
+ # Render +source+ through the filter pipeline and serialize to plain
229
+ # text. Markdown syntax (emphasis, headings, list bullets, fences)
230
+ # is stripped; inline content is preserved. Links become
231
+ # +"text (url)"+; images become +"alt (src)"+; tables are
232
+ # tab-separated; code blocks keep their raw body.
233
+ #
234
+ # Designed as a preprocessor for embedding models, token counting,
235
+ # LLM input, and any downstream consumer that treats Markdown
236
+ # syntax as noise.
237
+ #
238
+ # @param source [String, nil] the markdown source
239
+ # @param options [Hash, Inkmark::Options, nil] rendering options
240
+ # @return [String] plain-text output
241
def to_plain_text(source, options: nil)
  # nil and other non-String inputs are coerced with #to_s.
  markdown = source.to_s
  # Empty input is answered here and never reaches the native side.
  if markdown.empty?
    ""
  else
    _native_to_plain_text(markdown, resolve_frozen_options(options))
  end
end
246
+
247
+ # Normalize and validate truncation params coming from either the
248
+ # {.truncate_markdown} kwargs or the {.chunks_by_heading}
249
+ # +truncate:+ kwarg. Accepts a Hash with +:chars+/+:words+/+:at+/
250
+ # +:marker+ keys, or positional kwargs (collected by the caller
251
+ # into a Hash). Returns a Hash ready to hand to the native side.
252
+ #
253
+ # @api private
254
def normalize_truncate_params(params)
  # Validates a truncation spec and returns the flat Hash handed to the
  # native side: {chars:, words:, at: "block"|"word", marker:}.
  #
  # Raises TypeError when +params+ is not Hash-like, and ArgumentError for
  # unknown keys, a missing budget, non-Integer or non-positive budgets, a
  # bad +:at+, a non-String marker, or a marker that does not fit the
  # +:chars+ budget.

  # Accept anything that can convert itself to a Hash.
  params = params.to_hash if params.respond_to?(:to_hash)
  raise TypeError, "truncate must be a Hash, got #{params.class}" unless params.is_a?(Hash)

  unknown = params.keys - %i[chars words at marker]
  unless unknown.empty?
    raise ArgumentError, "unknown truncate key(s): #{unknown.inspect}; " \
      "expected :chars, :words, :at, :marker"
  end

  chars = params[:chars]
  words = params[:words]
  at = params.fetch(:at, :block)
  marker = params.fetch(:marker, "…")

  if chars.nil? && words.nil?
    raise ArgumentError, "truncate requires at least one of :chars or :words"
  end
  if chars && !chars.is_a?(Integer)
    raise ArgumentError, ":chars must be an Integer, got #{chars.class}"
  end
  if words && !words.is_a?(Integer)
    raise ArgumentError, ":words must be an Integer, got #{words.class}"
  end
  # Same positivity rule as normalize_window_params: a zero or negative
  # budget is rejected here with a direct message instead of being passed
  # on, or reported as a marker-length problem.
  if chars && chars <= 0
    raise ArgumentError, ":chars must be positive, got #{chars}"
  end
  if words && words <= 0
    raise ArgumentError, ":words must be positive, got #{words}"
  end
  unless %i[block word].include?(at)
    raise ArgumentError, ":at must be :block or :word, got #{at.inspect}"
  end
  unless marker.nil? || marker.is_a?(String)
    raise ArgumentError, ":marker must be a String or nil, got #{marker.class}"
  end
  # The marker must be strictly shorter than the :chars budget.
  if marker && chars && marker.length >= chars
    raise ArgumentError, ":marker (#{marker.length} chars) must be shorter than :chars budget (#{chars})"
  end

  # +at+ is sent to the native side as a String, not a Symbol.
  {chars: chars, words: words, at: at.to_s, marker: marker}
end
294
+
295
+ # Validate sliding-window chunking params. Keeps {.chunks_by_size}
296
+ # tight by raising on obvious misconfiguration rather than silent
297
+ # clamping — invalid overlap or missing budget is almost always a
298
+ # swapped-arg bug.
299
+ #
300
+ # @api private
301
def normalize_window_params(chars:, words:, overlap:, at:)
  # Guards run in a fixed order (budget presence, budget types, budget
  # positivity, overlap, cut mode), so the first problem found is the one
  # that gets reported.
  raise ArgumentError, "chunks_by_size requires at least one of :chars or :words" if chars.nil? && words.nil?
  raise ArgumentError, ":chars must be an Integer, got #{chars.class}" if chars && !chars.is_a?(Integer)
  raise ArgumentError, ":words must be an Integer, got #{words.class}" if words && !words.is_a?(Integer)
  raise ArgumentError, ":chars must be positive, got #{chars}" if chars && chars <= 0
  raise ArgumentError, ":words must be positive, got #{words}" if words && words <= 0
  raise ArgumentError, ":overlap must be an Integer, got #{overlap.class}" unless overlap.is_a?(Integer)
  raise ArgumentError, ":overlap must be non-negative, got #{overlap}" if overlap < 0
  raise ArgumentError, ":overlap (#{overlap}) must be less than :chars budget (#{chars})" if chars && overlap >= chars
  raise ArgumentError, ":at must be :block or :word, got #{at.inspect}" unless [:block, :word].include?(at)

  # +at+ is sent to the native side as a String, not a Symbol.
  {chars: chars, words: words, overlap: overlap, at: at.to_s}
end
332
+
333
+ # Return the CSS stylesheet for syntax-highlighted code blocks.
334
+ # Pair this with +syntax_highlight: true+ in the rendering options.
335
+ #
336
+ # @param theme [String, nil] syntect theme name; defaults to
337
+ # "base16-ocean.dark". Call {highlight_themes} for available names.
338
+ # @return [String] CSS text suitable for a +<style>+ tag or +.css+ file
339
+ # @raise [ArgumentError] if the theme name is not recognized
340
+ # @example
341
+ # Inkmark.highlight_css
342
+ # Inkmark.highlight_css(theme: "InspiredGitHub")
343
def highlight_css(theme: nil)
  # +theme+ is forwarded untouched, nil included; the native helper
  # produces the stylesheet.
  stylesheet = _syntax_css(theme)
  stylesheet
end
346
+
347
+ # Return an array of available syntax-highlighting theme names.
348
+ # Memoized—the theme list is fixed at compile time.
349
+ #
350
+ # @return [Array<String>]
351
def highlight_themes
  # Serve the memoized list when one has already been fetched.
  return @highlight_themes if @highlight_themes

  # First call: fetch the names from the native side and freeze the
  # result before caching it.
  @highlight_themes = _syntax_themes.freeze
end
354
+
355
+ # The class-level default options used when no per-instance options are given.
356
+ #
357
+ # @return [Inkmark::Options]
358
def default_options
  # Built lazily on first access and reused afterwards.
  return @default_options if @default_options

  @default_options = Inkmark::Options.new
end
361
+
362
+ # Replace the class-level default options.
363
+ #
364
+ # @param value [Hash, Inkmark::Options] new defaults; a Hash is converted to
365
+ # Inkmark::Options, an Inkmark::Options is duped
366
+ # @return [Inkmark::Options] the stored options object
367
+ # @raise [TypeError] if +value+ is not a Hash or Inkmark::Options
368
def default_options=(value)
  # An Options instance is copied so later changes to the caller's object
  # do not affect the stored defaults; a Hash is wrapped in a new Options.
  # Any other type is rejected and the previous defaults stay in place.
  @default_options =
    if value.is_a?(Inkmark::Options)
      value.dup
    elsif value.is_a?(Hash)
      Inkmark::Options.new(value)
    else
      raise TypeError, "default_options must be a Hash or Inkmark::Options, got #{value.class}"
    end
end
376
+
377
+ private
378
+
379
+ # Resolve +options+ to a frozen flat Rust-facing hash for the
380
+ # read-only FFI paths (to_html, to_markdown, to_plain_text,
381
+ # truncate_markdown). When no options are supplied and no class-
382
+ # level default_options has been set, return nil so the Rust side
383
+ # skips hash-key lookups entirely and uses its hardcoded defaults—
384
+ # the absolute fast path for one-shot renders.
385
def resolve_frozen_options(options)
  if options.nil?
    # No per-call options: answer nil while no class-level defaults have
    # been created yet, otherwise reuse the defaults' frozen native hash.
    @default_options.nil? ? nil : default_options.to_native_hash_frozen
  elsif options.is_a?(Inkmark::Options)
    options.to_native_hash_frozen
  elsif options.is_a?(Hash)
    Inkmark::Options.native_hash_from(options)
  else
    raise TypeError, "options must be a Hash or Inkmark::Options, got #{options.class}"
  end
end
394
+
395
+ # Resolve +options+ to a mutable flat hash for FFI paths that
396
+ # splice in per-call params ({chunks_by_heading}'s +:truncate+,
397
+ # {chunks_by_size}'s +:__window+). Always builds or dups a hash—
398
+ # the nil fast path doesn't apply because the caller will mutate
399
+ # the result.
400
def resolve_mutable_options(options)
  native =
    if options.nil?
      default_options.to_native_hash_frozen
    elsif options.is_a?(Inkmark::Options)
      options.to_native_hash_frozen
    elsif options.is_a?(Hash)
      Inkmark::Options.native_hash_from(options)
    else
      raise TypeError, "options must be a Hash or Inkmark::Options, got #{options.class}"
    end
  # Callers add per-call keys to the result, so always hand back a copy.
  native.dup
end
408
+ end
409
+
410
+ # Create a new renderer for +source+.
411
+ #
412
+ # @param source [String, nil] markdown source; +nil+ is treated as an
413
+ # empty string
414
+ # @param options [Hash, Inkmark::Options, nil] rendering options; falls back
415
+ # to a dup of {Inkmark.default_options} when nil
416
+ # @raise [TypeError] if +options+ is not a Hash, Inkmark::Options, or nil
417
def initialize(source = nil, options: nil)
  # No handlers exist until #on registers the first one.
  @handlers = nil
  # Both values go through the public writers, so coercion of the source
  # and type validation of the options live in one place.
  self.source = source
  self.options = options
end
422
+
423
+ # @!attribute [r] source
424
+ # The markdown source string that will be rendered. Always a String
425
+ # (never nil); a nil assignment is stored as an empty string.
426
+ # @return [String]
427
+ #
428
+ # @!attribute [r] options
429
+ # The rendering options for this instance.
430
+ # @return [Inkmark::Options]
431
+ attr_reader :source, :options
432
+
433
+ # Coerce the renderer to a String by returning the stored source.
434
+ # Mirrors the wrapper idiom used by +Pathname+, +URI+, etc.: the
435
+ # stringified form of the wrapper is its carried value. Explicit
436
+ # renderings (HTML, Markdown, plain text) are available via
437
+ # {#to_html}, {#to_markdown}, {#to_plain_text}, and
438
+ # {#chunks_by_heading}.
439
+ #
440
+ # @return [String] the stored source, unchanged
441
def to_s
  # The renderer stringifies to the Markdown it carries, not to any
  # rendered form of it.
  @source
end
444
+
445
+ # Set the markdown source.
446
+ #
447
+ # @param value [String, nil] markdown text; nil and non-Strings are coerced
448
+ # via +#to_s+
449
+ # @return [String] the stored source
450
def source=(value)
  # Stores the Markdown source, coercing nil and non-Strings with #to_s,
  # and returns the stored String.
  #
  # Data collected by an earlier render describes the previous source, so
  # it is dropped here; #toc, #statistics, #extracts and #frontmatter then
  # recompute against the new text instead of returning stale values.
  @toc_value = nil
  @statistics_data = nil
  @extracts_data = nil
  @frontmatter_raw = nil
  # #frontmatter memoizes with defined?, so the variable must be removed,
  # not merely set to nil.
  remove_instance_variable(:@frontmatter) if instance_variable_defined?(:@frontmatter)
  @source = value.to_s
end
453
+
454
+ # Set rendering options.
455
+ #
456
+ # @param value [Hash, Inkmark::Options, nil] new options; nil resets to a dup
457
+ # of {Inkmark.default_options}
458
+ # @return [Inkmark::Options] the stored options object
459
+ # @raise [TypeError] if +value+ is not a Hash, Inkmark::Options, or nil
460
def options=(value)
  # Replaces the rendering options and returns the stored Options object.
  # nil resets to a copy of the class-level defaults, an Options instance
  # is copied, and a Hash is wrapped in a new Options. Any other type
  # raises TypeError.
  resolved =
    case value
    when nil then Inkmark.default_options.dup
    when Inkmark::Options then value.dup
    when Hash then Inkmark::Options.new(value)
    else raise TypeError, "options must be a Hash or Inkmark::Options, got #{value.class}"
    end

  # Only reached once the new value has been accepted, so a rejected
  # assignment leaves the object untouched. Data collected under the old
  # options (including a memoized nil frontmatter from when the option was
  # off) is dropped so the accessors recompute under the new options.
  @toc_value = nil
  @statistics_data = nil
  @extracts_data = nil
  @frontmatter_raw = nil
  # #frontmatter memoizes with defined?, so the variable must be removed,
  # not merely set to nil.
  remove_instance_variable(:@frontmatter) if instance_variable_defined?(:@frontmatter)
  @options = resolved
end
469
+
470
+ # Register a handler block for a document element kind.
471
+ #
472
+ # The block receives an {Inkmark::Event} object when an element of +kind+ is
473
+ # encountered. Handlers fire post-order—children before parents—so
474
+ # container elements (tables, blockquotes, lists) see their children
475
+ # populated when the handler runs.
476
+ #
477
+ # Multiple handlers for the same kind are supported and fire in
478
+ # registration order. Returns +self+ for chaining.
479
+ #
480
+ # Trigger handlers by calling {#to_html} (render + transform) or
481
+ # {#walk} (analysis only, no HTML output).
482
+ #
483
+ # @param kind [Symbol] element kind—e.g. +:heading+, +:image+, +:link+
484
+ # @yieldparam event [Inkmark::Event]
485
+ # @return [self]
486
+ # @example Rewrite image sources to a CDN
487
+ # md.on(:image) { |img| img.dest = cdn(img.dest) }
488
+ # @example Replace mermaid code blocks
489
+ # md.on(:code_block) { |c| c.html = Mermaid.render(c.source) if c.lang == "mermaid" }
490
def on(kind, &block)
  # Registers +block+ as a handler for +kind+ and returns self so calls
  # can be chained. +kind+ is normalized with #to_sym, so :heading and
  # "heading" share one handler list. Handlers for a kind are kept in
  # registration order.
  #
  # Raises ArgumentError when no block is given; without this guard a nil
  # entry would be appended to the handler list.
  raise ArgumentError, "Inkmark#on requires a block" unless block

  key = kind.to_sym
  # The handler table is created lazily on the first registration.
  @handlers ||= {}
  (@handlers[key] ||= []) << block
  self
end
495
+
496
+ # Walk the document, firing all registered handlers, without producing
497
+ # HTML output. Use this for analysis—collecting headings, extracting
498
+ # links, building a TOC—when you don't need to render.
499
+ #
500
+ # Returns +self+.
501
+ #
502
+ # @return [self]
503
+ # @example Collect all links
504
+ # links = []
505
+ # md.on(:link) { |l| links << { href: l.dest, text: l.text } }
506
+ # md.walk
507
def walk
  # An empty document has nothing to visit, so the native walk is skipped.
  unless @source.empty?
    native_opts = @options.to_native_hash_frozen
    # With no handlers registered, an empty table is passed instead of nil.
    Inkmark._native_walk(@source, native_opts, @handlers || {})
  end
  self
end
512
+
513
+ # Render the stored source to HTML using the stored options.
514
+ #
515
+ # When +statistics: true+ or +toc: true+ is set, the render uses a
516
+ # single-pass entry point that also collects stats and TOC data as
517
+ # side-effects (set as instance variables by the Rust side). Call
518
+ # {#statistics} or {#toc} after +to_html+ to read the collected data.
519
+ #
520
+ # @return [String] rendered HTML, or an empty string when source is empty
521
def to_html
  return "" if @source.empty?

  # Registered handlers take priority: that path renders through the
  # handler-aware entry point and stores none of the side data below.
  if @handlers
    return Inkmark._native_render_with_handlers(@source, @options.to_native_hash_frozen, @handlers)
  end

  wants_side_data = @options[:statistics] || @options[:toc] || @options[:frontmatter] || extract_requested?
  # Plain render when no statistics, TOC, frontmatter or extracts are wanted.
  return Inkmark._native_to_html(@source, @options.to_native_hash_frozen) unless wants_side_data

  # Full render: returns the HTML together with everything else collected.
  result = Inkmark._native_render_full(@source, @options.to_native_hash_frozen)
  toc_markdown = result[:toc]
  toc_html = result[:toc_html]
  # A Toc is built only when at least one form came back; a missing form
  # is stored as an empty string.
  @toc_value =
    if toc_markdown || toc_html
      Inkmark::Toc.new(markdown: toc_markdown || "", html: toc_html || "")
    end
  @statistics_data = result[:statistics]
  @extracts_data = result[:extracts]
  @frontmatter_raw = result[:frontmatter]
  result[:html]
end
538
+
539
+ # Apply the filter pipeline and serialize back to Markdown text.
540
+ #
541
+ # Runs the same event-level filters as {#to_html} (controlled by the same
542
+ # options object), then serializes the event stream to Markdown. Useful as a
543
+ # preprocessing step in LLM or multi-renderer pipelines.
544
+ #
545
+ # HTML-emitting filters (+syntax_highlight+, +images: { lazy: true }+,
546
+ # +links: { nofollow: true }+) embed raw HTML in the output when enabled—see
547
+ # the "Markdown-to-Markdown pipeline" section in the README for guidance on
548
+ # which filters to enable.
549
+ #
550
+ # @return [String] filtered Markdown, or an empty string when source is empty
551
def to_markdown
  # Empty source short-circuits; otherwise the native serializer runs with
  # this instance's options.
  if @source.empty?
    ""
  else
    Inkmark._native_to_markdown(@source, @options.to_native_hash_frozen)
  end
end
555
+
556
+ # Serialize the parsed document to plain text. Runs the same event-
557
+ # level filters as {#to_html} (controlled by the same options object).
558
+ # See {.to_plain_text} for output format details.
559
+ #
560
+ # @return [String] plain-text output, or an empty string when source is empty
561
def to_plain_text
  # Empty source short-circuits; otherwise the native plain-text
  # serializer runs with this instance's options.
  if @source.empty?
    ""
  else
    Inkmark._native_to_plain_text(@source, @options.to_native_hash_frozen)
  end
end
565
+
566
+ # Chunk the document by heading into an Array of section Hashes, with
567
+ # filter-applied Markdown content. See {.chunks_by_heading} for the
568
+ # output shape.
569
+ #
570
+ # @param truncate [Hash, nil] optional per-section truncation spec;
571
+ # same shape as kwargs to {#truncate_markdown} (+:chars+, +:words+,
572
+ # +:at+, +:marker+). Applied to every section's +:content+; counts
573
+ # (if +statistics: true+) are recomputed on the truncated content.
574
+ # @return [Array<Hash>] section records
575
def chunks_by_heading(truncate: nil)
  return [] if @source.empty?

  # Work on a private copy of the native options so the optional
  # per-section truncation spec can be attached under :truncate.
  native_opts = @options.to_native_hash_frozen.dup.tap do |opts|
    opts[:truncate] = Inkmark.normalize_truncate_params(truncate) if truncate
  end
  Inkmark._native_chunks_by_heading(@source, native_opts)
end
581
+
582
+ # Split the stored document into sliding-window chunks. See
583
+ # {.chunks_by_size} for the full parameter contract.
584
+ #
585
+ # @return [Array<Hash>] each +{index:, content:}+, with counts
586
+ # when +statistics: true+
587
def chunks_by_size(chars: nil, words: nil, overlap: 0, at: :block)
  # Empty source returns before any parameter validation takes place.
  return [] if @source.empty?

  # Copy the native options first, then validate the window parameters
  # and attach them under :__window.
  native_opts = @options.to_native_hash_frozen.dup
  window = Inkmark.normalize_window_params(chars: chars, words: words, overlap: overlap, at: at)
  native_opts[:__window] = window
  Inkmark._native_chunks_by_size(@source, native_opts)
end
595
+
596
+ # Truncate the stored document. See {.truncate_markdown} for the full
597
+ # parameter contract.
598
+ #
599
+ # @return [String] truncated Markdown, or the source unchanged when
600
+ # it already fits
601
def truncate_markdown(chars: nil, words: nil, at: :block, marker: "…")
  # Empty source returns before any parameter validation takes place.
  return "" if @source.empty?

  # Validate the budget first, then hand it to the native side together
  # with this instance's options.
  budget = Inkmark.normalize_truncate_params(chars: chars, words: words, at: at, marker: marker)
  Inkmark._native_truncate_markdown(@source, budget, @options.to_native_hash_frozen)
end
608
+
609
+ # Return the table of contents as a {Inkmark::Toc} value object,
610
+ # exposing +#to_markdown+ / +#to_html+ / +#to_s+ (markdown). Returns
611
+ # +nil+ when no TOC was requested (neither +toc+, +statistics+, nor
612
+ # +extract: { headings: true }+ is set).
613
+ #
614
+ # Collected during {#to_html} as a side-effect of the single-pass
615
+ # render. If +to_html+ hasn't been called yet, calling this triggers
616
+ # it.
617
+ #
618
+ # @return [Inkmark::Toc, nil]
619
+ # @example
620
+ # g.toc.to_markdown # "- [Intro](#intro)\n..."
621
+ # g.toc.to_html # "<ul><li>..."
622
+ # puts g.toc # prints markdown form (via to_s)
623
def toc
  # Falls through to nil when no option asks for the TOC to be surfaced.
  if toc_surface_requested?
    # Render when no TOC value has been stored yet; the render stores it
    # as a side effect.
    to_html if !defined?(@toc_value) || !@toc_value
    @toc_value
  end
end
628
+
629
+ # Return the collected document statistics as a Hash, or +nil+ when
630
+ # neither +statistics+ nor +toc+ is enabled.
631
+ #
632
+ # When +statistics: true+, the full hash includes language detection,
633
+ # character/word counts, code block count, and image/link arrays.
634
+ # When only +toc: true+, a lightweight hash with +heading_count+ is
635
+ # returned.
636
+ #
637
+ # Collected during {#to_html}. Calling this before +to_html+ triggers
638
+ # the render.
639
+ #
640
+ # @return [Hash, nil]
641
def statistics
  # Statistics are surfaced when either +statistics+ or +toc+ is enabled.
  wanted = @options[:statistics] || @options[:toc]
  return nil unless wanted

  # Render when nothing has been collected yet; the render stores the
  # statistics as a side effect.
  to_html if !@statistics_data
  @statistics_data
end
646
+
647
+ # Return structured extracts for the element kinds requested via
648
+ # +extract: { ... }+, or +nil+ when no kinds were requested.
649
+ #
650
+ # The returned Hash is keyed by the same symbols you passed in
651
+ # (+:images+, +:links+, +:code_blocks+, +:headings+,
652
+ # +:footnote_definitions+); each value is an Array of record Hashes
653
+ # including a +:byte_range+ Range for slicing the original source.
654
+ #
655
+ # +toc: true+ auto-enables +extract[:headings]+—the heading walk is
656
+ # shared, so you get the structured view for free.
657
+ #
658
+ # Collected during {#to_html} as a side-effect of the single-pass
659
+ # render. Calling this before +to_html+ triggers the render.
660
+ #
661
+ # @return [Hash, nil]
662
+ # @example
663
+ # md = Inkmark.new(source, options: { extract: { images: true } })
664
+ # md.extracts[:images]
665
+ # #=> [{ src: "cat.png", alt: "cat", title: "", byte_range: 12...28 }]
666
def extracts
  # Falls through to nil when no extract kind is requested.
  if extract_requested?
    # Render when nothing has been collected yet; the render stores the
    # extracts as a side effect.
    to_html if !@extracts_data
    @extracts_data
  end
end
671
+
672
+ # Return the parsed frontmatter as a Hash, or +nil+ when the document
673
+ # has no frontmatter block or the +frontmatter+ option is not enabled.
674
+ #
675
+ # The raw YAML text is extracted by Rust during the event walk;
676
+ # parsing uses Ruby's stdlib +YAML.safe_load+ so all standard YAML
677
+ # types (strings, numbers, arrays, nested hashes) are supported.
678
+ #
679
+ # @return [Hash, nil] parsed frontmatter or nil
680
+ # @example
681
+ # md = Inkmark.new("---\ntitle: Hello\n---\n\n# Content",
682
+ # options: { frontmatter: true })
683
+ # md.frontmatter #=> { "title" => "Hello" }
684
def frontmatter
  # Returns the parsed frontmatter, or nil when the +frontmatter+ option
  # is off or no raw frontmatter was collected by the render.
  #
  # The option is checked on every call and the "disabled" answer is never
  # memoized. Turning the option on later (for example through the mutable
  # options object) therefore takes effect, and turning it off hides a
  # previously parsed value, matching how #statistics and #extracts gate
  # on their options.
  return nil unless @options[:frontmatter]
  return @frontmatter if defined?(@frontmatter)

  # The raw YAML text is stored by #to_html; render first if it is missing.
  to_html unless @frontmatter_raw
  # safe_load with its default settings, as before. The parsed result
  # (or nil) is memoized.
  @frontmatter = @frontmatter_raw ? YAML.safe_load(@frontmatter_raw) : nil
end
690
+
691
+ private
692
+
693
+ # True when any request triggers the TOC walk—`toc: true`,
694
+ # `statistics: true`, or `extract: { headings: true }`. Used by
695
+ # {#toc} to decide whether to surface its computed value to the
696
+ # caller.
697
def toc_surface_requested?
  if @options[:toc] || @options[:statistics]
    true
  else
    # Otherwise only an explicit `extract: { headings: true }` counts;
    # the value must be exactly true.
    wanted = @options[:extract]
    wanted.is_a?(Hash) && wanted[:headings] == true
  end
end
702
+
703
+ # True when the user explicitly asked for any extract kind, OR when
704
+ # `toc: true` implicitly pulls headings into extracts. Matches the
705
+ # mutual trigger implemented on the Rust side.
706
def extract_requested?
  if @options[:toc]
    # The TOC option alone counts as an extract request.
    true
  else
    # Otherwise at least one kind in the extract Hash must be truthy.
    wanted = @options[:extract]
    wanted.is_a?(Hash) && wanted.values.any?
  end
end
711
+ end