pikuri-core 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/pikuri/agent.rb CHANGED
@@ -94,8 +94,16 @@ module Pikuri
94
94
  # queue is drained on every +after_tool_result+, each item
95
95
  # appended as a +role: :user+ message and emitted as
96
96
  # {Event::UserTurn} with +mid_loop: true+
97
+ # @param on_user_message [Proc, nil] when set, called with each
98
+ # drained interloper +content+ String *after* it is appended
99
+ # to the chat — the per-turn {Extension#on_user_message}
100
+ # dispatch (prefetch + recording). Threaded through here rather
101
+ # than fired inline so {Synthesizer.run}, which reuses this
102
+ # wiring without an interloper or memory, simply passes +nil+.
103
+ # Only consulted when +interloper+ is also set.
97
104
  # @return [void]
98
- def self.wire_chat(chat, listeners:, step_limit: nil, cancellable: nil, interloper: nil)
105
+ def self.wire_chat(chat, listeners:, step_limit: nil, cancellable: nil, interloper: nil,
106
+ on_user_message: nil)
99
107
  chat.after_message do |msg|
100
108
  emit_after_message(msg, listeners)
101
109
  end
@@ -106,7 +114,7 @@ module Pikuri
106
114
  end
107
115
  chat.after_tool_result do |result|
108
116
  listeners.emit(Event::ToolResult.new(content: result))
109
- drain_interloper(interloper, chat, listeners) if interloper
117
+ drain_interloper(interloper, chat, listeners, on_user_message) if interloper
110
118
  end
111
119
  end
112
120
 
@@ -216,18 +224,29 @@ module Pikuri
216
224
 
217
225
  # Drain the interloper queue: for each pending item, append a
218
226
  # +role: :user+ message to the chat history so the next
219
- # round-trip sees it, then emit an {Event::UserTurn} with
220
- # +mid_loop: true+ to the listener stream so renderers see
221
- # the injection.
227
+ # round-trip sees it, emit an {Event::UserTurn} with
228
+ # +mid_loop: true+ to the listener stream so renderers see the
229
+ # injection, then run the per-turn {Extension#on_user_message}
230
+ # dispatch (so mid-loop injections are prefetched + recorded
231
+ # exactly like initial turns).
232
+ #
233
+ # The dispatch runs *after* the +:user+ append so any
234
+ # +<memory-context>+ it injects lands as a +:system+ message
235
+ # right behind the user turn it annotates — the same
236
+ # append-at-the-tail ordering {#run_loop} produces for initial
237
+ # turns.
222
238
  #
223
239
  # @param interloper [Control::Interloper]
224
240
  # @param chat [RubyLLM::Chat]
225
241
  # @param listeners [ListenerList]
242
+ # @param on_user_message [Proc, nil] per-content dispatch; +nil+
243
+ # skips it (e.g. an interloper with no memory extension wired)
226
244
  # @return [void]
227
- def self.drain_interloper(interloper, chat, listeners)
245
+ def self.drain_interloper(interloper, chat, listeners, on_user_message = nil)
228
246
  interloper.drain!.each do |content|
229
247
  chat.add_message(role: :user, content: content)
230
248
  listeners.emit(Event::UserTurn.new(content: content, mid_loop: true))
249
+ on_user_message&.call(content)
231
250
  end
232
251
  end
233
252
  private_class_method :drain_interloper
@@ -381,71 +400,31 @@ module Pikuri
381
400
  @streaming = streaming
382
401
  @synth_answer = nil
383
402
  @on_close_handlers = []
384
-
385
- # Single Configurator funnel for everything the block adds —
386
- # tools, listeners, system-prompt snippets, extensions, and
387
- # on_close handlers. See {Configurator} for the per-method
388
- # contract.
389
- configurator = Configurator.new(
390
- transport: @transport,
391
- system_prompt_base: system_prompt,
392
- id: @id,
393
- streaming: @streaming,
394
- step_limit: @step_limit,
395
- cancellable: @cancellable,
396
- interloper: @interloper
397
- )
398
-
399
- block&.call(configurator)
400
-
401
- @tools = configurator.tools.dup
402
- @sub_agent_tools = configurator.sub_agent_tools.dup
403
- @listeners = ListenerList.new(configurator.listeners)
404
- configurator.system_prompt_additions.each do |snippet|
405
- @system_prompt = "#{@system_prompt}\n\n#{snippet}"
403
+ # Stashed for {#run_configure}, which runs the failure-prone
404
+ # build phase below out of a separate method.
405
+ @block = block
406
+ @context_window = context_window
407
+ @llama_probe_url = llama_probe_url
408
+
409
+ # Register *before* the build phase so a mid-construction raise
410
+ # is still recoverable: extensions arm their cleanup via
411
+ # +c.on_close+ (which writes straight to +@on_close_handlers+,
412
+ # see {Configurator}), and the rescue below fires whatever was
413
+ # armed before the failure. On the happy path this registration
414
+ # is the at-exit backstop if the host forgets {#close}; an
415
+ # explicit {#close} unregisters, so the agent isn't pinned alive
416
+ # until process exit.
417
+ Pikuri::Finalizers.register(self)
418
+
419
+ begin
420
+ run_configure
421
+ rescue StandardError
422
+ # Half-built agent (e.g. an extension's +configure+ raised
423
+ # Cancelled mid-spawn). Fire the handlers armed so far, drop
424
+ # out of the registry, and re-raise — no partial state leaks.
425
+ close
426
+ raise
406
427
  end
407
- @on_close_handlers.concat(configurator.on_close_handlers)
408
- @extensions = configurator.extensions.dup
409
-
410
- @chat = RubyLLM.chat(**@transport.to_h)
411
- @chat.with_instructions(@system_prompt)
412
- @tools.each { |t| @chat.with_tool(t.to_ruby_llm_tool) }
413
-
414
- @context_window_cap = ContextWindowDetector.new(
415
- override: context_window,
416
- ruby_llm_reported: @chat.model.context_window,
417
- llama_probe_url: llama_probe_url
418
- ).detect
419
-
420
- self.class.wire_chat(
421
- @chat,
422
- listeners: @listeners,
423
- step_limit: @step_limit,
424
- cancellable: @cancellable,
425
- interloper: @interloper
426
- )
427
-
428
- # One-shot context-window cap: lets every listener that
429
- # cares (notably TokenLog) pick the value off the stream
430
- # before any Tokens event arrives.
431
- @listeners.emit(Event::ContextCap.new(cap: @context_window_cap))
432
-
433
- # Bind sweep — each extension gets its chance to install
434
- # per-agent state (dynamic tools via #internal_add_tool,
435
- # per-agent close hooks via #on_close, etc.) now that the
436
- # chat is fully wired. See IDEAS.md §"Extension protocol
437
- # design" for what #configure vs #bind are each for.
438
- @extensions.each { |ext| ext.bind(self) }
439
-
440
- # Fallback cleanup: if the host forgets to call #close, the
441
- # at_exit hook fires it on process exit. Idempotent, so an
442
- # explicit close earlier makes this a no-op. The closure
443
- # captures self, which keeps the agent reachable until
444
- # process exit — fine for the handful of agents a typical
445
- # host creates; if pikuri grows a long-running host that
446
- # constructs many short-lived agents, switch to a single
447
- # process-global registry that close-then-removes.
448
- at_exit { close }
449
428
  end
450
429
 
451
430
  # @return [RubyLLM::Chat] underlying chat; the extension seam
@@ -601,13 +580,23 @@ module Pikuri
601
580
  if user_message.nil? || user_message.to_s.strip.empty?
602
581
 
603
582
  @synth_answer = nil
604
- @listeners.emit(Event::UserTurn.new(content: user_message, mid_loop: false))
605
583
  @step_limit&.reset!
606
584
  @cancellable&.reset!
585
+ # Append the user turn, emit it, then run the memory dispatch — so
586
+ # any <memory-context> the dispatch injects lands as a :system
587
+ # message *after* the user turn it annotates (append-only at the
588
+ # tail; see {#dispatch_ext_on_user_message}). `ask` would bundle the
589
+ # user-message append with completion atomically, leaving no seam to
590
+ # inject between them, so the two halves run explicitly here:
591
+ # add_message + complete (the exact pair `ask` is sugar for). A raw
592
+ # String content matches the interloper drain path.
593
+ @chat.add_message(role: :user, content: user_message)
594
+ @listeners.emit(Event::UserTurn.new(content: user_message, mid_loop: false))
595
+ dispatch_ext_on_user_message(user_message)
607
596
  if @streaming
608
- @chat.ask(user_message, &self.class.streaming_block(listeners: @listeners, cancellable: @cancellable))
597
+ @chat.complete(&self.class.streaming_block(listeners: @listeners, cancellable: @cancellable))
609
598
  else
610
- @chat.ask(user_message)
599
+ @chat.complete
611
600
  end
612
601
  nil
613
602
  rescue Control::Cancellable::Cancelled
@@ -661,6 +650,10 @@ module Pikuri
661
650
  return if @closed
662
651
 
663
652
  @closed = true
653
+ # Drop out of the process-global registry first: a deliberate
654
+ # close means this agent no longer needs the at-exit fallback,
655
+ # and removing the reference lets it be garbage-collected.
656
+ Pikuri::Finalizers.unregister(self)
664
657
  @on_close_handlers.reverse_each do |handler|
665
658
  handler.call
666
659
  rescue StandardError => e
@@ -719,5 +712,113 @@ module Pikuri
719
712
  def to_s
720
713
  "Agent(id=#{@id}, model=#{model}, tools=#{@tools.size}, listeners=#{@listeners})"
721
714
  end
715
+
716
+ private
717
+
718
+ # The failure-prone build phase, split out of {#initialize} so the
719
+ # constructor can wrap it in a rescue and self-heal. Funnels the
720
+ # +Agent.new+ block through a single {Configurator} — tools,
721
+ # listeners, system-prompt snippets, extensions, and +on_close+
722
+ # handlers — then wires the chat and runs the extension +bind+
723
+ # sweep. The Configurator's +on_close_sink:+ is +@on_close_handlers+
724
+ # itself, so a handler an extension arms via +c.on_close+ is live on
725
+ # the agent the instant it's registered — that's what lets the
726
+ # constructor's rescue close a half-built agent.
727
+ #
728
+ # @return [void]
729
+ def run_configure
730
+ configurator = Configurator.new(
731
+ transport: @transport,
732
+ system_prompt_base: @system_prompt,
733
+ id: @id,
734
+ streaming: @streaming,
735
+ step_limit: @step_limit,
736
+ cancellable: @cancellable,
737
+ interloper: @interloper,
738
+ on_close_sink: @on_close_handlers
739
+ )
740
+
741
+ @block&.call(configurator)
742
+
743
+ @tools = configurator.tools.dup
744
+ @sub_agent_tools = configurator.sub_agent_tools.dup
745
+ @listeners = ListenerList.new(configurator.listeners)
746
+ configurator.system_prompt_additions.each do |snippet|
747
+ @system_prompt = "#{@system_prompt}\n\n#{snippet}"
748
+ end
749
+ @extensions = configurator.extensions.dup
750
+
751
+ @chat = RubyLLM.chat(**@transport.to_h)
752
+ @chat.with_instructions(@system_prompt)
753
+ @tools.each { |t| @chat.with_tool(t.to_ruby_llm_tool) }
754
+
755
+ @context_window_cap = ContextWindowDetector.new(
756
+ override: @context_window,
757
+ ruby_llm_reported: @chat.model.context_window,
758
+ llama_probe_url: @llama_probe_url,
759
+ model_id: @chat.model.id
760
+ ).detect
761
+
762
+ self.class.wire_chat(
763
+ @chat,
764
+ listeners: @listeners,
765
+ step_limit: @step_limit,
766
+ cancellable: @cancellable,
767
+ interloper: @interloper,
768
+ on_user_message: method(:dispatch_ext_on_user_message)
769
+ )
770
+
771
+ # One-shot context-window cap: lets every listener that
772
+ # cares (notably TokenLog) pick the value off the stream
773
+ # before any Tokens event arrives.
774
+ @listeners.emit(Event::ContextCap.new(cap: @context_window_cap))
775
+
776
+ # Bind sweep — each extension gets its chance to install
777
+ # per-agent state (dynamic tools via #internal_add_tool,
778
+ # per-agent close hooks via #on_close, etc.) now that the
779
+ # chat is fully wired. See IDEAS.md §"Extension protocol
780
+ # design" for what #configure vs #bind are each for.
781
+ @extensions.each { |ext| ext.bind(self) }
782
+ end
783
+
784
+ # Fire the per-turn {Extension#on_user_message} hook on every
785
+ # extension (unconditionally), appending any non-blank returned
786
+ # +<memory-context>+ block to the chat as a +role: :system+
787
+ # message right after the user turn it annotates (callers append
788
+ # the +:user+ message first; this runs last). The system role is
789
+ # load-bearing — it tags the block as recalled reference (not new
790
+ # input) and keeps it excludable from a later extraction pass.
791
+ # See {Extension#on_user_message}.
792
+ #
793
+ # Each injected block also emits an {Event::SystemInjected} at
794
+ # this site, so the listener stream mirrors the log growth (the
795
+ # Terminal renders it; otherwise an injection would be invisible
796
+ # except as a downstream echo in the assistant's reasoning).
797
+ #
798
+ # Private and the single place the chat log grows by a memory
799
+ # block — keeps "what mutates the log, when" one grep in this
800
+ # file. Fired from {#run_loop} (initial turn) and, via the
801
+ # +on_user_message:+ proc threaded into {.wire_chat}, from
802
+ # {.drain_interloper} (mid-loop interlopers). Called on every
803
+ # extension unconditionally — same as {Extension#configure} /
804
+ # {Extension#bind}: the hook is part of the protocol and the
805
+ # {Extension} module supplies a no-op default, so any extension
806
+ # that includes the module responds. An extension is "opted out"
807
+ # by leaving the default in place (it returns +nil+, injecting
808
+ # nothing), not by omitting the method.
809
+ #
810
+ # @param content [String] the incoming user message
811
+ # @return [void]
812
+ def dispatch_ext_on_user_message(content)
813
+ @extensions.each do |ext|
814
+ message = ext.on_user_message(self, content)
815
+ next unless message.is_a?(String) && !message.strip.empty?
816
+
817
+ block = message.strip
818
+ @chat.add_message(role: :system, content: block)
819
+ @listeners.emit(Event::SystemInjected.new(content: block))
820
+ end
821
+ nil
822
+ end
722
823
  end
723
824
  end
@@ -0,0 +1,303 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'readability'
6
+ require 'reverse_markdown'
7
+
8
+ module Pikuri
9
+ module Extractor
10
+ # HTML → Markdown extractor.
11
+ #
12
+ # Matched by content-type only (+text/html+ /
13
+ # +application/xhtml+xml+) — deliberately no byte sniff. The web
14
+ # path always has the header; for local files a sniff would route
15
+ # +Workspace::Read+ of an +.html+ source file through readability
16
+ # extraction, when a developer reading an HTML file wants the
17
+ # source. Local HTML stays on the {Passthrough} arm until a
18
+ # consumer genuinely needs otherwise.
19
+ #
20
+ # Always renders both views of the page when available:
21
+ #
22
+ # 1. JSON-LD section. The first +<script type="application/ld+json">+ node
23
+ # whose +@type+ matches a substantive schema.org content type
24
+ # (Product, Article, Recipe, ...) is rendered as a header — title,
25
+ # metadata bullets (brand, SKU, price, rating, author, published),
26
+ # and the +articleBody+/+description+ copy when present.
27
+ # 2. Readability section. The page is run through +Readability+ +
28
+ # +reverse_markdown+, with a +<main>+/+<article>+ fallback for
29
+ # pages whose content sits mostly outside +<p>+ tags.
30
+ #
31
+ # Concatenated with a horizontal rule, so the LLM gets both the
32
+ # structured metadata and the rendered body and can pick whichever
33
+ # is more useful for the task. Trades some duplication (when a
34
+ # publisher embeds the article body in JSON-LD AND in HTML) for
35
+ # fewer type-based heuristics on which branch should win — the
36
+ # earlier "is this Article's +description+ a teaser or the real
37
+ # body?" carve-out is no longer needed because both end up in
38
+ # the output regardless.
39
+ module HTML
40
+ # @return [Array<String>] content-types this extractor claims.
41
+ CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
42
+
43
+ # @return [Array<String>] schema.org +@type+ values that we treat
44
+ # as "the primary entity of this page" when picking a JSON-LD
45
+ # node to render. Order of this list does not matter — the first matching
46
+ # node wins. Skips noise nodes (Organization, BreadcrumbList,
47
+ # WebSite, ...) that ship on most pages but carry no page
48
+ # content.
49
+ INTERESTING_TYPES = %w[
50
+ Product Article NewsArticle BlogPosting Recipe Event Book Movie
51
+ ].freeze
52
+
53
+ # @return [Array<String>] HTML tags preserved by the readability
54
+ # pass. Anything outside this list is stripped before Markdown
55
+ # conversion.
56
+ READABILITY_TAGS = %w[
57
+ h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
58
+ strong em b i br hr table thead tbody tr td th
59
+ ].freeze
60
+
61
+ # @return [Array<String>] HTML attributes preserved by the
62
+ # readability pass; everything else (class, id, style, data-*)
63
+ # is dropped before Markdown conversion
64
+ READABILITY_ATTRS = %w[href src alt title].freeze
65
+
66
+ # @return [Float] minimum +<main>+/+<article>+ to Readability
67
+ # text-length ratio that triggers the semantic-container
68
+ # fallback in {.readability_to_markdown}. Picked low enough to
69
+ # catch the failure mode (Readability collapsing a page that
70
+ # uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
71
+ # ~5x) but high enough that pages where both produce
72
+ # comparable output keep Readability's noise filtering.
73
+ MAIN_FALLBACK_RATIO = 2.0
74
+
75
+ # @return [Integer] minimum text length the
76
+ # +<main>+/+<article>+ container must hold before the fallback
77
+ # in {.readability_to_markdown} can fire. Below this, the
78
+ # ratio comparison is dominated by noise and we'd swap on
79
+ # tiny pages where Readability is doing the right thing.
80
+ MAIN_FALLBACK_MIN_CHARS = 500
81
+
82
+ # @return [Symbol] {Page#kind} tag.
83
+ def self.kind
84
+ :html
85
+ end
86
+
87
+ # @param sample [String] leading bytes of the content (unused —
88
+ # see the no-sniff rationale in the module doc).
89
+ # @param content_type [String, nil] normalized content-type.
90
+ # @return [Boolean]
91
+ def self.matches?(sample:, content_type:)
92
+ CONTENT_TYPES.include?(content_type)
93
+ end
94
+
95
+ # Render the HTML document behind +io+ as Markdown by emitting
96
+ # both the JSON-LD section (when an interesting node is present)
97
+ # and the readability / +<main>+ section, joined by a horizontal
98
+ # rule. Either section may be missing — pages with no JSON-LD
99
+ # return only the readability output, and a malformed page with
100
+ # no extractable body returns only the JSON-LD render.
101
+ #
102
+ # @param io [IO, StringIO] IO over the HTML document.
103
+ # @return [String] Markdown representation
104
+ def self.extract(io)
105
+ html = io.read
106
+ sections = [jsonld_section(html), readability_to_markdown(html)]
107
+ sections.reject! { |s| s.nil? || s.strip.empty? }
108
+ sections.join("\n\n---\n\n")
109
+ end
110
+
111
+ # Pick the first JSON-LD node whose +@type+ matches one of
112
+ # {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
113
+ # when no such node exists, in which case {.extract} emits only
114
+ # the readability section.
115
+ #
116
+ # No content-field gating: a node carrying just +name+/+author+/
117
+ # +datePublished+ still renders (as a metadata-only header),
118
+ # because the readability pass independently produces the page
119
+ # body. That is the trade-off that lets us drop the type-based
120
+ # "is this teaser or article copy?" heuristics — duplication is
121
+ # acceptable when both views are available, and the LLM can
122
+ # pick whichever it needs.
123
+ #
124
+ # @param html [String] HTML document body
125
+ # @return [String, nil] Markdown render of the picked JSON-LD
126
+ # node, or +nil+ when nothing matched
127
+ def self.jsonld_section(html)
128
+ node = parse_jsonld(html).find do |n|
129
+ Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
130
+ end
131
+ node ? jsonld_to_markdown(node) : nil
132
+ end
133
+
134
+ # Collect every JSON-LD payload embedded in +html+, flattening
135
+ # +@graph+ wrappers so callers see one flat array of schema.org
136
+ # nodes. Malformed JSON blocks are silently skipped — sites
137
+ # frequently ship broken JSON-LD and we only need at least one
138
+ # parseable block.
139
+ #
140
+ # @param html [String] HTML document body
141
+ # @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
142
+ def self.parse_jsonld(html)
143
+ doc = Nokogiri::HTML(html)
144
+ blobs = doc.css('script[type="application/ld+json"]').map(&:text)
145
+
146
+ blobs.flat_map do |raw|
147
+ parsed = begin
148
+ JSON.parse(raw)
149
+ rescue JSON::ParserError
150
+ nil
151
+ end
152
+ next [] unless parsed
153
+
154
+ nodes = parsed.is_a?(Array) ? parsed : [parsed]
155
+ nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
156
+ end
157
+ end
158
+
159
+ # Render a single JSON-LD +node+ as Markdown: a top-level title
160
+ # from +name+/+headline+, a bullet list of common useful fields
161
+ # (brand, SKU, price, rating, author, published date, ...), the
162
+ # body copy, and the lead image.
163
+ #
164
+ # When the node carries +articleBody+ (the full publisher-supplied
165
+ # article text), that wins over +description+ — the description
166
+ # is typically a lede teaser and would just repeat the article's
167
+ # opening lines.
168
+ #
169
+ # @param node [Hash] JSON-LD node, typically picked by
170
+ # {.jsonld_section}
171
+ # @return [String] Markdown representation
172
+ def self.jsonld_to_markdown(node)
173
+ out = +''
174
+ name = node['name'] || node['headline']
175
+ out << "# #{name}\n\n" if name
176
+
177
+ offer = first_obj(node['offers'])
178
+ rating = first_obj(node['aggregateRating'])
179
+ brand = first_obj_or_string(node['brand'])
180
+ author = first_obj_or_string(node['author'])
181
+
182
+ brand_name = brand.is_a?(Hash) ? brand['name'] : brand
183
+ author_name = author.is_a?(Hash) ? author['name'] : author
184
+
185
+ fields = {
186
+ 'Brand' => brand_name,
187
+ 'SKU' => node['sku'],
188
+ 'GTIN' => node['gtin13'] || node['gtin'],
189
+ 'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
190
+ 'Availability' => offer['availability'],
191
+ 'Rating' => rating['ratingValue'],
192
+ 'Reviews' => rating['reviewCount'],
193
+ 'Author' => author_name,
194
+ 'Published' => node['datePublished']
195
+ }.reject { |_, v| v.nil? || v.to_s.strip.empty? }
196
+
197
+ unless fields.empty?
198
+ fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
199
+ out << "\n"
200
+ end
201
+
202
+ if (body = node['articleBody'] || node['description'])
203
+ out << "#{body}\n\n"
204
+ end
205
+
206
+ if (img = node['image'])
207
+ img = img.first if img.is_a?(Array)
208
+ img = img['url'] if img.is_a?(Hash)
209
+ out << "![image](#{img})\n\n" if img
210
+ end
211
+
212
+ out
213
+ end
214
+
215
+ # Run +Readability+ over +html+ to isolate the main content node,
216
+ # then convert that to Markdown via +reverse_markdown+. The page
217
+ # +<title>+ is rendered as a top-level heading.
218
+ #
219
+ # When the page uses semantic HTML5 (+<main>+ or +<article>+) but
220
+ # leaves most of its content outside +<p>+ tags — divs, lists,
221
+ # spans — Readability's paragraph-density scoring collapses the
222
+ # extraction to a sliver of the page. In that case we render the
223
+ # +<main>+/+<article>+ container directly. The fallback only
224
+ # fires when the container holds substantially more text than
225
+ # Readability picked up (see {MAIN_FALLBACK_RATIO} /
226
+ # {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
227
+ # Readability so its noise filtering still strips nav/ads/etc.
228
+ #
229
+ # @param html [String] HTML document body
230
+ # @return [String] Markdown representation
231
+ def self.readability_to_markdown(html)
232
+ rdoc = Readability::Document.new(
233
+ html,
234
+ tags: READABILITY_TAGS,
235
+ attributes: READABILITY_ATTRS,
236
+ remove_empty_nodes: true
237
+ )
238
+ readability_html = rdoc.content
239
+ title = rdoc.title
240
+
241
+ body_html = main_fallback_html(html, readability_html) || readability_html
242
+ body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)
243
+
244
+ out = +''
245
+ out << "# #{title.strip}\n\n" if title && !title.strip.empty?
246
+ out << body
247
+ out
248
+ end
249
+
250
+ # If +html+ has a +<main>+ or +<article>+ element holding
251
+ # substantially more text than Readability extracted, return that
252
+ # container's HTML so the caller can render it instead. Returns
253
+ # +nil+ when the fallback should not fire — when there is no
254
+ # semantic container, when it's too small to be meaningful, or
255
+ # when Readability's output is already comparable.
256
+ #
257
+ # @param html [String] full HTML document body, used to locate
258
+ # the +<main>+/+<article>+ container
259
+ # @param readability_html [String] HTML produced by
260
+ # +Readability::Document#content+, used as the comparison
261
+ # baseline
262
+ # @return [String, nil] container HTML when the fallback should
263
+ # fire, +nil+ otherwise
264
+ def self.main_fallback_html(html, readability_html)
265
+ doc = Nokogiri::HTML(html)
266
+ container = doc.at_css('main') || doc.at_css('article')
267
+ return nil unless container
268
+
269
+ container_text_len = container.text.gsub(/\s+/, ' ').strip.length
270
+ return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS
271
+
272
+ readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
273
+ return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len
274
+
275
+ container.to_html
276
+ end
277
+ private_class_method :main_fallback_html
278
+
279
+ # JSON-LD fields can be a string, hash, or array of either.
280
+ # Normalize to a single hash (the first element if it's a list) so
281
+ # callers can index it with +[]+ without nil checks.
282
+ #
283
+ # @param value [Object] raw JSON-LD field value
284
+ # @return [Hash] empty hash when +value+ does not contain a hash
285
+ def self.first_obj(value)
286
+ value = value.first if value.is_a?(Array)
287
+ value.is_a?(Hash) ? value : {}
288
+ end
289
+ private_class_method :first_obj
290
+
291
+ # Same idea as {.first_obj} but preserves a bare string (e.g.
292
+ # +brand: "Apple"+) instead of replacing it with +{}+.
293
+ #
294
+ # @param value [Object] raw JSON-LD field value
295
+ # @return [String, Hash, nil]
296
+ def self.first_obj_or_string(value)
297
+ value = value.first if value.is_a?(Array)
298
+ value
299
+ end
300
+ private_class_method :first_obj_or_string
301
+ end
302
+ end
303
+ end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ module Extractor
5
+ # The terminal plain-text arm of the registry: content that *is*
6
+ # already text needs no extraction, so it passes through verbatim
7
+ # (forced to UTF-8 — invalid bytes are left in for downstream to
8
+ # deal with, matching what +File.read+ with a UTF-8 encoding does).
9
+ # Markdown, source files, JSON, robots.txt all land here.
10
+ #
11
+ # Matching is split by whether the transport supplied a
12
+ # content-type:
13
+ #
14
+ # * With a content-type (the web path): claim +text/*+ only.
15
+ # A non-text type that no earlier extractor claimed is *not*
16
+ # second-guessed by sniffing — a server declaring
17
+ # +application/octet-stream+ gets the {Unsupported} refusal the
18
+ # LLM can react to, same as before this registry existed.
19
+ # * Without one (the local-file path, where {FileType.detect_mime}
20
+ # returned +nil+ for "unrecognised"): claim anything whose sample
21
+ # the {FileType.binary?} heuristic does not flag as binary. Opaque
22
+ # binaries stay unclaimed and surface as {Unsupported}.
23
+ module Passthrough
24
+ # @return [Symbol] {Page#kind} tag.
25
+ def self.kind
26
+ :text
27
+ end
28
+
29
+ # @param sample [String] leading bytes of the content.
30
+ # @param content_type [String, nil] normalized content-type,
31
+ # +nil+ when the transport has none.
32
+ # @return [Boolean]
33
+ def self.matches?(sample:, content_type:)
34
+ return content_type.start_with?('text/') unless content_type.nil?
35
+
36
+ !FileType.binary?(sample)
37
+ end
38
+
39
+ # @param io [IO, StringIO] IO over the text content.
40
+ # @return [String] the content, tagged UTF-8. Deliberately NOT
41
+ # derived from {.extract_lines} — a passthrough must stay
42
+ # verbatim (trailing newline, CRLF line endings), which a
43
+ # join of chomped lines would silently normalize away.
44
+ def self.extract(io)
45
+ io.read.force_encoding(Encoding::UTF_8)
46
+ end
47
+
48
+ # The lazy line stream for {Extractor.extract_paged}: the IO is
49
+ # read line-by-line, so a window over the head of a gigabyte
50
+ # log never loads the rest. Consuming the whole stream is a
51
+ # cheap sequential read — which is why the paging window counts
52
+ # this stream's tail for an exact +total_lines+ (see
53
+ # {Extractor.extract_paged}).
54
+ #
55
+ # @param io [IO, StringIO] IO over the text content; must
56
+ # remain open while the enumerator is consumed.
57
+ # @return [Enumerator::Lazy<String>] chomped lines, tagged
58
+ # UTF-8.
59
+ def self.extract_lines(io)
60
+ io.each_line.lazy.map { |raw| raw.chomp.force_encoding(Encoding::UTF_8) }
61
+ end
62
+ end
63
+ end
64
+ end