pikuri-core 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/agent/configurator.rb +9 -2
- data/lib/pikuri/agent/context_window_detector.rb +70 -10
- data/lib/pikuri/agent/control/interloper.rb +10 -2
- data/lib/pikuri/agent/event.rb +15 -0
- data/lib/pikuri/agent/extension.rb +37 -9
- data/lib/pikuri/agent/listener/terminal.rb +22 -36
- data/lib/pikuri/agent.rb +174 -73
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +87 -59
- data/lib/pikuri/finalizers.rb +118 -0
- data/lib/pikuri/paths.rb +29 -0
- data/lib/pikuri/subprocess.rb +109 -12
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +0 -1
- metadata +8 -62
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
data/lib/pikuri/agent.rb
CHANGED
|
@@ -94,8 +94,16 @@ module Pikuri
|
|
|
94
94
|
# queue is drained on every +after_tool_result+, each item
|
|
95
95
|
# appended as a +role: :user+ message and emitted as
|
|
96
96
|
# {Event::UserTurn} with +mid_loop: true+
|
|
97
|
+
# @param on_user_message [Proc, nil] when set, called with each
|
|
98
|
+
# drained interloper +content+ String *after* it is appended
|
|
99
|
+
# to the chat — the per-turn {Extension#on_user_message}
|
|
100
|
+
# dispatch (prefetch + recording). Threaded through here rather
|
|
101
|
+
# than fired inline so {Synthesizer.run}, which reuses this
|
|
102
|
+
# wiring without an interloper or memory, simply passes +nil+.
|
|
103
|
+
# Only consulted when +interloper+ is also set.
|
|
97
104
|
# @return [void]
|
|
98
|
-
def self.wire_chat(chat, listeners:, step_limit: nil, cancellable: nil, interloper: nil)
|
|
105
|
+
def self.wire_chat(chat, listeners:, step_limit: nil, cancellable: nil, interloper: nil,
|
|
106
|
+
on_user_message: nil)
|
|
99
107
|
chat.after_message do |msg|
|
|
100
108
|
emit_after_message(msg, listeners)
|
|
101
109
|
end
|
|
@@ -106,7 +114,7 @@ module Pikuri
|
|
|
106
114
|
end
|
|
107
115
|
chat.after_tool_result do |result|
|
|
108
116
|
listeners.emit(Event::ToolResult.new(content: result))
|
|
109
|
-
drain_interloper(interloper, chat, listeners) if interloper
|
|
117
|
+
drain_interloper(interloper, chat, listeners, on_user_message) if interloper
|
|
110
118
|
end
|
|
111
119
|
end
|
|
112
120
|
|
|
@@ -216,18 +224,29 @@ module Pikuri
|
|
|
216
224
|
|
|
217
225
|
# Drain the interloper queue: for each pending item, append a
|
|
218
226
|
# +role: :user+ message to the chat history so the next
|
|
219
|
-
# round-trip sees it, and emit an {Event::UserTurn} with
|
|
220
|
-
# +mid_loop: true+ to the listener stream so renderers see
|
|
221
|
-
# the injection.
|
|
227
|
+
# round-trip sees it, emit an {Event::UserTurn} with
|
|
228
|
+
# +mid_loop: true+ to the listener stream so renderers see the
|
|
229
|
+
# injection, then run the per-turn {Extension#on_user_message}
|
|
230
|
+
# dispatch (so mid-loop injections are prefetched + recorded
|
|
231
|
+
# exactly like initial turns).
|
|
232
|
+
#
|
|
233
|
+
# The dispatch runs *after* the +:user+ append so any
|
|
234
|
+
# +<memory-context>+ it injects lands as a +:system+ message
|
|
235
|
+
# right behind the user turn it annotates — the same
|
|
236
|
+
# append-at-the-tail ordering {#run_loop} produces for initial
|
|
237
|
+
# turns.
|
|
222
238
|
#
|
|
223
239
|
# @param interloper [Control::Interloper]
|
|
224
240
|
# @param chat [RubyLLM::Chat]
|
|
225
241
|
# @param listeners [ListenerList]
|
|
242
|
+
# @param on_user_message [Proc, nil] per-content dispatch; +nil+
|
|
243
|
+
# skips it (e.g. an interloper with no memory extension wired)
|
|
226
244
|
# @return [void]
|
|
227
|
-
def self.drain_interloper(interloper, chat, listeners)
|
|
245
|
+
def self.drain_interloper(interloper, chat, listeners, on_user_message = nil)
|
|
228
246
|
interloper.drain!.each do |content|
|
|
229
247
|
chat.add_message(role: :user, content: content)
|
|
230
248
|
listeners.emit(Event::UserTurn.new(content: content, mid_loop: true))
|
|
249
|
+
on_user_message&.call(content)
|
|
231
250
|
end
|
|
232
251
|
end
|
|
233
252
|
private_class_method :drain_interloper
|
|
@@ -381,71 +400,31 @@ module Pikuri
|
|
|
381
400
|
@streaming = streaming
|
|
382
401
|
@synth_answer = nil
|
|
383
402
|
@on_close_handlers = []
|
|
384
|
-
|
|
385
|
-
#
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
403
|
+
# Stashed for {#run_configure}, which runs the failure-prone
|
|
404
|
+
# build phase below out of a separate method.
|
|
405
|
+
@block = block
|
|
406
|
+
@context_window = context_window
|
|
407
|
+
@llama_probe_url = llama_probe_url
|
|
408
|
+
|
|
409
|
+
# Register *before* the build phase so a mid-construction raise
|
|
410
|
+
# is still recoverable: extensions arm their cleanup via
|
|
411
|
+
# +c.on_close+ (which writes straight to +@on_close_handlers+,
|
|
412
|
+
# see {Configurator}), and the rescue below fires whatever was
|
|
413
|
+
# armed before the failure. On the happy path this registration
|
|
414
|
+
# is the at-exit backstop if the host forgets {#close}; an
|
|
415
|
+
# explicit {#close} unregisters, so the agent isn't pinned alive
|
|
416
|
+
# until process exit.
|
|
417
|
+
Pikuri::Finalizers.register(self)
|
|
418
|
+
|
|
419
|
+
begin
|
|
420
|
+
run_configure
|
|
421
|
+
rescue StandardError
|
|
422
|
+
# Half-built agent (e.g. an extension's +configure+ raised
|
|
423
|
+
# Cancelled mid-spawn). Fire the handlers armed so far, drop
|
|
424
|
+
# out of the registry, and re-raise — no partial state leaks.
|
|
425
|
+
close
|
|
426
|
+
raise
|
|
406
427
|
end
|
|
407
|
-
@on_close_handlers.concat(configurator.on_close_handlers)
|
|
408
|
-
@extensions = configurator.extensions.dup
|
|
409
|
-
|
|
410
|
-
@chat = RubyLLM.chat(**@transport.to_h)
|
|
411
|
-
@chat.with_instructions(@system_prompt)
|
|
412
|
-
@tools.each { |t| @chat.with_tool(t.to_ruby_llm_tool) }
|
|
413
|
-
|
|
414
|
-
@context_window_cap = ContextWindowDetector.new(
|
|
415
|
-
override: context_window,
|
|
416
|
-
ruby_llm_reported: @chat.model.context_window,
|
|
417
|
-
llama_probe_url: llama_probe_url
|
|
418
|
-
).detect
|
|
419
|
-
|
|
420
|
-
self.class.wire_chat(
|
|
421
|
-
@chat,
|
|
422
|
-
listeners: @listeners,
|
|
423
|
-
step_limit: @step_limit,
|
|
424
|
-
cancellable: @cancellable,
|
|
425
|
-
interloper: @interloper
|
|
426
|
-
)
|
|
427
|
-
|
|
428
|
-
# One-shot context-window cap: lets every listener that
|
|
429
|
-
# cares (notably TokenLog) pick the value off the stream
|
|
430
|
-
# before any Tokens event arrives.
|
|
431
|
-
@listeners.emit(Event::ContextCap.new(cap: @context_window_cap))
|
|
432
|
-
|
|
433
|
-
# Bind sweep — each extension gets its chance to install
|
|
434
|
-
# per-agent state (dynamic tools via #internal_add_tool,
|
|
435
|
-
# per-agent close hooks via #on_close, etc.) now that the
|
|
436
|
-
# chat is fully wired. See IDEAS.md §"Extension protocol
|
|
437
|
-
# design" for what #configure vs #bind are each for.
|
|
438
|
-
@extensions.each { |ext| ext.bind(self) }
|
|
439
|
-
|
|
440
|
-
# Fallback cleanup: if the host forgets to call #close, the
|
|
441
|
-
# at_exit hook fires it on process exit. Idempotent, so an
|
|
442
|
-
# explicit close earlier makes this a no-op. The closure
|
|
443
|
-
# captures self, which keeps the agent reachable until
|
|
444
|
-
# process exit — fine for the handful of agents a typical
|
|
445
|
-
# host creates; if pikuri grows a long-running host that
|
|
446
|
-
# constructs many short-lived agents, switch to a single
|
|
447
|
-
# process-global registry that close-then-removes.
|
|
448
|
-
at_exit { close }
|
|
449
428
|
end
|
|
450
429
|
|
|
451
430
|
# @return [RubyLLM::Chat] underlying chat; the extension seam
|
|
@@ -601,13 +580,23 @@ module Pikuri
|
|
|
601
580
|
if user_message.nil? || user_message.to_s.strip.empty?
|
|
602
581
|
|
|
603
582
|
@synth_answer = nil
|
|
604
|
-
@listeners.emit(Event::UserTurn.new(content: user_message, mid_loop: false))
|
|
605
583
|
@step_limit&.reset!
|
|
606
584
|
@cancellable&.reset!
|
|
585
|
+
# Append the user turn, emit it, then run the memory dispatch — so
|
|
586
|
+
# any <memory-context> the dispatch injects lands as a :system
|
|
587
|
+
# message *after* the user turn it annotates (append-only at the
|
|
588
|
+
# tail; see {#dispatch_ext_on_user_message}). `ask` would bundle the
|
|
589
|
+
# user-message append with completion atomically, leaving no seam to
|
|
590
|
+
# inject between them, so the two halves run explicitly here:
|
|
591
|
+
# add_message + complete (the exact pair `ask` is sugar for). A raw
|
|
592
|
+
# String content matches the interloper drain path.
|
|
593
|
+
@chat.add_message(role: :user, content: user_message)
|
|
594
|
+
@listeners.emit(Event::UserTurn.new(content: user_message, mid_loop: false))
|
|
595
|
+
dispatch_ext_on_user_message(user_message)
|
|
607
596
|
if @streaming
|
|
608
|
-
@chat.ask(user_message, &self.class.streaming_block(listeners: @listeners, cancellable: @cancellable))
|
|
597
|
+
@chat.complete(&self.class.streaming_block(listeners: @listeners, cancellable: @cancellable))
|
|
609
598
|
else
|
|
610
|
-
@chat.ask(user_message)
|
|
599
|
+
@chat.complete
|
|
611
600
|
end
|
|
612
601
|
nil
|
|
613
602
|
rescue Control::Cancellable::Cancelled
|
|
@@ -661,6 +650,10 @@ module Pikuri
|
|
|
661
650
|
return if @closed
|
|
662
651
|
|
|
663
652
|
@closed = true
|
|
653
|
+
# Drop out of the process-global registry first: a deliberate
|
|
654
|
+
# close means this agent no longer needs the at-exit fallback,
|
|
655
|
+
# and removing the reference lets it be garbage-collected.
|
|
656
|
+
Pikuri::Finalizers.unregister(self)
|
|
664
657
|
@on_close_handlers.reverse_each do |handler|
|
|
665
658
|
handler.call
|
|
666
659
|
rescue StandardError => e
|
|
@@ -719,5 +712,113 @@ module Pikuri
|
|
|
719
712
|
def to_s
|
|
720
713
|
"Agent(id=#{@id}, model=#{model}, tools=#{@tools.size}, listeners=#{@listeners})"
|
|
721
714
|
end
|
|
715
|
+
|
|
716
|
+
private
|
|
717
|
+
|
|
718
|
+
# The failure-prone build phase, split out of {#initialize} so the
|
|
719
|
+
# constructor can wrap it in a rescue and self-heal. Funnels the
|
|
720
|
+
# +Agent.new+ block through a single {Configurator} — tools,
|
|
721
|
+
# listeners, system-prompt snippets, extensions, and +on_close+
|
|
722
|
+
# handlers — then wires the chat and runs the extension +bind+
|
|
723
|
+
# sweep. The Configurator's +on_close_sink:+ is +@on_close_handlers+
|
|
724
|
+
# itself, so a handler an extension arms via +c.on_close+ is live on
|
|
725
|
+
# the agent the instant it's registered — that's what lets the
|
|
726
|
+
# constructor's rescue close a half-built agent.
|
|
727
|
+
#
|
|
728
|
+
# @return [void]
|
|
729
|
+
def run_configure
|
|
730
|
+
configurator = Configurator.new(
|
|
731
|
+
transport: @transport,
|
|
732
|
+
system_prompt_base: @system_prompt,
|
|
733
|
+
id: @id,
|
|
734
|
+
streaming: @streaming,
|
|
735
|
+
step_limit: @step_limit,
|
|
736
|
+
cancellable: @cancellable,
|
|
737
|
+
interloper: @interloper,
|
|
738
|
+
on_close_sink: @on_close_handlers
|
|
739
|
+
)
|
|
740
|
+
|
|
741
|
+
@block&.call(configurator)
|
|
742
|
+
|
|
743
|
+
@tools = configurator.tools.dup
|
|
744
|
+
@sub_agent_tools = configurator.sub_agent_tools.dup
|
|
745
|
+
@listeners = ListenerList.new(configurator.listeners)
|
|
746
|
+
configurator.system_prompt_additions.each do |snippet|
|
|
747
|
+
@system_prompt = "#{@system_prompt}\n\n#{snippet}"
|
|
748
|
+
end
|
|
749
|
+
@extensions = configurator.extensions.dup
|
|
750
|
+
|
|
751
|
+
@chat = RubyLLM.chat(**@transport.to_h)
|
|
752
|
+
@chat.with_instructions(@system_prompt)
|
|
753
|
+
@tools.each { |t| @chat.with_tool(t.to_ruby_llm_tool) }
|
|
754
|
+
|
|
755
|
+
@context_window_cap = ContextWindowDetector.new(
|
|
756
|
+
override: @context_window,
|
|
757
|
+
ruby_llm_reported: @chat.model.context_window,
|
|
758
|
+
llama_probe_url: @llama_probe_url,
|
|
759
|
+
model_id: @chat.model.id
|
|
760
|
+
).detect
|
|
761
|
+
|
|
762
|
+
self.class.wire_chat(
|
|
763
|
+
@chat,
|
|
764
|
+
listeners: @listeners,
|
|
765
|
+
step_limit: @step_limit,
|
|
766
|
+
cancellable: @cancellable,
|
|
767
|
+
interloper: @interloper,
|
|
768
|
+
on_user_message: method(:dispatch_ext_on_user_message)
|
|
769
|
+
)
|
|
770
|
+
|
|
771
|
+
# One-shot context-window cap: lets every listener that
|
|
772
|
+
# cares (notably TokenLog) pick the value off the stream
|
|
773
|
+
# before any Tokens event arrives.
|
|
774
|
+
@listeners.emit(Event::ContextCap.new(cap: @context_window_cap))
|
|
775
|
+
|
|
776
|
+
# Bind sweep — each extension gets its chance to install
|
|
777
|
+
# per-agent state (dynamic tools via #internal_add_tool,
|
|
778
|
+
# per-agent close hooks via #on_close, etc.) now that the
|
|
779
|
+
# chat is fully wired. See IDEAS.md §"Extension protocol
|
|
780
|
+
# design" for what #configure vs #bind are each for.
|
|
781
|
+
@extensions.each { |ext| ext.bind(self) }
|
|
782
|
+
end
|
|
783
|
+
|
|
784
|
+
# Fire the per-turn {Extension#on_user_message} hook on every
|
|
785
|
+
# extension that defines it, appending any returned
|
|
786
|
+
# +<memory-context>+ block to the chat as a +role: :system+
|
|
787
|
+
# message right after the user turn it annotates (callers append
|
|
788
|
+
# the +:user+ message first; this runs last). The system role is
|
|
789
|
+
# load-bearing — it tags the block as recalled reference (not new
|
|
790
|
+
# input) and keeps it excludable from a later extraction pass.
|
|
791
|
+
# See {Extension#on_user_message}.
|
|
792
|
+
#
|
|
793
|
+
# Each injected block also emits an {Event::SystemInjected} at
|
|
794
|
+
# this site, so the listener stream mirrors the log growth (the
|
|
795
|
+
# Terminal renders it; otherwise an injection would be invisible
|
|
796
|
+
# except as a downstream echo in the assistant's reasoning).
|
|
797
|
+
#
|
|
798
|
+
# Private and the single place the chat log grows by a memory
|
|
799
|
+
# block — keeps "what mutates the log, when" one grep in this
|
|
800
|
+
# file. Fired from {#run_loop} (initial turn) and, via the
|
|
801
|
+
# +on_user_message:+ proc threaded into {.wire_chat}, from
|
|
802
|
+
# {.drain_interloper} (mid-loop interlopers). Called on every
|
|
803
|
+
# extension unconditionally — same as {Extension#configure} /
|
|
804
|
+
# {Extension#bind}: the hook is part of the protocol and the
|
|
805
|
+
# {Extension} module supplies a no-op default, so any extension
|
|
806
|
+
# that includes the module responds. An extension is "opted out"
|
|
807
|
+
# by leaving the default in place (it returns +nil+, injecting
|
|
808
|
+
# nothing), not by omitting the method.
|
|
809
|
+
#
|
|
810
|
+
# @param content [String] the incoming user message
|
|
811
|
+
# @return [void]
|
|
812
|
+
def dispatch_ext_on_user_message(content)
|
|
813
|
+
@extensions.each do |ext|
|
|
814
|
+
message = ext.on_user_message(self, content)
|
|
815
|
+
next unless message.is_a?(String) && !message.strip.empty?
|
|
816
|
+
|
|
817
|
+
block = message.strip
|
|
818
|
+
@chat.add_message(role: :system, content: block)
|
|
819
|
+
@listeners.emit(Event::SystemInjected.new(content: block))
|
|
820
|
+
end
|
|
821
|
+
nil
|
|
822
|
+
end
|
|
722
823
|
end
|
|
723
824
|
end
|
|
data/lib/pikuri/extractor/html.rb
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
require 'readability'
|
|
6
|
+
require 'reverse_markdown'
|
|
7
|
+
|
|
8
|
+
module Pikuri
|
|
9
|
+
module Extractor
|
|
10
|
+
# HTML → Markdown extractor.
|
|
11
|
+
#
|
|
12
|
+
# Matched by content-type only (+text/html+ /
|
|
13
|
+
# +application/xhtml+xml+) — deliberately no byte sniff. The web
|
|
14
|
+
# path always has the header; for local files a sniff would route
|
|
15
|
+
# +Workspace::Read+ of an +.html+ source file through readability
|
|
16
|
+
# extraction, when a developer reading an HTML file wants the
|
|
17
|
+
# source. Local HTML stays on the {Passthrough} arm until a
|
|
18
|
+
# consumer genuinely needs otherwise.
|
|
19
|
+
#
|
|
20
|
+
# Always renders both views of the page when available:
|
|
21
|
+
#
|
|
22
|
+
# 1. JSON-LD section. Any +<script type="application/ld+json">+ node
|
|
23
|
+
# whose +@type+ matches a substantive schema.org content type
|
|
24
|
+
# (Product, Article, Recipe, ...) is rendered as a header — title,
|
|
25
|
+
# metadata bullets (brand, SKU, price, rating, author, published),
|
|
26
|
+
# and the +articleBody+/+description+ copy when present.
|
|
27
|
+
# 2. Readability section. The page is run through +Readability+ +
|
|
28
|
+
# +reverse_markdown+, with a +<main>+/+<article>+ fallback for
|
|
29
|
+
# pages whose content sits mostly outside +<p>+ tags.
|
|
30
|
+
#
|
|
31
|
+
# Concatenated with a horizontal rule, so the LLM gets both the
|
|
32
|
+
# structured metadata and the rendered body and can pick whichever
|
|
33
|
+
# is more useful for the task. Trades some duplication (when a
|
|
34
|
+
# publisher embeds the article body in JSON-LD AND in HTML) for
|
|
35
|
+
# fewer type-based heuristics on which branch should win — the
|
|
36
|
+
# earlier "is this Article's +description+ a teaser or the real
|
|
37
|
+
# body?" carve-out is no longer needed because both end up in
|
|
38
|
+
# the output regardless.
|
|
39
|
+
module HTML
|
|
40
|
+
# @return [Array<String>] content-types this extractor claims.
|
|
41
|
+
CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
|
|
42
|
+
|
|
43
|
+
# @return [Array<String>] schema.org +@type+ values that we treat
|
|
44
|
+
# as "the primary entity of this page" when picking a JSON-LD
|
|
45
|
+
# node to render. Order does not matter — the first matching
|
|
46
|
+
# node wins. Skips noise nodes (Organization, BreadcrumbList,
|
|
47
|
+
# WebSite, ...) that ship on most pages but carry no page
|
|
48
|
+
# content.
|
|
49
|
+
INTERESTING_TYPES = %w[
|
|
50
|
+
Product Article NewsArticle BlogPosting Recipe Event Book Movie
|
|
51
|
+
].freeze
|
|
52
|
+
|
|
53
|
+
# @return [Array<String>] HTML tags preserved by the readability
|
|
54
|
+
# pass. Anything outside this list is stripped before Markdown
|
|
55
|
+
# conversion.
|
|
56
|
+
READABILITY_TAGS = %w[
|
|
57
|
+
h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
|
|
58
|
+
strong em b i br hr table thead tbody tr td th
|
|
59
|
+
].freeze
|
|
60
|
+
|
|
61
|
+
# @return [Array<String>] HTML attributes preserved by the
|
|
62
|
+
# readability pass; everything else (class, id, style, data-*)
|
|
63
|
+
# is dropped before Markdown conversion
|
|
64
|
+
READABILITY_ATTRS = %w[href src alt title].freeze
|
|
65
|
+
|
|
66
|
+
# @return [Float] minimum +<main>+/+<article>+ to Readability
|
|
67
|
+
# text-length ratio that triggers the semantic-container
|
|
68
|
+
# fallback in {.readability_to_markdown}. Picked low enough to
|
|
69
|
+
# catch the failure mode (Readability collapsing a page that
|
|
70
|
+
# uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
|
|
71
|
+
# ~5x) but high enough that pages where both produce
|
|
72
|
+
# comparable output keep Readability's noise filtering.
|
|
73
|
+
MAIN_FALLBACK_RATIO = 2.0
|
|
74
|
+
|
|
75
|
+
# @return [Integer] minimum text length the
|
|
76
|
+
# +<main>+/+<article>+ container must hold before the fallback
|
|
77
|
+
# in {.readability_to_markdown} can fire. Below this, the
|
|
78
|
+
# ratio comparison is dominated by noise and we'd swap on
|
|
79
|
+
# tiny pages where Readability is doing the right thing.
|
|
80
|
+
MAIN_FALLBACK_MIN_CHARS = 500
|
|
81
|
+
|
|
82
|
+
# @return [Symbol] {Page#kind} tag.
|
|
83
|
+
def self.kind
|
|
84
|
+
:html
|
|
85
|
+
end
|
|
86
|
+
|
|
87
|
+
# @param sample [String] leading bytes of the content (unused —
|
|
88
|
+
# see the no-sniff rationale in the module doc).
|
|
89
|
+
# @param content_type [String, nil] normalized content-type.
|
|
90
|
+
# @return [Boolean]
|
|
91
|
+
def self.matches?(sample:, content_type:)
|
|
92
|
+
CONTENT_TYPES.include?(content_type)
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
# Render the HTML document behind +io+ as Markdown by emitting
|
|
96
|
+
# both the JSON-LD section (when an interesting node is present)
|
|
97
|
+
# and the readability / +<main>+ section, joined by a horizontal
|
|
98
|
+
# rule. Either section may be missing — pages with no JSON-LD
|
|
99
|
+
# return only the readability output, and a malformed page with
|
|
100
|
+
# no extractable body returns only the JSON-LD render.
|
|
101
|
+
#
|
|
102
|
+
# @param io [IO, StringIO] IO over the HTML document.
|
|
103
|
+
# @return [String] Markdown representation
|
|
104
|
+
def self.extract(io)
|
|
105
|
+
html = io.read
|
|
106
|
+
sections = [jsonld_section(html), readability_to_markdown(html)]
|
|
107
|
+
sections.reject! { |s| s.nil? || s.strip.empty? }
|
|
108
|
+
sections.join("\n\n---\n\n")
|
|
109
|
+
end
|
|
110
|
+
|
|
111
|
+
# Pick the first JSON-LD node whose +@type+ matches one of
|
|
112
|
+
# {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
|
|
113
|
+
# when no such node exists, in which case {.extract} emits only
|
|
114
|
+
# the readability section.
|
|
115
|
+
#
|
|
116
|
+
# No content-field gating: a node carrying just +name+/+author+/
|
|
117
|
+
# +datePublished+ still renders (as a metadata-only header),
|
|
118
|
+
# because the readability pass independently produces the page
|
|
119
|
+
# body. That is the trade-off that lets us drop the type-based
|
|
120
|
+
# "is this teaser or article copy?" heuristics — duplication is
|
|
121
|
+
# acceptable when both views are available, and the LLM can
|
|
122
|
+
# pick whichever it needs.
|
|
123
|
+
#
|
|
124
|
+
# @param html [String] HTML document body
|
|
125
|
+
# @return [String, nil] Markdown render of the picked JSON-LD
|
|
126
|
+
# node, or +nil+ when nothing matched
|
|
127
|
+
def self.jsonld_section(html)
|
|
128
|
+
node = parse_jsonld(html).find do |n|
|
|
129
|
+
Array(n['@type']).any? { |t| INTERESTING_TYPES.include?(t) }
|
|
130
|
+
end
|
|
131
|
+
node ? jsonld_to_markdown(node) : nil
|
|
132
|
+
end
|
|
133
|
+
|
|
134
|
+
# Collect every JSON-LD payload embedded in +html+, flattening
|
|
135
|
+
# +@graph+ wrappers so callers see one flat array of schema.org
|
|
136
|
+
# nodes. Malformed JSON blocks are silently skipped — sites
|
|
137
|
+
# frequently ship broken JSON-LD and we only need at least one
|
|
138
|
+
# parseable block.
|
|
139
|
+
#
|
|
140
|
+
# @param html [String] HTML document body
|
|
141
|
+
# @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
|
|
142
|
+
def self.parse_jsonld(html)
|
|
143
|
+
doc = Nokogiri::HTML(html)
|
|
144
|
+
blobs = doc.css('script[type="application/ld+json"]').map(&:text)
|
|
145
|
+
|
|
146
|
+
blobs.flat_map do |raw|
|
|
147
|
+
parsed = begin
|
|
148
|
+
JSON.parse(raw)
|
|
149
|
+
rescue JSON::ParserError
|
|
150
|
+
nil
|
|
151
|
+
end
|
|
152
|
+
next [] unless parsed
|
|
153
|
+
|
|
154
|
+
nodes = parsed.is_a?(Array) ? parsed : [parsed]
|
|
155
|
+
nodes.flat_map { |n| n['@graph'].is_a?(Array) ? n['@graph'] : [n] }
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
# Render a single JSON-LD +node+ as Markdown: a top-level title
|
|
160
|
+
# from +name+/+headline+, a bullet list of common useful fields
|
|
161
|
+
# (brand, SKU, price, rating, author, published date, ...), the
|
|
162
|
+
# body copy, and the lead image.
|
|
163
|
+
#
|
|
164
|
+
# When the node carries +articleBody+ (the full publisher-supplied
|
|
165
|
+
# article text), that wins over +description+ — the description
|
|
166
|
+
# is typically a lede teaser and would just repeat the article's
|
|
167
|
+
# opening lines.
|
|
168
|
+
#
|
|
169
|
+
# @param node [Hash] JSON-LD node, typically picked by
|
|
170
|
+
# {.jsonld_section}
|
|
171
|
+
# @return [String] Markdown representation
|
|
172
|
+
def self.jsonld_to_markdown(node)
|
|
173
|
+
out = +''
|
|
174
|
+
name = node['name'] || node['headline']
|
|
175
|
+
out << "# #{name}\n\n" if name
|
|
176
|
+
|
|
177
|
+
offer = first_obj(node['offers'])
|
|
178
|
+
rating = first_obj(node['aggregateRating'])
|
|
179
|
+
brand = first_obj_or_string(node['brand'])
|
|
180
|
+
author = first_obj_or_string(node['author'])
|
|
181
|
+
|
|
182
|
+
brand_name = brand.is_a?(Hash) ? brand['name'] : brand
|
|
183
|
+
author_name = author.is_a?(Hash) ? author['name'] : author
|
|
184
|
+
|
|
185
|
+
fields = {
|
|
186
|
+
'Brand' => brand_name,
|
|
187
|
+
'SKU' => node['sku'],
|
|
188
|
+
'GTIN' => node['gtin13'] || node['gtin'],
|
|
189
|
+
'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
|
|
190
|
+
'Availability' => offer['availability'],
|
|
191
|
+
'Rating' => rating['ratingValue'],
|
|
192
|
+
'Reviews' => rating['reviewCount'],
|
|
193
|
+
'Author' => author_name,
|
|
194
|
+
'Published' => node['datePublished']
|
|
195
|
+
}.reject { |_, v| v.nil? || v.to_s.strip.empty? }
|
|
196
|
+
|
|
197
|
+
unless fields.empty?
|
|
198
|
+
fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
|
|
199
|
+
out << "\n"
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
if (body = node['articleBody'] || node['description'])
|
|
203
|
+
out << "#{body}\n\n"
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
if (img = node['image'])
|
|
207
|
+
img = img.first if img.is_a?(Array)
|
|
208
|
+
img = img['url'] if img.is_a?(Hash)
|
|
209
|
+
out << "![#{name}](#{img})\n\n" if img
|
|
210
|
+
end
|
|
211
|
+
|
|
212
|
+
out
|
|
213
|
+
end
|
|
214
|
+
|
|
215
|
+
# Run +Readability+ over +html+ to isolate the main content node,
|
|
216
|
+
# then convert that to Markdown via +reverse_markdown+. The page
|
|
217
|
+
# +<title>+ is rendered as a top-level heading.
|
|
218
|
+
#
|
|
219
|
+
# When the page uses semantic HTML5 (+<main>+ or +<article>+) but
|
|
220
|
+
# leaves most of its content outside +<p>+ tags — divs, lists,
|
|
221
|
+
# spans — Readability's paragraph-density scoring collapses the
|
|
222
|
+
# extraction to a sliver of the page. In that case we render the
|
|
223
|
+
# +<main>+/+<article>+ container directly. The fallback only
|
|
224
|
+
# fires when the container holds substantially more text than
|
|
225
|
+
# Readability picked up (see {MAIN_FALLBACK_RATIO} /
|
|
226
|
+
# {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
|
|
227
|
+
# Readability so its noise filtering still strips nav/ads/etc.
|
|
228
|
+
#
|
|
229
|
+
# @param html [String] HTML document body
|
|
230
|
+
# @return [String] Markdown representation
|
|
231
|
+
def self.readability_to_markdown(html)
|
|
232
|
+
rdoc = Readability::Document.new(
|
|
233
|
+
html,
|
|
234
|
+
tags: READABILITY_TAGS,
|
|
235
|
+
attributes: READABILITY_ATTRS,
|
|
236
|
+
remove_empty_nodes: true
|
|
237
|
+
)
|
|
238
|
+
readability_html = rdoc.content
|
|
239
|
+
title = rdoc.title
|
|
240
|
+
|
|
241
|
+
body_html = main_fallback_html(html, readability_html) || readability_html
|
|
242
|
+
body = ReverseMarkdown.convert(body_html, unknown_tags: :bypass, github_flavored: true)
|
|
243
|
+
|
|
244
|
+
out = +''
|
|
245
|
+
out << "# #{title.strip}\n\n" if title && !title.strip.empty?
|
|
246
|
+
out << body
|
|
247
|
+
out
|
|
248
|
+
end
|
|
249
|
+
|
|
250
|
+
# If +html+ has a +<main>+ or +<article>+ element holding
|
|
251
|
+
# substantially more text than Readability extracted, return that
|
|
252
|
+
# container's HTML so the caller can render it instead. Returns
|
|
253
|
+
# +nil+ when the fallback should not fire — when there is no
|
|
254
|
+
# semantic container, when it's too small to be meaningful, or
|
|
255
|
+
# when Readability's output is already comparable.
|
|
256
|
+
#
|
|
257
|
+
# @param html [String] full HTML document body, used to locate
|
|
258
|
+
# the +<main>+/+<article>+ container
|
|
259
|
+
# @param readability_html [String] HTML produced by
|
|
260
|
+
# +Readability::Document#content+, used as the comparison
|
|
261
|
+
# baseline
|
|
262
|
+
# @return [String, nil] container HTML when the fallback should
|
|
263
|
+
# fire, +nil+ otherwise
|
|
264
|
+
def self.main_fallback_html(html, readability_html)
|
|
265
|
+
doc = Nokogiri::HTML(html)
|
|
266
|
+
container = doc.at_css('main') || doc.at_css('article')
|
|
267
|
+
return nil unless container
|
|
268
|
+
|
|
269
|
+
container_text_len = container.text.gsub(/\s+/, ' ').strip.length
|
|
270
|
+
return nil if container_text_len < MAIN_FALLBACK_MIN_CHARS
|
|
271
|
+
|
|
272
|
+
readability_text_len = Nokogiri::HTML(readability_html).text.gsub(/\s+/, ' ').strip.length
|
|
273
|
+
return nil if container_text_len < MAIN_FALLBACK_RATIO * readability_text_len
|
|
274
|
+
|
|
275
|
+
container.to_html
|
|
276
|
+
end
|
|
277
|
+
private_class_method :main_fallback_html
|
|
278
|
+
|
|
279
|
+
# JSON-LD fields can be a string, hash, or array of either.
|
|
280
|
+
# Normalize to a single hash (the first one if it's a list) so
|
|
281
|
+
# callers can +.dig+ safely.
|
|
282
|
+
#
|
|
283
|
+
# @param value [Object] raw JSON-LD field value
|
|
284
|
+
# @return [Hash] empty hash when +value+ does not contain a hash
|
|
285
|
+
def self.first_obj(value)
|
|
286
|
+
value = value.first if value.is_a?(Array)
|
|
287
|
+
value.is_a?(Hash) ? value : {}
|
|
288
|
+
end
|
|
289
|
+
private_class_method :first_obj
|
|
290
|
+
|
|
291
|
+
# Same idea as {.first_obj} but preserves a bare string (e.g.
|
|
292
|
+
# +brand: "Apple"+) instead of replacing it with +{}+.
|
|
293
|
+
#
|
|
294
|
+
# @param value [Object] raw JSON-LD field value
|
|
295
|
+
# @return [String, Hash, nil]
|
|
296
|
+
def self.first_obj_or_string(value)
|
|
297
|
+
value = value.first if value.is_a?(Array)
|
|
298
|
+
value
|
|
299
|
+
end
|
|
300
|
+
private_class_method :first_obj_or_string
|
|
301
|
+
end
|
|
302
|
+
end
|
|
303
|
+
end
|
|
data/lib/pikuri/extractor/passthrough.rb
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
module Extractor
|
|
5
|
+
# The terminal plain-text arm of the registry: content that *is*
|
|
6
|
+
# already text needs no extraction, so it passes through verbatim
|
|
7
|
+
# (forced to UTF-8 — invalid bytes are left in for downstream to
|
|
8
|
+
# deal with, matching what +File.read+ with a UTF-8 encoding does).
|
|
9
|
+
# Markdown, source files, JSON, robots.txt all land here.
|
|
10
|
+
#
|
|
11
|
+
# Matching is split by whether the transport supplied a
|
|
12
|
+
# content-type:
|
|
13
|
+
#
|
|
14
|
+
# * With a content-type (the web path): claim +text/*+ only.
|
|
15
|
+
# A non-text type that no earlier extractor claimed is *not*
|
|
16
|
+
# second-guessed by sniffing — a server declaring
|
|
17
|
+
# +application/octet-stream+ gets the {Unsupported} refusal the
|
|
18
|
+
# LLM can react to, same as before this registry existed.
|
|
19
|
+
# * Without one (the local-file path, where {FileType.detect_mime}
|
|
20
|
+
# returned +nil+ for "unrecognised"): claim anything that passes
|
|
21
|
+
# the {FileType.binary?} heuristic on the sample. Opaque
|
|
22
|
+
# binaries stay unclaimed and surface as {Unsupported}.
|
|
23
|
+
module Passthrough
|
|
24
|
+
# @return [Symbol] {Page#kind} tag.
|
|
25
|
+
def self.kind
|
|
26
|
+
:text
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
# @param sample [String] leading bytes of the content.
|
|
30
|
+
# @param content_type [String, nil] normalized content-type,
|
|
31
|
+
# +nil+ when the transport has none.
|
|
32
|
+
# @return [Boolean]
|
|
33
|
+
def self.matches?(sample:, content_type:)
|
|
34
|
+
return content_type.start_with?('text/') unless content_type.nil?
|
|
35
|
+
|
|
36
|
+
!FileType.binary?(sample)
|
|
37
|
+
end
|
|
38
|
+
|
|
39
|
+
# @param io [IO, StringIO] IO over the text content.
|
|
40
|
+
# @return [String] the content, tagged UTF-8. Deliberately NOT
|
|
41
|
+
# derived from {.extract_lines} — a passthrough must stay
|
|
42
|
+
# verbatim (trailing newline, CRLF line endings), which a
|
|
43
|
+
# join of chomped lines would silently normalize away.
|
|
44
|
+
def self.extract(io)
|
|
45
|
+
io.read.force_encoding(Encoding::UTF_8)
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# The lazy line stream for {Extractor.extract_paged}: the IO is
|
|
49
|
+
# read line-by-line, so a window over the head of a gigabyte
|
|
50
|
+
# log never loads the rest. Consuming the whole stream is a
|
|
51
|
+
# cheap sequential read — which is why the paging window counts
|
|
52
|
+
# this stream's tail for an exact +total_lines+ (see
|
|
53
|
+
# {Extractor.extract_paged}).
|
|
54
|
+
#
|
|
55
|
+
# @param io [IO, StringIO] IO over the text content; must
|
|
56
|
+
# remain open while the enumerator is consumed.
|
|
57
|
+
# @return [Enumerator::Lazy<String>] chomped lines, tagged
|
|
58
|
+
# UTF-8.
|
|
59
|
+
def self.extract_lines(io)
|
|
60
|
+
io.each_line.lazy.map { |raw| raw.chomp.force_encoding(Encoding::UTF_8) }
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|