pikuri-core 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/pikuri/agent/listener/terminal.rb +18 -36
- data/lib/pikuri/extractor/html.rb +303 -0
- data/lib/pikuri/extractor/passthrough.rb +64 -0
- data/lib/pikuri/extractor.rb +314 -0
- data/lib/pikuri/file_type.rb +74 -266
- data/lib/pikuri/subprocess.rb +73 -2
- data/lib/pikuri/tool/calculator.rb +213 -41
- data/lib/pikuri/tool/fetch.rb +10 -9
- data/lib/pikuri/tool/scraper.rb +186 -0
- data/lib/pikuri/tool/web_scrape.rb +5 -5
- data/lib/pikuri/version.rb +1 -1
- data/lib/pikuri-core.rb +0 -1
- metadata +5 -61
- data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
- data/lib/pikuri/tool/scraper/html.rb +0 -285
- data/lib/pikuri/tool/scraper/pdf.rb +0 -54
- data/lib/pikuri/tool/scraper/simple.rb +0 -183
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ac822a7bd46228f2eea2994c2e1428e3aa90c269e6ebafd603474fb630ba34ee
|
|
4
|
+
data.tar.gz: 00c69d139bc38c1a881bf87980970672517db51f59469aef266009a803a874db
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d148d78b2027d747ef10f4dd7a19252f66bb1b5f99e8eef763149bf3e93ae608a3f7cf739b02ed4d92ab3ab39d8fe6888615a5acaf90dcb44ca783058a95a716
|
|
7
|
+
data.tar.gz: 32ef75bbd6d825970e5a1e6b5e27cf0fc812980e87e5ddd59b14f2c10772c91cf93003c2ebe2c9501f5e9682e726f77c4c387c7c83b467b9aa57dc91a9a178f0
|
|
@@ -1,21 +1,32 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
3
|
require 'rainbow'
|
|
4
|
-
require 'tty-markdown'
|
|
5
4
|
|
|
6
5
|
module Pikuri
|
|
7
6
|
class Agent
|
|
8
7
|
module Listener
|
|
9
8
|
# Terminal renderer for the normalized event stream: dim grey
|
|
10
|
-
# reasoning,
|
|
11
|
-
# call and tool-result lines, yellow fallback
|
|
12
|
-
# cancelled notice. An {Event::SystemInjected} block (recalled
|
|
9
|
+
# reasoning, assistant content printed raw (Markdown as-is),
|
|
10
|
+
# cyan tool-call and tool-result lines, yellow fallback
|
|
11
|
+
# notice, red cancelled notice. An {Event::SystemInjected} block (recalled
|
|
13
12
|
# memory / context an extension injected) renders dim grey
|
|
14
13
|
# with a +⊕+ marker. {Event::UserTurn} is intentionally silent
|
|
15
14
|
# (the terminal user just typed the message, so re-rendering
|
|
16
15
|
# it adds nothing); {Event::Tokens} and {Event::ContextCap}
|
|
17
16
|
# are silent too (their consumer is {TokenLog}).
|
|
18
17
|
#
|
|
18
|
+
# Assistant Markdown deliberately prints raw, with no
|
|
19
|
+
# Markdown-to-ANSI rendering. A renderer (+tty-markdown+)
|
|
20
|
+
# used to sit on the non-streaming path; it was dropped:
|
|
21
|
+
# rendering can never apply to the streaming path anyway
|
|
22
|
+
# (half-finished Markdown — broken code fences, half-built
|
|
23
|
+
# tables — doesn't render), the gem hadn't shipped a release
|
|
24
|
+
# since 2023 (its known ANSI-in-table crashes forced a
|
|
25
|
+
# rescue-and-degrade carve-out here), and it pulled seven
|
|
26
|
+
# transitive gems into the audit surface. Raw Markdown is
|
|
27
|
+
# perfectly readable in a terminal; proper rendering belongs
|
|
28
|
+
# to a richer host (the planned pikuri-tui).
|
|
29
|
+
#
|
|
19
30
|
# Optionally prepends a fixed number of leading spaces to
|
|
20
31
|
# every rendered line via the +padding:+ kwarg. Sub-agents
|
|
21
32
|
# get a fresh padded instance through {#for_sub_agent}
|
|
@@ -30,11 +41,8 @@ module Pikuri
|
|
|
30
41
|
# - {Event::ThinkingDelta} fragments print live in the same
|
|
31
42
|
# dim grey as the non-streaming {Event::Thinking}, with no
|
|
32
43
|
# trailing newline so the next fragment continues the line.
|
|
33
|
-
# - {Event::AssistantDelta} fragments print live
|
|
34
|
-
#
|
|
35
|
-
# Markdown (broken code blocks, half-rendered tables), so
|
|
36
|
-
# the live stream gives up formatting in exchange for
|
|
37
|
-
# liveness.
|
|
44
|
+
# - {Event::AssistantDelta} fragments print live the same
|
|
45
|
+
# way, uncolored.
|
|
38
46
|
# - {Event::Thinking} and {Event::Assistant} bookends print
|
|
39
47
|
# a single blank line as a stream terminator, not their
|
|
40
48
|
# content (the content already landed via the deltas). The
|
|
@@ -45,15 +53,6 @@ module Pikuri
|
|
|
45
53
|
# the deltas are silently ignored and the bookend events
|
|
46
54
|
# render the full text the way they always have.
|
|
47
55
|
class Terminal < Base
|
|
48
|
-
# Subsystem logger; set its level with +PIKURI_LOG_TERMINAL+
|
|
49
|
-
# or the global +PIKURI_LOG+. Used for the narrow rescue
|
|
50
|
-
# around third-party rendering (+tty-markdown+ choking on
|
|
51
|
-
# assistant output) — see the CLAUDE.md "secondary to the
|
|
52
|
-
# loop" carve-out.
|
|
53
|
-
#
|
|
54
|
-
# @return [Logger]
|
|
55
|
-
LOGGER = Pikuri.logger_for('Terminal')
|
|
56
|
-
|
|
57
56
|
# Cap, in characters, applied to tool-result content
|
|
58
57
|
# rendered to the terminal. Anything longer is truncated
|
|
59
58
|
# with a marker that reports the original byte size so the
|
|
@@ -119,7 +118,7 @@ module Pikuri
|
|
|
119
118
|
if @streaming
|
|
120
119
|
terminate_stream
|
|
121
120
|
else
|
|
122
|
-
println(indent(render_markdown(content)))
|
|
121
|
+
println(indent(content))
|
|
123
122
|
end
|
|
124
123
|
in Event::ThinkingDelta(content:)
|
|
125
124
|
stream_fragment(Rainbow(content).color(85, 85, 85)) if @streaming
|
|
@@ -228,23 +227,6 @@ module Pikuri
|
|
|
228
227
|
text.to_s.each_line.map { |line| prefix + line }.join
|
|
229
228
|
end
|
|
230
229
|
|
|
231
|
-
# Render assistant Markdown for the terminal, degrading to
|
|
232
|
-
# the raw string when the renderer raises. tty-markdown /
|
|
233
|
-
# strings have known bugs around ANSI inside tables (e.g.
|
|
234
|
-
# +Strings::Wrap.insert_ansi+ raising +IndexError+); we'd
|
|
235
|
-
# rather show ugly Markdown than abort an in-flight
|
|
236
|
-
# conversation.
|
|
237
|
-
#
|
|
238
|
-
# @param content [String] assistant Markdown
|
|
239
|
-
# @return [String] rendered ANSI text, or +content+
|
|
240
|
-
# unchanged on render failure
|
|
241
|
-
def render_markdown(content)
|
|
242
|
-
TTY::Markdown.parse(content)
|
|
243
|
-
rescue StandardError => e
|
|
244
|
-
LOGGER.warn("TTY::Markdown render failed (#{e.class}: #{e.message}); falling back to raw text")
|
|
245
|
-
content
|
|
246
|
-
end
|
|
247
|
-
|
|
248
230
|
# Flatten whitespace and cap to {MAX_TOOL_RESULT_CHARS}. The
|
|
249
231
|
# cap keeps multi-screen dumps (rendered HTML, PDF text)
|
|
250
232
|
# from drowning the terminal stream; the byte-count suffix
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
require 'nokogiri'
|
|
5
|
+
require 'readability'
|
|
6
|
+
require 'reverse_markdown'
|
|
7
|
+
|
|
8
|
+
module Pikuri
|
|
9
|
+
module Extractor
|
|
10
|
+
# HTML → Markdown extractor.
|
|
11
|
+
#
|
|
12
|
+
# Matched by content-type only (+text/html+ /
|
|
13
|
+
# +application/xhtml+xml+) — deliberately no byte sniff. The web
|
|
14
|
+
# path always has the header; for local files a sniff would route
|
|
15
|
+
# +Workspace::Read+ of an +.html+ source file through readability
|
|
16
|
+
# extraction, when a developer reading an HTML file wants the
|
|
17
|
+
# source. Local HTML stays on the {Passthrough} arm until a
|
|
18
|
+
# consumer genuinely needs otherwise.
|
|
19
|
+
#
|
|
20
|
+
# Always renders both views of the page when available:
|
|
21
|
+
#
|
|
22
|
+
# 1. JSON-LD section. Any +<script type="application/ld+json">+ node
|
|
23
|
+
# whose +@type+ matches a substantive schema.org content type
|
|
24
|
+
# (Product, Article, Recipe, ...) is rendered as a header — title,
|
|
25
|
+
# metadata bullets (brand, SKU, price, rating, author, published),
|
|
26
|
+
# and the +articleBody+/+description+ copy when present.
|
|
27
|
+
# 2. Readability section. The page is run through +Readability+ +
|
|
28
|
+
# +reverse_markdown+, with a +<main>+/+<article>+ fallback for
|
|
29
|
+
# pages whose content sits mostly outside +<p>+ tags.
|
|
30
|
+
#
|
|
31
|
+
# Concatenated with a horizontal rule, so the LLM gets both the
|
|
32
|
+
# structured metadata and the rendered body and can pick whichever
|
|
33
|
+
# is more useful for the task. Trades some duplication (when a
|
|
34
|
+
# publisher embeds the article body in JSON-LD AND in HTML) for
|
|
35
|
+
# fewer type-based heuristics on which branch should win — the
|
|
36
|
+
# earlier "is this Article's +description+ a teaser or the real
|
|
37
|
+
# body?" carve-out is no longer needed because both end up in
|
|
38
|
+
# the output regardless.
|
|
39
|
+
module HTML
|
|
40
|
+
# @return [Array<String>] content-types this extractor claims.
|
|
41
|
+
CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
|
|
42
|
+
|
|
43
|
+
# @return [Array<String>] schema.org +@type+ values that we treat
|
|
44
|
+
# as "the primary entity of this page" when picking a JSON-LD
|
|
45
|
+
# node to render. Order does not matter — the first matching
|
|
46
|
+
# node wins. Skips noise nodes (Organization, BreadcrumbList,
|
|
47
|
+
# WebSite, ...) that ship on most pages but carry no page
|
|
48
|
+
# content.
|
|
49
|
+
INTERESTING_TYPES = %w[
|
|
50
|
+
Product Article NewsArticle BlogPosting Recipe Event Book Movie
|
|
51
|
+
].freeze
|
|
52
|
+
|
|
53
|
+
# @return [Array<String>] HTML tags preserved by the readability
|
|
54
|
+
# pass. Anything outside this list is stripped before Markdown
|
|
55
|
+
# conversion.
|
|
56
|
+
READABILITY_TAGS = %w[
|
|
57
|
+
h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
|
|
58
|
+
strong em b i br hr table thead tbody tr td th
|
|
59
|
+
].freeze
|
|
60
|
+
|
|
61
|
+
# @return [Array<String>] HTML attributes preserved by the
|
|
62
|
+
# readability pass; everything else (class, id, style, data-*)
|
|
63
|
+
# is dropped before Markdown conversion
|
|
64
|
+
READABILITY_ATTRS = %w[href src alt title].freeze
|
|
65
|
+
|
|
66
|
+
# @return [Float] minimum +<main>+/+<article>+ to Readability
|
|
67
|
+
# text-length ratio that triggers the semantic-container
|
|
68
|
+
# fallback in {.readability_to_markdown}. Picked low enough to
|
|
69
|
+
# catch the failure mode (Readability collapsing a page that
|
|
70
|
+
# uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
|
|
71
|
+
# ~5x) but high enough that pages where both produce
|
|
72
|
+
# comparable output keep Readability's noise filtering.
|
|
73
|
+
MAIN_FALLBACK_RATIO = 2.0
|
|
74
|
+
|
|
75
|
+
# @return [Integer] minimum text length the
|
|
76
|
+
# +<main>+/+<article>+ container must hold before the fallback
|
|
77
|
+
# in {.readability_to_markdown} can fire. Below this, the
|
|
78
|
+
# ratio comparison is dominated by noise and we'd swap on
|
|
79
|
+
# tiny pages where Readability is doing the right thing.
|
|
80
|
+
MAIN_FALLBACK_MIN_CHARS = 500
|
|
81
|
+
|
|
82
|
+
# @return [Symbol] {Page#kind} tag.
|
|
83
|
+
def self.kind
  # Tag reported for content handled by the HTML extractor.
  :html
end
|
|
86
|
+
|
|
87
|
+
# @param sample [String] leading bytes of the content (unused —
|
|
88
|
+
# see the no-sniff rationale in the module doc).
|
|
89
|
+
# @param content_type [String, nil] normalized content-type.
|
|
90
|
+
# @return [Boolean]
|
|
91
|
+
def self.matches?(sample:, content_type:)
  # +sample+ is accepted for interface parity with the other extractors
  # but never inspected: the decision is made on the content-type alone.
  CONTENT_TYPES.any? { |claimed| claimed == content_type }
end
|
|
94
|
+
|
|
95
|
+
# Render the HTML document behind +io+ as Markdown by emitting
|
|
96
|
+
# both the JSON-LD section (when an interesting node is present)
|
|
97
|
+
# and the readability / +<main>+ section, joined by a horizontal
|
|
98
|
+
# rule. Either section may be missing — pages with no JSON-LD
|
|
99
|
+
# return only the readability output, and a malformed page with
|
|
100
|
+
# no extractable body returns only the JSON-LD render.
|
|
101
|
+
#
|
|
102
|
+
# @param io [IO, StringIO] IO over the HTML document.
|
|
103
|
+
# @return [String] Markdown representation
|
|
104
|
+
def self.extract(io)
  document = io.read

  # JSON-LD first, readability second; a section that is nil or
  # whitespace-only is dropped so no dangling rule is emitted.
  [jsonld_section(document), readability_to_markdown(document)]
    .reject { |part| part.nil? || part.strip.empty? }
    .join("\n\n---\n\n")
end
|
|
110
|
+
|
|
111
|
+
# Pick the first JSON-LD node whose +@type+ matches one of
|
|
112
|
+
# {INTERESTING_TYPES} and render it as Markdown. Returns +nil+
|
|
113
|
+
# when no such node exists, in which case {.extract} emits only
|
|
114
|
+
# the readability section.
|
|
115
|
+
#
|
|
116
|
+
# No content-field gating: a node carrying just +name+/+author+/
|
|
117
|
+
# +datePublished+ still renders (as a metadata-only header),
|
|
118
|
+
# because the readability pass independently produces the page
|
|
119
|
+
# body. That is the trade-off that lets us drop the type-based
|
|
120
|
+
# "is this teaser or article copy?" heuristics — duplication is
|
|
121
|
+
# acceptable when both views are available, and the LLM can
|
|
122
|
+
# pick whichever it needs.
|
|
123
|
+
#
|
|
124
|
+
# @param html [String] HTML document body
|
|
125
|
+
# @return [String, nil] Markdown render of the picked JSON-LD
|
|
126
|
+
# node, or +nil+ when nothing matched
|
|
127
|
+
def self.jsonld_section(html)
  primary = parse_jsonld(html).find do |candidate|
    # +@type+ may be a single string or a list of strings.
    declared_types = Array(candidate['@type'])
    declared_types.any? { |type| INTERESTING_TYPES.include?(type) }
  end
  return nil unless primary

  jsonld_to_markdown(primary)
end
|
|
133
|
+
|
|
134
|
+
# Collect every JSON-LD payload embedded in +html+, flattening
|
|
135
|
+
# +@graph+ wrappers so callers see one flat array of schema.org
|
|
136
|
+
# nodes. Malformed JSON blocks are silently skipped — sites
|
|
137
|
+
# frequently ship broken JSON-LD and we only need at least one
|
|
138
|
+
# parseable block.
|
|
139
|
+
#
|
|
140
|
+
# @param html [String] HTML document body
|
|
141
|
+
# @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
|
|
142
|
+
def self.parse_jsonld(html)
  doc = Nokogiri::HTML(html)
  blobs = doc.css('script[type="application/ld+json"]').map(&:text)

  blobs.flat_map do |raw|
    parsed = begin
      JSON.parse(raw)
    rescue JSON::ParserError
      # Broken JSON-LD is common in the wild; skip the block.
      nil
    end
    next [] unless parsed

    # A payload is either one node or an array of nodes. Anything that is
    # not a Hash (bare numbers, strings, nested arrays) cannot be a
    # schema.org node, and indexing it with a String key would raise
    # TypeError, so it is dropped here to honour the Array<Hash> contract.
    nodes = parsed.is_a?(Array) ? parsed : [parsed]
    nodes.grep(Hash).flat_map do |node|
      graph = node['@graph']
      # Flatten an +@graph+ wrapper into its member nodes, again keeping
      # only Hash entries.
      graph.is_a?(Array) ? graph.grep(Hash) : [node]
    end
  end
end
|
|
158
|
+
|
|
159
|
+
# Render a single JSON-LD +node+ as Markdown: a top-level title
|
|
160
|
+
# from +name+/+headline+, a bullet list of common useful fields
|
|
161
|
+
# (brand, SKU, price, rating, author, published date, ...), the
|
|
162
|
+
# body copy, and the lead image.
|
|
163
|
+
#
|
|
164
|
+
# When the node carries +articleBody+ (the full publisher-supplied
|
|
165
|
+
# article text), that wins over +description+ — the description
|
|
166
|
+
# is typically a lede teaser and would just repeat the article's
|
|
167
|
+
# opening lines.
|
|
168
|
+
#
|
|
169
|
+
# @param node [Hash] JSON-LD node, typically picked by
|
|
170
|
+
# {.jsonld_section}
|
|
171
|
+
# @return [String] Markdown representation
|
|
172
|
+
def self.jsonld_to_markdown(node)
  out = +''
  name = node['name'] || node['headline']
  out << "# #{name}\n\n" if name

  # Each of these fields may be a hash, a list, or (brand/author) a
  # bare string; the helpers normalize them to a single value.
  offer = first_obj(node['offers'])
  rating = first_obj(node['aggregateRating'])
  brand = first_obj_or_string(node['brand'])
  author = first_obj_or_string(node['author'])

  brand_name = brand.is_a?(Hash) ? brand['name'] : brand
  author_name = author.is_a?(Hash) ? author['name'] : author

  # Metadata bullets; entries whose value is nil or blank are dropped.
  fields = {
    'Brand' => brand_name,
    'SKU' => node['sku'],
    'GTIN' => node['gtin13'] || node['gtin'],
    'Price' => [offer['price'], offer['priceCurrency']].compact.join(' '),
    'Availability' => offer['availability'],
    'Rating' => rating['ratingValue'],
    'Reviews' => rating['reviewCount'],
    'Author' => author_name,
    'Published' => node['datePublished']
  }.reject { |_, v| v.nil? || v.to_s.strip.empty? }

  unless fields.empty?
    fields.each { |k, v| out << "- **#{k}:** #{v}\n" }
    out << "\n"
  end

  # +articleBody+ wins over +description+ when both are present.
  if (body = node['articleBody'] || node['description'])
    out << "#{body}\n\n"
  end

  if (img = node['image'])
    # +image+ may be a URL string, an object with a +url+ key, or a list
    # of either; only the first entry is used.
    img = img.first if img.is_a?(Array)
    img = img['url'] if img.is_a?(Hash)
    # Emit the lead image as a Markdown image (alt text is the title when
    # there is one) instead of a bare blank line; skip blank URLs.
    out << "![#{name}](#{img})\n\n" unless img.nil? || img.to_s.strip.empty?
  end

  out
end
|
|
214
|
+
|
|
215
|
+
# Run +Readability+ over +html+ to isolate the main content node,
|
|
216
|
+
# then convert that to Markdown via +reverse_markdown+. The page
|
|
217
|
+
# +<title>+ is rendered as a top-level heading.
|
|
218
|
+
#
|
|
219
|
+
# When the page uses semantic HTML5 (+<main>+ or +<article>+) but
|
|
220
|
+
# leaves most of its content outside +<p>+ tags — divs, lists,
|
|
221
|
+
# spans — Readability's paragraph-density scoring collapses the
|
|
222
|
+
# extraction to a sliver of the page. In that case we render the
|
|
223
|
+
# +<main>+/+<article>+ container directly. The fallback only
|
|
224
|
+
# fires when the container holds substantially more text than
|
|
225
|
+
# Readability picked up (see {MAIN_FALLBACK_RATIO} /
|
|
226
|
+
# {MAIN_FALLBACK_MIN_CHARS}); on pages where both agree we keep
|
|
227
|
+
# Readability so its noise filtering still strips nav/ads/etc.
|
|
228
|
+
#
|
|
229
|
+
# @param html [String] HTML document body
|
|
230
|
+
# @return [String] Markdown representation
|
|
231
|
+
def self.readability_to_markdown(html)
  document = Readability::Document.new(
    html,
    tags: READABILITY_TAGS,
    attributes: READABILITY_ATTRS,
    remove_empty_nodes: true
  )
  extracted_html = document.content
  heading = document.title

  # Prefer the <main>/<article> container when the fallback decides
  # Readability's extraction is too thin; otherwise keep Readability.
  source_html = main_fallback_html(html, extracted_html) || extracted_html
  markdown = ReverseMarkdown.convert(source_html, unknown_tags: :bypass, github_flavored: true)

  rendered = +''
  # The page title becomes a top-level heading unless it is missing or blank.
  rendered << "# #{heading.strip}\n\n" unless heading.nil? || heading.strip.empty?
  rendered << markdown
  rendered
end
|
|
249
|
+
|
|
250
|
+
# If +html+ has a +<main>+ or +<article>+ element holding
|
|
251
|
+
# substantially more text than Readability extracted, return that
|
|
252
|
+
# container's HTML so the caller can render it instead. Returns
|
|
253
|
+
# +nil+ when the fallback should not fire — when there is no
|
|
254
|
+
# semantic container, when it's too small to be meaningful, or
|
|
255
|
+
# when Readability's output is already comparable.
|
|
256
|
+
#
|
|
257
|
+
# @param html [String] full HTML document body, used to locate
|
|
258
|
+
# the +<main>+/+<article>+ container
|
|
259
|
+
# @param readability_html [String] HTML produced by
|
|
260
|
+
# +Readability::Document#content+, used as the comparison
|
|
261
|
+
# baseline
|
|
262
|
+
# @return [String, nil] container HTML when the fallback should
|
|
263
|
+
# fire, +nil+ otherwise
|
|
264
|
+
def self.main_fallback_html(html, readability_html)
  # Text length after collapsing whitespace runs and trimming the ends.
  squeezed_length = ->(text) { text.gsub(/\s+/, ' ').strip.length }

  page = Nokogiri::HTML(html)
  semantic = page.at_css('main') || page.at_css('article')
  return nil if semantic.nil?

  semantic_chars = squeezed_length.call(semantic.text)
  return nil if semantic_chars < MAIN_FALLBACK_MIN_CHARS

  # Only swap when the container holds at least MAIN_FALLBACK_RATIO times
  # the text Readability extracted.
  baseline_chars = squeezed_length.call(Nokogiri::HTML(readability_html).text)
  return nil if semantic_chars < MAIN_FALLBACK_RATIO * baseline_chars

  semantic.to_html
end
|
|
277
|
+
private_class_method :main_fallback_html
|
|
278
|
+
|
|
279
|
+
# JSON-LD fields can be a string, hash, or array of either.
|
|
280
|
+
# Normalize to a single hash (the first one if it's a list) so
|
|
281
|
+
# callers can +.dig+ safely.
|
|
282
|
+
#
|
|
283
|
+
# @param value [Object] raw JSON-LD field value
|
|
284
|
+
# @return [Hash] empty hash when +value+ does not contain a hash
|
|
285
|
+
def self.first_obj(value)
  # A list contributes only its first entry.
  candidate = value.is_a?(Array) ? value.first : value
  return candidate if candidate.is_a?(Hash)

  {}
end
|
|
289
|
+
private_class_method :first_obj
|
|
290
|
+
|
|
291
|
+
# Same idea as {.first_obj} but preserves a bare string (e.g.
|
|
292
|
+
# +brand: "Apple"+) instead of replacing it with +{}+.
|
|
293
|
+
#
|
|
294
|
+
# @param value [Object] raw JSON-LD field value
|
|
295
|
+
# @return [String, Hash, nil]
|
|
296
|
+
def self.first_obj_or_string(value)
  # Non-list values (string, hash, nil) pass through untouched.
  return value unless value.is_a?(Array)

  value.first
end
|
|
300
|
+
private_class_method :first_obj_or_string
|
|
301
|
+
end
|
|
302
|
+
end
|
|
303
|
+
end
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Pikuri
|
|
4
|
+
module Extractor
|
|
5
|
+
# The terminal plain-text arm of the registry: content that *is*
|
|
6
|
+
# already text needs no extraction, so it passes through verbatim
|
|
7
|
+
# (forced to UTF-8 — invalid bytes are left in for downstream to
|
|
8
|
+
# deal with, matching what +File.read+ with a UTF-8 encoding does).
|
|
9
|
+
# Markdown, source files, JSON, robots.txt all land here.
|
|
10
|
+
#
|
|
11
|
+
# Matching is split by whether the transport supplied a
|
|
12
|
+
# content-type:
|
|
13
|
+
#
|
|
14
|
+
# * With a content-type (the web path): claim +text/*+ only.
|
|
15
|
+
# A non-text type that no earlier extractor claimed is *not*
|
|
16
|
+
# second-guessed by sniffing — a server declaring
|
|
17
|
+
# +application/octet-stream+ gets the {Unsupported} refusal the
|
|
18
|
+
# LLM can react to, same as before this registry existed.
|
|
19
|
+
# * Without one (the local-file path, where {FileType.detect_mime}
|
|
20
|
+
# returned +nil+ for "unrecognised"): claim anything that passes
|
|
21
|
+
# the {FileType.binary?} heuristic on the sample. Opaque
|
|
22
|
+
# binaries stay unclaimed and surface as {Unsupported}.
|
|
23
|
+
module Passthrough
|
|
24
|
+
# @return [Symbol] {Page#kind} tag.
|
|
25
|
+
def self.kind
  # Tag reported for content that passes through as plain text.
  :text
end
|
|
28
|
+
|
|
29
|
+
# @param sample [String] leading bytes of the content.
|
|
30
|
+
# @param content_type [String, nil] normalized content-type,
|
|
31
|
+
# +nil+ when the transport has none.
|
|
32
|
+
# @return [Boolean]
|
|
33
|
+
def self.matches?(sample:, content_type:)
  if content_type.nil?
    # No declared type: fall back to the binary heuristic on the sample.
    !FileType.binary?(sample)
  else
    # A declared type is trusted as-is; only text/* is claimed.
    content_type.start_with?('text/')
  end
end
|
|
38
|
+
|
|
39
|
+
# @param io [IO, StringIO] IO over the text content.
|
|
40
|
+
# @return [String] the content, tagged UTF-8. Deliberately NOT
|
|
41
|
+
# derived from {.extract_lines} — a passthrough must stay
|
|
42
|
+
# verbatim (trailing newline, CRLF line endings), which a
|
|
43
|
+
# join of chomped lines would silently normalize away.
|
|
44
|
+
def self.extract(io)
  # Read everything and retag it as UTF-8 without transcoding, so the
  # bytes (CRLF endings, trailing newline) stay exactly as they were.
  verbatim = io.read
  verbatim.force_encoding(Encoding::UTF_8)
end
|
|
47
|
+
|
|
48
|
+
# The lazy line stream for {Extractor.extract_paged}: the IO is
|
|
49
|
+
# read line-by-line, so a window over the head of a gigabyte
|
|
50
|
+
# log never loads the rest. Consuming the whole stream is a
|
|
51
|
+
# cheap sequential read — which is why the paging window counts
|
|
52
|
+
# this stream's tail for an exact +total_lines+ (see
|
|
53
|
+
# {Extractor.extract_paged}).
|
|
54
|
+
#
|
|
55
|
+
# @param io [IO, StringIO] IO over the text content; must
|
|
56
|
+
# remain open while the enumerator is consumed.
|
|
57
|
+
# @return [Enumerator::Lazy<String>] chomped lines, tagged
|
|
58
|
+
# UTF-8.
|
|
59
|
+
def self.extract_lines(io)
  # Lazy, so only the lines a caller actually consumes are read from +io+.
  io.each_line.lazy.map do |line|
    line.chomp.force_encoding(Encoding::UTF_8)
  end
end
|
|
62
|
+
end
|
|
63
|
+
end
|
|
64
|
+
end
|