pikuri-core 0.0.5 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +4 -4
  2. data/README.md +5 -3
  3. data/lib/pikuri/agent/chat_transport.rb +135 -11
  4. data/lib/pikuri/agent/configurator.rb +4 -4
  5. data/lib/pikuri/agent/context_window_detector.rb +103 -52
  6. data/lib/pikuri/agent/control/step_limit.rb +39 -7
  7. data/lib/pikuri/agent/event.rb +43 -16
  8. data/lib/pikuri/agent/extension.rb +31 -17
  9. data/lib/pikuri/agent/extension_context.rb +147 -0
  10. data/lib/pikuri/agent/listener/terminal.rb +30 -37
  11. data/lib/pikuri/agent/listener/token_log.rb +60 -13
  12. data/lib/pikuri/agent/listener.rb +12 -5
  13. data/lib/pikuri/agent/listener_list.rb +7 -17
  14. data/lib/pikuri/agent/synthesizer.rb +93 -67
  15. data/lib/pikuri/agent.rb +358 -403
  16. data/lib/pikuri/extractor/html.rb +303 -0
  17. data/lib/pikuri/extractor/passthrough.rb +64 -0
  18. data/lib/pikuri/extractor.rb +314 -0
  19. data/lib/pikuri/file_type.rb +74 -266
  20. data/lib/pikuri/sanitizer.rb +179 -0
  21. data/lib/pikuri/subprocess.rb +73 -2
  22. data/lib/pikuri/tool/calculator.rb +213 -41
  23. data/lib/pikuri/tool/fetch.rb +10 -9
  24. data/lib/pikuri/tool/parameters.rb +65 -2
  25. data/lib/pikuri/tool/scraper.rb +186 -0
  26. data/lib/pikuri/tool/search/brave.rb +32 -18
  27. data/lib/pikuri/tool/search/duckduckgo.rb +18 -7
  28. data/lib/pikuri/tool/search/engines.rb +72 -49
  29. data/lib/pikuri/tool/search/exa.rb +34 -22
  30. data/lib/pikuri/tool/web_scrape.rb +5 -5
  31. data/lib/pikuri/tool/web_search.rb +45 -26
  32. data/lib/pikuri/version.rb +1 -1
  33. data/lib/pikuri-core.rb +11 -10
  34. metadata +9 -66
  35. data/lib/pikuri/tool/scraper/fetch_error.rb +0 -16
  36. data/lib/pikuri/tool/scraper/html.rb +0 -285
  37. data/lib/pikuri/tool/scraper/pdf.rb +0 -54
  38. data/lib/pikuri/tool/scraper/simple.rb +0 -183
@@ -0,0 +1,303 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'nokogiri'
5
+ require 'readability'
6
+ require 'reverse_markdown'
7
+
8
+ module Pikuri
9
+ module Extractor
10
+ # HTML → Markdown extractor.
11
+ #
12
+ # Matched by content-type only (+text/html+ /
13
+ # +application/xhtml+xml+) — deliberately no byte sniff. The web
14
+ # path always has the header; for local files a sniff would route
15
+ # +Workspace::Read+ of an +.html+ source file through readability
16
+ # extraction, when a developer reading an HTML file wants the
17
+ # source. Local HTML stays on the {Passthrough} arm until a
18
+ # consumer genuinely needs otherwise.
19
+ #
20
+ # Always renders both views of the page when available:
21
+ #
22
+ # 1. JSON-LD section. Any +<script type="application/ld+json">+ node
23
+ # whose +@type+ matches a substantive schema.org content type
24
+ # (Product, Article, Recipe, ...) is rendered as a header — title,
25
+ # metadata bullets (brand, SKU, price, rating, author, published),
26
+ # and the +articleBody+/+description+ copy when present.
27
+ # 2. Readability section. The page is run through +Readability+ +
28
+ # +reverse_markdown+, with a +<main>+/+<article>+ fallback for
29
+ # pages whose content sits mostly outside +<p>+ tags.
30
+ #
31
+ # Concatenated with a horizontal rule, so the LLM gets both the
32
+ # structured metadata and the rendered body and can pick whichever
33
+ # is more useful for the task. Trades some duplication (when a
34
+ # publisher embeds the article body in JSON-LD AND in HTML) for
35
+ # fewer type-based heuristics on which branch should win — the
36
+ # earlier "is this Article's +description+ a teaser or the real
37
+ # body?" carve-out is no longer needed because both end up in
38
+ # the output regardless.
39
+ module HTML
40
+ # @return [Array<String>] content-types this extractor claims.
41
+ CONTENT_TYPES = %w[text/html application/xhtml+xml].freeze
42
+
43
+ # @return [Array<String>] schema.org +@type+ values that we treat
44
+ # as "the primary entity of this page" when picking a JSON-LD
45
+ # node to render. Order does not matter — the first matching
46
+ # node wins. Skips noise nodes (Organization, BreadcrumbList,
47
+ # WebSite, ...) that ship on most pages but carry no page
48
+ # content.
49
+ INTERESTING_TYPES = %w[
50
+ Product Article NewsArticle BlogPosting Recipe Event Book Movie
51
+ ].freeze
52
+
53
+ # @return [Array<String>] HTML tags preserved by the readability
54
+ # pass. Anything outside this list is stripped before Markdown
55
+ # conversion.
56
+ READABILITY_TAGS = %w[
57
+ h1 h2 h3 h4 h5 h6 p div span ul ol li blockquote pre code a img
58
+ strong em b i br hr table thead tbody tr td th
59
+ ].freeze
60
+
61
+ # @return [Array<String>] HTML attributes preserved by the
62
+ # readability pass; everything else (class, id, style, data-*)
63
+ # is dropped before Markdown conversion
64
+ READABILITY_ATTRS = %w[href src alt title].freeze
65
+
66
+ # @return [Float] minimum +<main>+/+<article>+ to Readability
67
+ # text-length ratio that triggers the semantic-container
68
+ # fallback in {.readability_to_markdown}. Picked low enough to
69
+ # catch the failure mode (Readability collapsing a page that
70
+ # uses divs/lists instead of +<p>+ — e.g. +vaadin.com/company+,
71
+ # ~5x) but high enough that pages where both produce
72
+ # comparable output keep Readability's noise filtering.
73
+ MAIN_FALLBACK_RATIO = 2.0
74
+
75
+ # @return [Integer] minimum text length the
76
+ # +<main>+/+<article>+ container must hold before the fallback
77
+ # in {.readability_to_markdown} can fire. Below this, the
78
+ # ratio comparison is dominated by noise and we'd swap on
79
+ # tiny pages where Readability is doing the right thing.
80
+ MAIN_FALLBACK_MIN_CHARS = 500
81
+
82
# Tag carried on {Page#kind} for content handled by this extractor.
#
# @return [Symbol] always +:html+
def self.kind = :html
86
+
87
# Claim the content purely by its declared content-type; the byte
# sample is accepted for duck-type compatibility but never inspected.
#
# @param sample [String] leading bytes of the content (ignored here).
# @param content_type [String, nil] normalized content-type.
# @return [Boolean] +true+ when +content_type+ is listed in
#   {CONTENT_TYPES}
def self.matches?(sample:, content_type:)
  CONTENT_TYPES.any? { |claimed| claimed == content_type }
end
94
+
95
# Convert the HTML document behind +io+ to Markdown. Two renders are
# produced — the JSON-LD section and the readability / +<main>+
# section — and whichever of them are non-blank are joined with a
# horizontal rule. A page without JSON-LD therefore yields only the
# readability output, and a page with no extractable body yields
# only the JSON-LD render.
#
# @param io [IO, StringIO] IO over the HTML document.
# @return [String] Markdown representation
def self.extract(io)
  html = io.read
  rendered = [jsonld_section(html), readability_to_markdown(html)]
  kept = rendered.select { |section| !section.nil? && !section.strip.empty? }
  kept.join("\n\n---\n\n")
end
110
+
111
# Render the first JSON-LD node whose +@type+ (a single value or a
# list) includes one of {INTERESTING_TYPES}. When no node qualifies
# the result is +nil+ and {.extract} falls back to emitting only the
# readability section.
#
# Nodes are not gated on having body content: one carrying only
# +name+/+author+/+datePublished+ still renders as a metadata-only
# header, because the readability pass supplies the page body
# independently. Some duplication between the two views is accepted
# in exchange for not needing type-based "teaser or article copy?"
# heuristics.
#
# @param html [String] HTML document body
# @return [String, nil] Markdown render of the picked JSON-LD node,
#   or +nil+ when nothing matched
def self.jsonld_section(html)
  picked = parse_jsonld(html).find do |candidate|
    declared = Array(candidate['@type'])
    (declared & INTERESTING_TYPES).any?
  end
  return nil unless picked

  jsonld_to_markdown(picked)
end
133
+
134
# Collect every JSON-LD payload embedded in +html+, flattening
# +@graph+ wrappers so callers see one flat array of schema.org
# nodes. Blocks that fail to parse are silently skipped — sites
# frequently ship broken JSON-LD and we only need at least one
# parseable block.
#
# Syntactically valid JSON that is not an object (a bare +true+,
# a number, +null+ or string entries inside a top-level array or a
# +@graph+ list) is dropped as well: every returned element is a
# Hash, so callers can index nodes with +node['@type']+ without
# guarding against scalars.
#
# @param html [String] HTML document body
# @return [Array<Hash>] parsed JSON-LD nodes; possibly empty
def self.parse_jsonld(html)
  doc = Nokogiri::HTML(html)
  blobs = doc.css('script[type="application/ld+json"]').map(&:text)

  blobs.flat_map do |raw|
    parsed = begin
      JSON.parse(raw)
    rescue JSON::ParserError
      nil
    end

    # A payload is either one node or a list of nodes; grep(Hash)
    # discards nil (parse failure / JSON null) and any other scalar.
    roots = parsed.is_a?(Array) ? parsed : [parsed]
    roots.grep(Hash).flat_map do |root|
      graph = root['@graph']
      graph.is_a?(Array) ? graph.grep(Hash) : [root]
    end
  end
end
158
+
159
# Render a single JSON-LD +node+ as Markdown: a top-level title
# from +name+/+headline+, a bullet list of common useful fields
# (brand, SKU, GTIN, price, availability, rating, reviews, author,
# published date), the body copy, and the lead image.
#
# When the node carries +articleBody+ (the full publisher-supplied
# article text), that wins over +description+ — the description
# is typically a lede teaser and would just repeat the article's
# opening lines.
#
# The Price bullet is emitted only when the offer actually carries
# a +price+; a +priceCurrency+ on its own is not rendered, since a
# bare currency code tells the reader nothing.
#
# @param node [Hash] JSON-LD node, typically picked by
#   {.jsonld_section}
# @return [String] Markdown representation; empty when the node has
#   none of the rendered fields
def self.jsonld_to_markdown(node)
  out = +''
  title = node['name'] || node['headline']
  out << "# #{title}\n\n" if title

  offer = first_obj(node['offers'])
  rating = first_obj(node['aggregateRating'])
  brand = first_obj_or_string(node['brand'])
  author = first_obj_or_string(node['author'])

  # Stays nil (and is filtered out below) when the amount is absent or blank.
  price = offer['price']
  price_text = [price, offer['priceCurrency']].compact.join(' ') unless price.to_s.strip.empty?

  fields = {
    'Brand' => brand.is_a?(Hash) ? brand['name'] : brand,
    'SKU' => node['sku'],
    'GTIN' => node['gtin13'] || node['gtin'],
    'Price' => price_text,
    'Availability' => offer['availability'],
    'Rating' => rating['ratingValue'],
    'Reviews' => rating['reviewCount'],
    'Author' => author.is_a?(Hash) ? author['name'] : author,
    'Published' => node['datePublished']
  }.reject { |_, value| value.nil? || value.to_s.strip.empty? }

  fields.each { |label, value| out << "- **#{label}:** #{value}\n" }
  out << "\n" unless fields.empty?

  body = node['articleBody'] || node['description']
  out << "#{body}\n\n" if body

  # +image+ may be a URL string, an ImageObject-style hash, or a list of either.
  image = node['image']
  image = image.first if image.is_a?(Array)
  image = image['url'] if image.is_a?(Hash)
  out << "![image](#{image})\n\n" if image

  out
end
214
+
215
# Isolate the page's main content with +Readability+ and convert it
# to Markdown with +reverse_markdown+, prefixed by the page title as
# a top-level heading when the title is non-blank.
#
# Pages built on semantic HTML5 (+<main>+ or +<article>+) that keep
# most of their content outside +<p>+ tags — divs, lists, spans —
# make Readability's paragraph-density scoring collapse the
# extraction to a sliver of the page. For those the
# +<main>+/+<article>+ container is rendered directly instead. That
# fallback fires only when the container holds substantially more
# text than Readability picked up (see {MAIN_FALLBACK_RATIO} /
# {MAIN_FALLBACK_MIN_CHARS}); where both agree, Readability's output
# is kept so its noise filtering still strips nav/ads/etc.
#
# @param html [String] HTML document body
# @return [String] Markdown representation
def self.readability_to_markdown(html)
  document = Readability::Document.new(
    html,
    tags: READABILITY_TAGS, attributes: READABILITY_ATTRS, remove_empty_nodes: true
  )
  extracted_html = document.content
  heading = document.title&.strip

  source_html = main_fallback_html(html, extracted_html) || extracted_html
  markdown = ReverseMarkdown.convert(source_html, unknown_tags: :bypass, github_flavored: true)

  result = +''
  result << "# #{heading}\n\n" if heading && !heading.empty?
  result << markdown
end
249
+
250
# Decide whether the page's +<main>+ (or, failing that, +<article>+)
# element should replace Readability's extraction, and return that
# element's HTML if so. The answer is +nil+ — "keep Readability" —
# when the page has no such container, when the container's
# whitespace-collapsed text is shorter than
# {MAIN_FALLBACK_MIN_CHARS}, or when it is less than
# {MAIN_FALLBACK_RATIO} times the length of Readability's text.
#
# @param html [String] full HTML document body, used to locate the
#   +<main>+/+<article>+ container
# @param readability_html [String] HTML produced by
#   +Readability::Document#content+, used as the comparison baseline
# @return [String, nil] container HTML when the fallback should
#   fire, +nil+ otherwise
def self.main_fallback_html(html, readability_html)
  page = Nokogiri::HTML(html)
  container = page.at_css('main') || page.at_css('article')
  return nil if container.nil?

  # Compare on collapsed whitespace so indentation does not skew the lengths.
  collapsed_length = ->(text) { text.gsub(/\s+/, ' ').strip.length }

  container_chars = collapsed_length.call(container.text)
  return nil unless container_chars >= MAIN_FALLBACK_MIN_CHARS

  readability_chars = collapsed_length.call(Nokogiri::HTML(readability_html).text)
  return nil unless container_chars >= MAIN_FALLBACK_RATIO * readability_chars

  container.to_html
end
private_class_method :main_fallback_html
278
+
279
# JSON-LD fields can be a string, hash, or array of either. Reduce
# +value+ to one hash so callers can index it without nil checks:
# an array is replaced by its first element, and anything that is
# not a hash at that point becomes +{}+.
#
# @param value [Object] raw JSON-LD field value
# @return [Hash] the hash itself, the first element of a list when
#   that element is a hash, otherwise an empty hash
def self.first_obj(value)
  candidate = value.is_a?(Array) ? value.first : value
  return candidate if candidate.is_a?(Hash)

  {}
end
289
+ private_class_method :first_obj
290
+
291
# Like {.first_obj}, but without coercing to a hash: a list is
# reduced to its first element and every other value — including a
# bare string such as +brand: "Apple"+ — is returned untouched.
#
# @param value [Object] raw JSON-LD field value
# @return [String, Hash, nil]
def self.first_obj_or_string(value)
  return value unless value.is_a?(Array)

  value.first
end
300
+ private_class_method :first_obj_or_string
301
+ end
302
+ end
303
+ end
@@ -0,0 +1,64 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ module Extractor
5
+ # The terminal plain-text arm of the registry: content that *is*
6
+ # already text needs no extraction, so it passes through verbatim
7
+ # (forced to UTF-8 — invalid bytes are left in for downstream to
8
+ # deal with, matching what +File.read+ with a UTF-8 encoding does).
9
+ # Markdown, source files, JSON, robots.txt all land here.
10
+ #
11
+ # Matching is split by whether the transport supplied a
12
+ # content-type:
13
+ #
14
+ # * With a content-type (the web path): claim +text/*+ only.
15
+ # A non-text type that no earlier extractor claimed is *not*
16
+ # second-guessed by sniffing — a server declaring
17
+ # +application/octet-stream+ gets the {Unsupported} refusal the
18
+ # LLM can react to, same as before this registry existed.
19
+ # * Without one (the local-file path, where {FileType.detect_mime}
20
+ # returned +nil+ for "unrecognised"): claim anything that passes
21
+ # the {FileType.binary?} heuristic on the sample. Opaque
22
+ # binaries stay unclaimed and surface as {Unsupported}.
23
+ module Passthrough
24
# Tag carried on {Page#kind} for content handled by this extractor.
#
# @return [Symbol] always +:text+
def self.kind = :text
28
+
29
# Claim the content as plain text. With a content-type the decision
# rests on the type alone (+text/*+); without one it rests on the
# {FileType.binary?} check of the byte sample.
#
# @param sample [String] leading bytes of the content.
# @param content_type [String, nil] normalized content-type,
#   +nil+ when the transport has none.
# @return [Boolean]
def self.matches?(sample:, content_type:)
  if content_type.nil?
    !FileType.binary?(sample)
  else
    content_type.start_with?('text/')
  end
end
38
+
39
# Read the whole IO and hand it back unchanged apart from the
# encoding tag. This is deliberately NOT built on {.extract_lines}:
# a passthrough must stay verbatim (trailing newline, CRLF line
# endings), which a join of chomped lines would silently normalize
# away.
#
# @param io [IO, StringIO] IO over the text content.
# @return [String] the content, tagged UTF-8.
def self.extract(io)
  content = io.read
  content.force_encoding(Encoding::UTF_8)
end
47
+
48
# Line producer for {Extractor.extract_paged}. The IO is read one
# line at a time through a lazy enumerator, so taking a window from
# the head of a gigabyte log never loads the rest. Because running
# the stream to its end is just a cheap sequential read, the paging
# window counts this stream's tail to report an exact +total_lines+
# (see {Extractor.extract_paged}).
#
# @param io [IO, StringIO] IO over the text content; must remain
#   open while the enumerator is consumed.
# @return [Enumerator::Lazy<String>] chomped lines, tagged UTF-8.
def self.extract_lines(io)
  io.each_line.lazy.map do |raw|
    line = raw.chomp
    line.force_encoding(Encoding::UTF_8)
  end
end
62
+ end
63
+ end
64
+ end
@@ -0,0 +1,314 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Pikuri
4
+ # The format→text extraction seam: one registry of extractors that
5
+ # turn an +IO+ of some recognised format (HTML and plain text out
6
+ # of the box; PDF / office formats via the pikuri-pdf /
7
+ # pikuri-extractors plug-in gems) into Markdown-flavoured UTF-8
8
+ # text, consumed through two front doors:
9
+ #
10
+ # * {.extract} — the whole document as one String. The shape the
11
+ # indexing / caching callers want ({Pikuri::VectorDb}'s indexer,
12
+ # {Tool::WebScrape}'s URL cache): no windowing, no presentation.
13
+ # * {.extract_paged} — the LLM-tool shape: the same extraction,
14
+ # windowed to a line range with a byte cap, returned as a {Page}
15
+ # the caller renders. Backs +Workspace::Read+ and
16
+ # +VectorDb::Tools::Read+ (via the {FileType} path wrappers) so
17
+ # the offset/limit/byte-cap logic lives in one tested place.
18
+ #
19
+ # Both consumers — +Tool::Scraper+ dispatching on the HTTP
20
+ # +Content-Type+ header for the web tools, and {FileType} resolving
21
+ # local paths — route through this one registry, so both share one
22
+ # set of format truths and "support a new format" is a registry
23
+ # entry (pikuri-pdf and pikuri-extractors plug PDF and office
24
+ # formats in without pikuri-core knowing), not a new special case
25
+ # in two dispatchers.
26
+ #
27
+ # == The extractor duck type
28
+ #
29
+ # Each {.registry} entry implements three methods:
30
+ #
31
+ # * +matches?(sample:, content_type:)+ → +Boolean+ — claim the
32
+ # content. +sample+ is the leading {FileType::SAMPLE_BYTES} bytes
33
+ # (for magic-byte sniffs); +content_type+ is the normalized HTTP
34
+ # +Content-Type+ for web content, the {FileType.detect_mime}
35
+ # result for local files, and may be +nil+ ("no transport
36
+ # metadata — sniff if you can").
37
+ # * +extract(io)+ → +String+ — the whole document as
38
+ # Markdown-flavoured UTF-8 text. Raises {Error} on content the
39
+ # extractor claimed but cannot parse (malformed PDF, ...).
40
+ # * +kind+ → +Symbol+ — a short tag (+:text+ / +:pdf+ / +:html+)
41
+ # carried on {Page#kind} so rendering callers can word
42
+ # format-specific trailers ("End of PDF", the scanned-image
43
+ # hint) without re-sniffing.
44
+ #
45
+ # plus one *optional* method for formats whose lines can be
46
+ # produced incrementally:
47
+ #
48
+ # * +extract_lines(io)+ → +Enumerator<String>+ — the same content
49
+ # as +extract+, as a lazy stream of already-+chomp+ed lines.
50
+ # {.extract_paged} prefers this when present and stops consuming
51
+ # the moment the window fills, so the rest of the document is
52
+ # never parsed (pikuri-pdf's extractor: pdf-reader's page list
53
+ # parses on access; {Passthrough}: the IO is read line-by-line).
54
+ # The enumerator
55
+ # must be consumed while +io+ is still open, and may raise
56
+ # {Error} mid-iteration. Extractors that need the whole document
57
+ # to produce anything ({HTML}: Readability walks the full DOM —
58
+ # true of any subprocess-based extractor too) simply omit it;
59
+ # {.extract_paged} then extracts in full and windows the result.
60
+ #
61
+ # Windowing itself (offset / limit / byte cap / line truncation) is
62
+ # presentation and deliberately lives once in {.extract_paged}, not
63
+ # per extractor — +extract_lines+ is line *production*, the only
64
+ # genuinely format-specific half of paging.
65
+ #
66
+ # == Errors
67
+ #
68
+ # Both failure modes are failures the *caller's* LLM can react to,
69
+ # so they share one rescuable root:
70
+ #
71
+ # * {Unsupported} — nothing in {.registry} claimed the content
72
+ # (opaque binary, an unhandled content-type).
73
+ # * {Error} (the root) — an extractor claimed the content but the
74
+ # parse failed (malformed PDF, ...).
75
+ #
76
+ # Callers map them to their own conventions:
77
+ # +Tool::Scraper+ re-raises both as +FetchError+;
78
+ # {FileType.read_as_text} maps {Unsupported} to the +ArgumentError+
79
+ # binary refusal and {Error} to a +RuntimeError+ carrying the path.
80
+ module Extractor
81
+ module_function
82
+
83
+ # Raised when an extractor claims content but fails to parse it
84
+ # (e.g. a malformed PDF). Message is LLM-presentable.
85
+ Error = Class.new(StandardError)
86
+
87
+ # Raised by {.extract} / {.extract_paged} when no registry entry
88
+ # claims the content. Subclass of {Error} so callers that don't
89
+ # care about the distinction rescue one class.
90
+ Unsupported = Class.new(Error)
91
+
92
+ # @return [Integer] default line-window size for {.extract_paged}
93
+ # when the caller omits +limit+.
94
+ PAGE_DEFAULT_LIMIT = 2000
95
+
96
+ # @return [Integer] default hard byte cap on the content collected
97
+ # by a single {.extract_paged} call. Bypassable by paging via
98
+ # +offset+. The rendered output is slightly larger (line
99
+ # numbering, trailer) — that's the caller's concern.
100
+ PAGE_MAX_BYTES = 50 * 1024
101
+
102
+ # @return [Integer] default per-line character cap;
103
+ # {.extract_paged} truncates longer lines and appends
104
+ # {PAGE_LINE_TRUNCATION_MARKER}.
105
+ PAGE_MAX_LINE_LENGTH = 2000
106
+
107
+ # @return [String] suffix appended to a line truncated at
108
+ # {PAGE_MAX_LINE_LENGTH}.
109
+ PAGE_LINE_TRUNCATION_MARKER = "... (line truncated to #{PAGE_MAX_LINE_LENGTH} chars)"
110
+
111
+ # One windowed slice of a document, returned by {.extract_paged}.
112
+ # The caller turns this into an observation; this struct carries
113
+ # everything a trailer needs without the caller re-reading the
114
+ # document.
115
+ #
116
+ # == Fields
117
+ #
118
+ # * +lines+ — +Array<String>+, the collected window. Already
119
+ # per-line truncated (with {PAGE_LINE_TRUNCATION_MARKER}); *not*
120
+ # line-numbered — numbering is presentation the caller adds. For
121
+ # a PDF the array includes the +"--- Page N ---"+ marker lines
122
+ # pikuri-pdf's extractor emits, which count toward +limit+ / the
123
+ # byte cap like any other line.
124
+ # * +start_line+ — the 1-indexed line number of +lines.first+
125
+ # (i.e. the +offset+ the caller asked for). +lines.last+ is at
126
+ # +start_line + lines.length - 1+.
127
+ # * +total_lines+ — total line count of the document when known,
128
+ # else +nil+. Known when the read reached EOF, when the format
129
+ # was extracted in full (no +extract_lines+ — e.g. HTML), or
130
+ # when the lazy stream is cheap enough to count to the end
131
+ # (plain text). +nil+ when a lazy stream stopped early — the
132
+ # byte cap fired, or a PDF filled the window before its last
133
+ # page (counting the rest would mean parsing every page,
134
+ # defeating the laziness).
135
+ # * +more+ — +true+ if content remains past this window (the
136
+ # caller should offer +offset = start_line + lines.length+).
137
+ # * +byte_capped+ — +true+ if the byte cap (not the line limit)
138
+ # was the stopping criterion.
139
+ # * +kind+ — the matched extractor's +kind+ tag (+:text+ /
140
+ # +:pdf+ / +:html+); lets the caller word format-specific
141
+ # trailers and the empty-document message.
142
+ #
143
+ # An empty document yields +lines: []+, +total_lines: 0+; an
144
+ # +offset+ past EOF yields +lines: []+ with +total_lines+ set to
145
+ # the real (non-zero) count — the caller distinguishes the two.
146
+ Page = Data.define(:lines, :start_line, :total_lines, :more, :byte_capped, :kind)
147
+
148
# The ordered list of extractors; lookup takes the first entry whose
# +matches?+ accepts the content. Core seeds it with {HTML}, which
# matches on content-type, followed by {Passthrough}, the terminal
# plain-text arm.
#
# A gem adding a format chooses its position by how strong its claim
# is. A magic-byte sniff that never misfires on text belongs at the
# *front*, so it beats {HTML}'s content-type match even under a
# lying header (+registry.unshift(X)+ — pikuri-pdf does this). A
# content-type or weaker-sniff claimer goes just before the terminal
# entry (+registry.insert(-2, X)+ — pikuri-extractors does this).
#
# @return [Array<#matches?>] mutable, deliberately — this is the
#   plug-in seam; the same Array is returned on every call.
def registry
  @registry = [HTML, Passthrough] unless @registry
  @registry
end
164
+
165
# Return the entire document behind +io+ as a single
# Markdown-flavoured UTF-8 String, produced by whichever registry
# entry claims the content. The result may be empty (empty text
# file, scanned-image PDF with no extractable text).
#
# @param io [IO, StringIO] seekable IO positioned at the start of
#   the content; a leading sample is read for the +matches?+ sniff
#   and the IO is rewound before extracting.
# @param content_type [String, nil] normalized content-type when
#   the transport supplies one (HTTP header, {FileType.detect_mime}
#   result); +nil+ when unknown — extractors then rely on their
#   byte sniffs.
# @return [String]
# @raise [Unsupported] when no registry entry claims the content.
# @raise [Error] when the matched extractor cannot parse it.
def extract(io, content_type: nil)
  extractor = extractor_for(io, content_type)
  extractor.extract(io)
end
182
+
183
# Extract +io+ and return one windowed {Page}: lines starting at
# +offset+ (1-indexed), at most +limit+ of them, cut short if
# +max_bytes+ is reached, with over-long lines truncated at
# +max_line_length+.
#
# Extraction is lazy where the format allows. An extractor that
# implements +extract_lines+ (plain text; pikuri-pdf's PDF) is
# consumed only until the window fills — the first window of a
# 500-page PDF parses a handful of pages, and the first page of a
# gigabyte log never loads the whole file. An extractor without it
# (HTML) is extracted in full and then windowed, which is also why
# its +total_lines+ is always exact.
#
# @param io [IO, StringIO] seekable IO positioned at the start.
# @param content_type [String, nil] as for {.extract}.
# @param offset [Integer] 1-indexed first line to include. The
#   caller is responsible for validating +offset >= 1+.
# @param limit [Integer] maximum lines to collect. Caller
#   validates +limit >= 1+.
# @param max_bytes [Integer] hard byte cap on collected content.
# @param max_line_length [Integer] per-line truncation threshold.
# @return [Page] the windowed slice.
# @raise [Unsupported] when no registry entry claims the content.
# @raise [Error] when the matched extractor cannot parse it.
def extract_paged(io, content_type: nil, offset: 1, limit: PAGE_DEFAULT_LIMIT,
                  max_bytes: PAGE_MAX_BYTES, max_line_length: PAGE_MAX_LINE_LENGTH)
  extractor = extractor_for(io, content_type)
  bounds = { offset: offset, limit: limit, max_bytes: max_bytes,
             max_line_length: max_line_length }

  unless extractor.respond_to?(:extract_lines)
    all_lines = extractor.extract(io).split("\n")
    return window(all_lines, **bounds, kind: extractor.kind, known_total: all_lines.length)
  end

  # count_tail is a per-format economics call. Once the window is
  # full, counting the remainder of a plain-text stream is a cheap
  # sequential read (so the trailer can say "of N"); for a PDF it
  # would mean parsing every remaining page, which is exactly what
  # extract_lines exists to avoid. Only Passthrough opts in; every
  # plugged-in extractor (pikuri-pdf's included) gets the
  # conservative default: stop early, total unknown.
  window(extractor.extract_lines(io), **bounds, kind: extractor.kind,
         known_total: nil, count_tail: extractor.equal?(Passthrough))
end
229
+
230
# Resolve which registry entry handles +io+. The leading
# {FileType::SAMPLE_BYTES} bytes are read as the sniff sample (an
# empty String when the IO is already at EOF), the IO is rewound,
# and each extractor is asked in registry order; the first to
# accept wins.
#
# @param io [IO, StringIO] seekable IO positioned at the start.
# @param content_type [String, nil]
# @return [#extract] the matched extractor.
# @raise [Unsupported] when nothing matches; the message names the
#   content-type when a non-empty one was given.
def extractor_for(io, content_type)
  sample = io.read(FileType::SAMPLE_BYTES) || +''
  io.rewind

  registry.each do |candidate|
    return candidate if candidate.matches?(sample: sample, content_type: content_type)
  end

  detail = content_type && !content_type.empty? ? " (content-type #{content_type.inspect})" : ''
  raise Unsupported, "no extractor for this content#{detail}"
end
245
+ private_class_method :extractor_for
246
+
247
# Build a {Page} from +lines+, which is either an Array or a lazy
# Enumerator of already-+chomp+ed lines. Lines before +offset+ are
# counted but skipped; collection then runs until +limit+ lines are
# held or adding the next (truncated) line plus its joining newline
# would push the collected size past +max_bytes+.
#
# +known_total+ is the full line count when the caller extracted
# everything up front (Array case) and is reported as-is. For a lazy
# stream it is +nil+, and +total_lines+ is exact only if iteration
# reached EOF. +count_tail+ makes the loop keep counting (without
# collecting) past the line limit when consuming the rest of the
# stream is cheap; without it the loop breaks there and the total
# stays unknown. Hitting the byte cap always aborts the count.
#
# @param lines [Enumerable<String>]
# @param known_total [Integer, nil]
# @param count_tail [Boolean]
# @return [Page]
def window(lines, offset:, limit:, max_bytes:, max_line_length:, kind:,
           known_total:, count_tail: false)
  skipped = offset - 1
  picked = []
  line_no = 0
  used_bytes = 0
  hit_line_limit = false
  hit_byte_cap = false
  cut_short = false

  lines.each do |raw|
    line_no += 1
    next unless line_no > skipped

    if picked.length >= limit
      hit_line_limit = true
      next if count_tail # keep counting so total_lines stays exact

      cut_short = true
      break
    end

    candidate = truncate_line(raw, max_line_length)
    cost = candidate.bytesize + 1 # +1 for the joining newline
    if used_bytes + cost > max_bytes
      hit_byte_cap = true
      cut_short = true
      break
    end

    picked << candidate
    used_bytes += cost
  end

  # Without an up-front total, the running count is only trustworthy
  # when the loop ran to the end of the stream.
  total = known_total || (cut_short ? nil : line_no)
  Page.new(lines: picked, start_line: offset, total_lines: total,
           more: hit_line_limit || hit_byte_cap, byte_capped: hit_byte_cap, kind: kind)
end
299
+ private_class_method :window
300
+
301
# Cap +line+ at +max_line_length+ characters. A line within the cap
# is returned as-is (the same object); a longer one is cut to the
# cap and suffixed with {PAGE_LINE_TRUNCATION_MARKER}.
#
# @param line [String]
# @param max_line_length [Integer]
# @return [String]
def truncate_line(line, max_line_length)
  if line.length > max_line_length
    line[0, max_line_length] + PAGE_LINE_TRUNCATION_MARKER
  else
    line
  end
end
312
+ private_class_method :truncate_line
313
+ end
314
+ end