rubycrawl 0.1.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,128 @@
# frozen_string_literal: true

# String#to_json (used below to inject NOISE_SELECTORS into EXTRACT_CONTENT_JS
# at constant-definition time) is provided by the json stdlib. Required here so
# this file does not depend on load order elsewhere in the gem. NOTE(review):
# presumably the gem's entry file also requires it — this is idempotent either way.
require 'json'

class RubyCrawl
  class Browser
    # JavaScript extraction constants, evaluated inside Chromium via page.evaluate().
    # All constants are IIFEs — Ferrum's page.evaluate() evaluates an expression,
    # it does NOT call function definitions. Wrapping as (() => { ... })() ensures
    # the function is immediately invoked and its return value is captured.
    module Extraction
      # Collects page metadata: <title>, common <meta name=…>/<meta property=…>
      # values (description, author, keywords, Open Graph, Twitter card), the
      # canonical <link rel>, document language and charset. Every field is
      # null when absent. (Heredoc has no interpolation, so the magic comment
      # above freezes it — no explicit .freeze needed.)
      EXTRACT_METADATA_JS = <<~JS
        (() => {
          const getMeta = (name) => {
            const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
            return meta?.getAttribute("content") || null;
          };
          const getLink = (rel) => {
            const link = document.querySelector(`link[rel="${rel}"]`);
            return link?.getAttribute("href") || null;
          };
          return {
            title: document.title || null,
            description: getMeta("description") || getMeta("og:description") || null,
            keywords: getMeta("keywords"),
            author: getMeta("author"),
            og_title: getMeta("og:title"),
            og_description: getMeta("og:description"),
            og_image: getMeta("og:image"),
            og_url: getMeta("og:url"),
            og_type: getMeta("og:type"),
            twitter_card: getMeta("twitter:card"),
            twitter_title: getMeta("twitter:title"),
            twitter_description: getMeta("twitter:description"),
            twitter_image: getMeta("twitter:image"),
            canonical: getLink("canonical"),
            lang: document.documentElement.lang || null,
            charset: document.characterSet || null,
          };
        })()
      JS

      # All anchors with an href, as { url, text, title, rel } objects.
      # `link.href` yields the absolute resolved URL; text is trimmed textContent.
      EXTRACT_LINKS_JS = <<~JS
        (() => Array.from(document.querySelectorAll("a[href]")).map(link => ({
          url: link.href,
          text: (link.textContent || "").trim(),
          title: link.getAttribute("title") || null,
          rel: link.getAttribute("rel") || null,
        })))()
      JS

      # Visible body text (innerText honors CSS visibility), trimmed; "" when
      # the document has no body.
      EXTRACT_RAW_TEXT_JS = <<~JS
        (() => (document.body?.innerText || "").trim())()
      JS

      # Semantic noise selectors — used by the heuristic fallback.
      # A single comma-joined CSS selector string; frozen explicitly because it
      # is built at runtime (join), not a literal.
      NOISE_SELECTORS = [
        'nav', 'header', 'footer', 'aside',
        '[role="navigation"]', '[role="banner"]', '[role="contentinfo"]',
        '[role="complementary"]', '[role="dialog"]', '[role="tooltip"]',
        '[role="alert"]', '[aria-hidden="true"]',
        'script', 'style', 'noscript', 'iframe'
      ].join(', ').freeze

      # Mozilla Readability.js v0.6.0 — vendored source, read once at load time.
      # Embedded inside EXTRACT_CONTENT_JS's outer IIFE so Readability is defined
      # and used within the same Runtime.evaluate expression (Ferrum evaluates a
      # single expression — separate evaluate calls have separate scopes).
      READABILITY_JS = File.read(File.join(__dir__, 'readability.js')).freeze

      # Extracts clean article HTML using Mozilla Readability (primary) with a
      # link-density heuristic as fallback when Readability returns no content.
      # Everything is wrapped in one outer IIFE so page.evaluate gets a single
      # expression and Readability is in scope for the extraction logic.
      # DOM mutations from the fallback path are reversed after extraction.
      # Returns { cleanHtml, extractor: "readability"|"heuristic", debug? }.
      # (This heredoc interpolates, so it must be frozen explicitly.)
      EXTRACT_CONTENT_JS = <<~JS.freeze
        (() => {
          // Mozilla Readability.js v0.6.0 — defined in this IIFE's scope.
          #{READABILITY_JS}

          // Primary: Mozilla Readability — article-quality extraction.
          let readabilityDebug = null;
          try {
            // Readability mutates its input, so parse a clone of the document.
            const docClone = document.cloneNode(true);
            const reader = new Readability(docClone, { charThreshold: 100 });
            const article = reader.parse();
            if (article && article.textContent && article.textContent.trim().length > 200) {
              return { cleanHtml: article.content, extractor: "readability" };
            }
            readabilityDebug = article ? `returned ${article.textContent?.trim().length ?? 0} text chars (below threshold)` : "returned null (no article detected)";
          } catch (e) {
            readabilityDebug = `error: ${e.message}`;
          }

          // Fallback: link-density heuristic (works on nav-heavy / non-article pages).
          const noiseSelectors = #{NOISE_SELECTORS.to_json};
          // Fraction of an element's visible text that sits inside <a> tags
          // (1 for empty elements, so they count as fully "linked").
          function linkDensity(el) {
            const total = (el.innerText || "").trim().length;
            if (!total) return 1;
            const linked = Array.from(el.querySelectorAll("a"))
              .reduce((sum, a) => sum + (a.innerText || "").trim().length, 0);
            return linked / total;
          }
          // Detach an element, remembering parent/next-sibling so the removal
          // can be undone after extraction.
          const removed = [];
          function stash(el) {
            if (el.parentNode) {
              removed.push({ el, parent: el.parentNode, next: el.nextSibling });
              el.parentNode.removeChild(el);
            }
          }
          document.body.querySelectorAll(noiseSelectors).forEach(stash);
          // If the body has a single real wrapper element, scan one level deeper.
          const blockTags = new Set(["script", "style", "noscript", "link", "meta"]);
          const topChildren = Array.from(document.body.children)
            .filter(el => !blockTags.has(el.tagName.toLowerCase()));
          const roots = topChildren.length === 1
            ? [document.body, topChildren[0]] : [document.body];
          for (const root of roots) {
            for (const el of Array.from(root.children)) {
              const text = (el.innerText || "").trim();
              if (text.length >= 20 && linkDensity(el) > 0.5) stash(el);
            }
          }
          const cleanHtml = document.body.innerHTML;
          // Undo in reverse removal order so each node's `next` sibling is
          // already back in place when it is reinserted.
          removed.reverse().forEach(({ el, parent, next }) => parent.insertBefore(el, next));
          return { cleanHtml, extractor: "heuristic", debug: readabilityDebug };
        })()
      JS
    end
  end
end