viruagent-cli 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/providers/tistory/auth.js +167 -0
- package/src/providers/tistory/browserHelpers.js +91 -0
- package/src/providers/tistory/chromeImport.js +745 -0
- package/src/providers/tistory/fetchLayer.js +237 -0
- package/src/providers/tistory/imageEnrichment.js +574 -0
- package/src/providers/tistory/imageNormalization.js +301 -0
- package/src/providers/tistory/imageSources.js +270 -0
- package/src/providers/tistory/index.js +561 -0
- package/src/providers/tistory/selectors.js +51 -0
- package/src/providers/tistory/session.js +117 -0
- package/src/providers/tistory/utils.js +235 -0
- package/src/services/providerManager.js +1 -1
- package/src/providers/tistoryProvider.js +0 -3141
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
const { sleep, imageTrace } = require('./utils');
|
|
2
|
+
|
|
3
|
+
/**
 * Downloads a URL with GET and returns the response body as text.
 *
 * Each attempt sends a browser-like User-Agent and an HTML Accept header, follows
 * redirects, and is aborted after 20 seconds. Any failure inside an attempt
 * (network error, abort, non-2xx status, or a failure while reading the body)
 * triggers exactly one retry after a 700 ms pause.
 *
 * @param {string} url - Address to download; a falsy value is rejected before any request.
 * @param {number} [retryCount=0] - Attempt counter used by the internal retry; callers normally omit it.
 * @returns {Promise<string>} The response body decoded as text.
 * @throws {Error} When `url` is falsy, or when the retry also fails (the underlying
 *   error is attached as `cause`).
 */
const fetchText = async (url, retryCount = 0) => {
  if (!url) {
    throw new Error('텍스트 URL이 없습니다.');
  }

  const headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Accept': 'text/html,application/xhtml+xml',
  };
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 20000);

  try {
    imageTrace('fetchText', { url, retryCount });
    const response = await fetch(url, {
      method: 'GET',
      redirect: 'follow',
      headers,
      signal: controller.signal,
    });

    if (!response.ok) {
      throw new Error(`텍스트 요청 실패: ${response.status} ${response.statusText}, url=${url}`);
    }

    // Await the body inside the try block: returning the bare promise would let
    // `finally` cancel the abort timer before the body has been downloaded, and a
    // body-read failure would bypass both the retry and the error wrapping below.
    return await response.text();
  } catch (error) {
    if (retryCount < 1) {
      // This attempt is over; stop its timer so it does not linger during the retry.
      clearTimeout(timeout);
      await sleep(700);
      return fetchText(url, retryCount + 1);
    }
    throw new Error(`웹 텍스트 다운로드 실패: ${error.message}`, { cause: error });
  } finally {
    clearTimeout(timeout);
  }
};
|
|
39
|
+
|
|
40
|
+
/**
 * Downloads a URL with GET using caller-supplied headers and returns the body as text.
 *
 * A browser-like User-Agent is sent by default; any key in `headers` overrides it.
 * Each attempt follows redirects and is aborted after 20 seconds. Any failure inside
 * an attempt (network error, abort, non-2xx status, or a failure while reading the
 * body) triggers exactly one retry after a 700 ms pause.
 *
 * @param {string} url - Address to download.
 * @param {Object<string, string>} [headers={}] - Extra request headers merged over the default.
 * @param {number} [retryCount=0] - Attempt counter used by the internal retry; callers normally omit it.
 * @returns {Promise<string>} The response body decoded as text.
 * @throws {Error} When the retry also fails (the underlying error is attached as `cause`).
 */
const fetchTextWithHeaders = async (url, headers = {}, retryCount = 0) => {
  const merged = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    ...headers,
  };
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), 20000);

  try {
    const response = await fetch(url, {
      method: 'GET',
      redirect: 'follow',
      headers: merged,
      signal: controller.signal,
    });
    if (!response.ok) {
      throw new Error(`텍스트 요청 실패: ${response.status} ${response.statusText}, url=${url}`);
    }

    // Await the body inside the try block: returning the bare promise would let
    // `finally` cancel the abort timer before the body has been downloaded, and a
    // body-read failure would bypass both the retry and the error wrapping below.
    return await response.text();
  } catch (error) {
    if (retryCount < 1) {
      // This attempt is over; stop its timer so it does not linger during the retry.
      clearTimeout(timeout);
      await sleep(700);
      return fetchTextWithHeaders(url, headers, retryCount + 1);
    }
    throw new Error(`웹 텍스트 다운로드 실패: ${error.message}`, { cause: error });
  } finally {
    clearTimeout(timeout);
  }
};
|
|
68
|
+
|
|
69
|
+
/**
 * Resolves a possibly relative URL into an absolute http(s) URL string.
 *
 * @param {string} [value=''] - Raw URL text; falsy values are treated as empty.
 * @param {string} [base=''] - Base URL used to resolve relative values; when falsy,
 *   `value` must already be absolute.
 * @returns {string|null} The serialized absolute URL, or `null` when the input is
 *   blank, cannot be parsed, or uses a scheme other than http/https.
 */
const normalizeAbsoluteUrl = (value = '', base = '') => {
  const candidate = String(value || '').trim();
  if (candidate.length === 0) {
    return null;
  }

  let resolved;
  try {
    resolved = base ? new URL(candidate, base) : new URL(candidate);
  } catch {
    // Unparseable input (or a relative value without a usable base).
    return null;
  }

  const isWebScheme = resolved.protocol === 'http:' || resolved.protocol === 'https:';
  return isWebScheme ? resolved.toString() : null;
};
|
|
82
|
+
|
|
83
|
+
/**
 * Collects the absolute http(s) link targets of `<a>` tags found in an HTML string.
 *
 * Only quoted `href` values are recognized. Surrounding whitespace inside the quotes
 * is trimmed BEFORE the scheme check, so `href=" https://…"` is kept.
 *
 * @param {string} [content=''] - HTML (or HTML-containing text) to scan.
 * @returns {string[]} Unique URLs in first-seen order; relative and non-http(s) links are skipped.
 */
const extractArticleUrlsFromContent = (content = '') => {
  const anchorHrefPattern = /<a\s+[^>]*href=(['"])(.*?)\1/gi;
  const urls = [];

  for (const match of String(content).matchAll(anchorHrefPattern)) {
    const href = match[2].trim();
    if (/^https?:\/\//i.test(href)) {
      urls.push(href);
    }
  }

  return Array.from(new Set(urls));
};
|
|
92
|
+
|
|
93
|
+
/**
 * Extracts the destination URL from a DuckDuckGo redirect link.
 *
 * Two link shapes are recognized:
 *   - `<duckduckgo.com or a subdomain>/l/?uddg=<target>`
 *   - `duckduckgo.com/y.js?u3=<target>` (falling back to the `url` parameter)
 *
 * `URLSearchParams.get()` already percent-decodes the value, so it is returned as-is;
 * decoding it a second time would corrupt targets that legitimately contain escapes
 * such as `%25` or `%26`.
 *
 * @param {string} [value=''] - Candidate redirect URL.
 * @returns {string|null} The decoded target, or `null` when the input is blank,
 *   unparseable, or not a recognized redirect link.
 */
const extractDuckDuckGoRedirectTarget = (value = '') => {
  const urlText = String(value || '').trim();
  if (!urlText) return null;

  let parsed;
  try {
    parsed = new URL(urlText);
  } catch {
    return null;
  }

  const { hostname, pathname, searchParams } = parsed;

  // Match the real domain or its subdomains only; a substring test would also accept
  // look-alike hosts such as `notduckduckgo.com.evil.test`.
  const isDuckDuckGoHost = hostname === 'duckduckgo.com' || hostname.endsWith('.duckduckgo.com');

  if (isDuckDuckGoHost && pathname === '/l/') {
    const target = searchParams.get('uddg');
    if (target) {
      return target;
    }
  }

  if (hostname === 'duckduckgo.com' && pathname === '/y.js') {
    const articleLike = searchParams.get('u3') || searchParams.get('url');
    if (articleLike) {
      return articleLike;
    }
  }

  return null;
};
|
|
126
|
+
|
|
127
|
+
/**
 * Picks a representative image URL out of an HTML document.
 *
 * Sources are tried in this order, and the first acceptable URL wins:
 *   1. `<meta property="og:image">`
 *   2. `<meta name="twitter:image">`
 *   3. `<meta name="og:image">`
 *   4. `<meta itemprop="image">`
 *   5. `<link rel="image_src">`
 *   6. the first `<img src>` in the document
 *
 * Meta/link tags are matched by parsing their quoted attributes, so the attribute
 * order inside the tag does not matter. `&amp;` (and its numeric forms) in attribute
 * values is decoded before the URL is resolved. URLs that look like favicons/logos
 * (and, for `<img>`, avatars, tracking pixels, and spacers) are rejected.
 *
 * @param {string} [html=''] - HTML text to scan.
 * @param {string} [base=''] - Base URL used to resolve relative image paths.
 * @returns {string|null} Absolute http(s) image URL, or `null` when nothing suitable is found.
 */
const extractImageFromHtml = (html = '', base = '') => {
  const normalizedHtml = String(html || '');

  // Attribute values are HTML-escaped; an undecoded `&amp;` would end up inside the URL.
  const decodeAmpersands = (text) => text.replace(/&(?:amp|#38|#x26);/gi, '&');

  // Reads the quoted attributes of one tag into a lower-cased name -> value map.
  const readAttributes = (tagText) => {
    const attributes = new Map();
    const attributePattern = /([^\s"'=<>/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
    for (const match of tagText.matchAll(attributePattern)) {
      const name = match[1].toLowerCase();
      if (!attributes.has(name)) {
        attributes.set(name, match[2] ?? match[3] ?? '');
      }
    }
    return attributes;
  };

  const headTags = Array.from(normalizedHtml.matchAll(/<(meta|link)\b[^>]*>/gi), (match) => ({
    tag: match[1].toLowerCase(),
    attributes: readAttributes(match[0]),
  }));

  // Priority-ordered description of where a page may declare its image.
  const metaCandidates = [
    { tag: 'meta', key: 'property', expected: 'og:image', source: 'content' },
    { tag: 'meta', key: 'name', expected: 'twitter:image', source: 'content' },
    { tag: 'meta', key: 'name', expected: 'og:image', source: 'content' },
    { tag: 'meta', key: 'itemprop', expected: 'image', source: 'content' },
    { tag: 'link', key: 'rel', expected: 'image_src', source: 'href' },
  ];

  for (const { tag, key, expected, source } of metaCandidates) {
    for (const entry of headTags) {
      if (entry.tag !== tag) continue;
      if ((entry.attributes.get(key) ?? '').trim().toLowerCase() !== expected) continue;

      const rawValue = entry.attributes.get(source);
      if (!rawValue) continue;

      const url = normalizeAbsoluteUrl(decodeAmpersands(rawValue), base);
      if (url && !/favicon/i.test(url) && !/logo/i.test(url)) {
        return url;
      }
    }
  }

  // Last resort: the first <img> in the document.
  const imageMatch = normalizedHtml.match(/<img[^>]+src=["']([^"']+)["'][^>]*>/i);
  if (imageMatch?.[1]) {
    const src = normalizeAbsoluteUrl(decodeAmpersands(imageMatch[1]), base);
    if (src && !/logo|favicon|avatar|pixel|spacer/i.test(src)) {
      return src;
    }
  }
  return null;
};
|
|
156
|
+
|
|
157
|
+
/**
 * Finds a representative image for an article page.
 *
 * The article is downloaded directly first; if that fails or yields no image, the
 * page is requested again through the `r.jina.ai` proxy and scanned the same way.
 * All failures are swallowed on purpose: this is a best-effort lookup.
 *
 * @param {string} articleUrl - Address of the article page.
 * @returns {Promise<string|null>} Image URL, or `null` when the URL is missing/blank
 *   or no image could be resolved.
 */
const resolveArticleImageByUrl = async (articleUrl) => {
  // Validate once, up front. Without this, `undefined`/`null` would be stringified to
  // "undefined"/"null" and sent to the proxy, and a blank URL would burn two failed
  // requests before being rejected.
  const normalizedArticleUrl = String(articleUrl ?? '').trim();
  if (!normalizedArticleUrl) return null;

  try {
    const html = await fetchText(normalizedArticleUrl);
    const imageUrl = extractImageFromHtml(html, normalizedArticleUrl);
    if (imageUrl) {
      return imageUrl;
    }
  } catch {
    // Direct download failed; fall through to the proxy attempt below.
  }

  try {
    // The proxy URL is always built with an explicit `http://` prefix, so any scheme
    // already present on the article URL is stripped first.
    const normalizedForJina = normalizedArticleUrl.replace(/^https?:\/\//, '');
    const jinaUrl = `https://r.jina.ai/http://${normalizedForJina}`;
    const jinaHtml = await fetchText(jinaUrl);
    return extractImageFromHtml(jinaHtml, normalizedArticleUrl);
  } catch {
    return null;
  }
};
|
|
183
|
+
|
|
184
|
+
/**
 * Extracts result URLs from search output text (e.g. markdown).
 *
 * DuckDuckGo redirect links (`duckduckgo.com/l/?uddg=…`) are preferred and are
 * unwrapped to their real targets. Only when none are found does the function fall
 * back to collecting every plain http(s) URL in the text.
 *
 * @param {string} [markdown=''] - Text to scan.
 * @returns {string[]} Unique URLs in first-seen order.
 */
const extractSearchUrlsFromText = (markdown = '') => {
  const text = String(markdown ?? '');
  const matched = [];

  const redirectPattern = /https?:\/\/duckduckgo\.com\/l\/\?uddg=([^)\s"']+)(?:&[^)\s"']*)?/g;
  for (const match of text.matchAll(redirectPattern)) {
    const decoded = extractDuckDuckGoRedirectTarget(`https://duckduckgo.com/l/?uddg=${match[1]}`);
    if (decoded && /^https?:\/\/.+/i.test(decoded)) {
      matched.push(decoded);
    }
  }

  if (matched.length === 0) {
    // A URL ends at whitespace, a backslash, or a markdown bracket/paren. The class
    // must use `\s` (whitespace): `\\s` would mean "backslash or the letter s", which
    // truncates URLs at the first "s" and lets them run through spaces.
    const directLinks = text.match(/https?:\/\/[^\s\\)\][]+/g) || [];
    for (const link of directLinks) {
      // Drop fragments too short to be a meaningful address.
      if (link.length > 12) {
        matched.push(link);
      }
    }
  }

  return Array.from(new Set(matched));
};
|
|
207
|
+
|
|
208
|
+
/**
 * Extracts the `vqd` token from a DuckDuckGo HTML/JS response.
 *
 * The token is looked for in several spellings, in this order: `vqd='…'`,
 * `vqd="…"`, a JSON-style `"vqd": "…"` pair, and finally a bare `vqd=…` value as it
 * appears inside a query string.
 *
 * @param {string} [html=''] - Response text to scan.
 * @returns {string|null} The trimmed token, or `null` when none is present.
 */
const extractDuckDuckGoVqd = (html = '') => {
  const raw = String(html || '');
  const patterns = [
    /vqd='([^']+)'/i,
    /vqd="([^"]+)"/i,
    /["']vqd["']\s*:\s*["']([^"']+)["']/i,
    // Bare value: ends at `&`, a quote, a backslash, whitespace, or `>`. The class
    // must use `\s` (whitespace): `\\s` would mean "backslash or the letter s",
    // truncating tokens at "s" and letting them run through spaces.
    /vqd=([^&"'\\\s>]+)/i,
  ];

  for (const pattern of patterns) {
    const token = raw.match(pattern)?.[1]?.trim();
    if (token) {
      return token;
    }
  }

  return null;
};
|
|
226
|
+
|
|
227
|
+
module.exports = {
|
|
228
|
+
fetchText,
|
|
229
|
+
fetchTextWithHeaders,
|
|
230
|
+
normalizeAbsoluteUrl,
|
|
231
|
+
extractArticleUrlsFromContent,
|
|
232
|
+
extractDuckDuckGoRedirectTarget,
|
|
233
|
+
extractImageFromHtml,
|
|
234
|
+
resolveArticleImageByUrl,
|
|
235
|
+
extractSearchUrlsFromText,
|
|
236
|
+
extractDuckDuckGoVqd,
|
|
237
|
+
};
|