@clazic/urban 0.2.8 → 0.2.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/postinstall.js +11 -5
- package/src/agent/skills/_dbpia-access-filter.js +109 -0
- package/src/agent/skills/_normalize.js +228 -0
- package/src/agent/skills/_registry.js +79 -0
- package/src/agent/skills/base.js +90 -0
- package/src/agent/skills/dbpia.js +678 -0
- package/src/agent/skills/nanet.js +485 -0
- package/src/agent/skills/prism.js +150 -0
|
@@ -0,0 +1,485 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* 국회도서관 (NANET) 수집 플러그인
|
|
3
|
+
*
|
|
4
|
+
* prism-dl/src/nanet-api.js 를 ESM + Skill 클래스로 포팅
|
|
5
|
+
* - 쿠키 기반 ssotoken 세션 관리
|
|
6
|
+
* - HTML 검색 결과 파싱
|
|
7
|
+
* - rate limiting (요청 간 1초)
|
|
8
|
+
*/
|
|
9
|
+
import { Agent, fetch, request } from 'undici';
|
|
10
|
+
import { Skill } from './base.js';
|
|
11
|
+
import { normalizeNanetItem } from './_normalize.js';
|
|
12
|
+
import { getSecureConfig } from '../../kb/db.js';
|
|
13
|
+
|
|
14
|
+
// Hosts: login lives on the main portal, search and file endpoints on the "dl" host.
const BASE_URL = 'https://www.nanet.go.kr';
const DL_BASE_URL = 'https://dl.nanet.go.kr';

// Every endpoint this module talks to.
const URLS = {
  loginForm: `${BASE_URL}/loginForm.do`,
  loginProc: `${BASE_URL}/newLoginProc.do`,
  pdfInfoList: `${DL_BASE_URL}/search/getPDFInfoList.do`,
  searchInnerList: `${DL_BASE_URL}/search/searchInnerList.do`,
  fileDownload: `${DL_BASE_URL}/file/fileDownload.do`,
};

// Browser-like headers sent with every request.
const HEADERS = {
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
};

// Variant for the JSON (XHR-style) endpoint; overrides Accept from HEADERS.
const JSON_HEADERS = {
  ...HEADERS,
  'Content-Type': 'application/json',
  Accept: 'application/json, text/javascript, */*; q=0.01',
  'X-Requested-With': 'XMLHttpRequest',
};

const SESSION_TTL_MS = 25 * 60_000; // 25 minutes
const REQUEST_DELAY_MS = 1_000; // minimum gap between throttled requests
const DEFAULT_TIMEOUT_MS = 15_000; // abort timeout for ordinary requests
const PAGE_SIZE = 50; // results requested per search page
|
|
39
|
+
|
|
40
|
+
// ── 인증 정보 로드 ────────────────────────────────────────────────────
|
|
41
|
+
|
|
42
|
+
/**
 * Resolve the NANET login credentials.
 *
 * Lookup order:
 *   1. secure config keys `source.nanet.userId` / `source.nanet.password`
 *   2. environment variables `NANET_USER_ID` / `NANET_USER_PW`
 *
 * @returns {{ userId: string, password: string }}
 * @throws {Error} when neither source provides both values
 */
function loadCredentials() {
  try {
    const stored = {
      userId: getSecureConfig('source.nanet.userId'),
      password: getSecureConfig('source.nanet.password'),
    };
    if (stored.userId && stored.password) return stored;
  } catch {
    // Any failure of the config lookup (e.g. the DB is not initialised yet) is
    // deliberately ignored so the environment fallback below still gets a chance.
  }

  const { NANET_USER_ID: userId, NANET_USER_PW: password } = process.env;
  if (!userId || !password) {
    throw new Error('NANET 인증정보가 없습니다. 설정 화면에서 아이디/비밀번호를 입력하거나 ~/.urban/.env 를 확인하세요.');
  }
  return { userId, password };
}
|
|
57
|
+
|
|
58
|
+
// ── 세션 클래스 ──────────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
/**
 * Login/cookie state for talking to NANET.
 *
 * Holds the `ssotoken` cookie obtained from the login endpoint, the extra cookies
 * handed out by the digital-library host, the undici agent used as dispatcher and
 * the timestamp used for request throttling.
 */
class NanetSession {
  constructor() {
    // NOTE(review): TLS certificate verification is disabled on this agent, and the
    // login request sends the user's credentials through it. Confirm this is still needed.
    this._agent = new Agent({
      connect: { rejectUnauthorized: false, connectTimeout: 15_000 },
      connections: 5,
      pipelining: 1,
      keepAliveTimeout: 30_000,
      keepAliveMaxTimeout: 60_000,
    });
    this._ssotoken = null;
    this._loginTime = null;
    this._lastRequestTime = 0;
    this._dlSessionReady = false;
    this._dlCookies = '';
    this._credentials = null;
  }

  /** True when no login has happened yet or the last login is older than SESSION_TTL_MS. */
  get isExpired() {
    return !this._loginTime || Date.now() - this._loginTime > SESSION_TTL_MS;
  }

  /** Sleep as needed so that throttled calls are at least REQUEST_DELAY_MS apart. */
  async _throttle() {
    const elapsed = Date.now() - this._lastRequestTime;
    if (elapsed < REQUEST_DELAY_MS) {
      await new Promise(resolve => setTimeout(resolve, REQUEST_DELAY_MS - elapsed));
    }
    this._lastRequestTime = Date.now();
  }

  /**
   * Log in unless a non-expired token is already held.
   *
   * POSTs the form-encoded credentials to the login endpoint without following
   * redirects and takes the `ssotoken` value from the Set-Cookie headers.
   *
   * @throws {Error} when credentials are missing or the response carries no ssotoken
   */
  async login() {
    if (this._ssotoken && !this.isExpired) return;
    if (!this._credentials) this._credentials = loadCredentials();
    const { userId, password } = this._credentials;

    const ctrl = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), DEFAULT_TIMEOUT_MS);
    try {
      const { statusCode, headers, body: resBody } = await request(URLS.loginProc, {
        method: 'POST',
        headers: {
          ...HEADERS,
          'Content-Type': 'application/x-www-form-urlencoded',
          Referer: URLS.loginForm,
        },
        body: new URLSearchParams({ userId, password }).toString(),
        dispatcher: this._agent,
        signal: ctrl.signal,
        maxRedirections: 0,
      });

      // The body is not needed; discard it (errors while discarding are ignored).
      await resBody.dump().catch(() => {});

      const rawCookies = headers['set-cookie'];
      const cookieList = Array.isArray(rawCookies) ? rawCookies : rawCookies ? [rawCookies] : [];
      let ssotoken = null;
      for (const c of cookieList) {
        const m = c?.match(/ssotoken=([^;]+)/i);
        if (m) { ssotoken = m[1]; break; }
      }
      if (!ssotoken) {
        throw new Error(`NANET 로그인 실패: ssotoken 없음 (HTTP ${statusCode}). 아이디/비밀번호 확인 필요.`);
      }

      this._ssotoken = ssotoken;
      this._loginTime = Date.now();
    } finally {
      clearTimeout(timer);
    }
  }

  /** Cookie header value carrying only the login token. */
  cookieHeader() { return `ssotoken=${this._ssotoken}`; }

  /**
   * Forget all login state so the next login() starts from scratch.
   *
   * The cached credentials are dropped as well: this method is what runs when the
   * stored credentials change, so the next login() must re-read them instead of
   * re-using the previous user id / password.
   */
  invalidate() {
    this._loginTime = null;
    this._ssotoken = null;
    this._dlSessionReady = false;
    this._dlCookies = '';
    this._credentials = null;
  }

  /**
   * Merge Set-Cookie header lines into the currently stored dl cookies.
   *
   * Each stored pair is split on its FIRST '=' only, so values that themselves
   * contain '=' survive intact. Later cookies replace earlier ones of the same name.
   *
   * @param {string[]} setCookies raw Set-Cookie header values
   * @returns {string} "name=value; name=value" string for a Cookie header
   */
  _mergeDlCookies(setCookies) {
    const jar = new Map();
    for (const pair of this._dlCookies.split(';')) {
      const eq = pair.indexOf('=');
      if (eq <= 0) continue;
      const name = pair.slice(0, eq).trim();
      const value = pair.slice(eq + 1).trim();
      if (name && value) jar.set(name, value);
    }
    for (const header of setCookies) {
      const m = header.match(/^([^=]+)=([^;]*)/);
      if (m) jar.set(m[1].trim(), m[2].trim());
    }
    return [...jar].map(([name, value]) => `${name}=${value}`).join('; ');
  }

  /**
   * Visit the digital-library root once to collect its cookies.
   * Does nothing when that has already succeeded since the last invalidate().
   */
  async ensureDlSession() {
    if (this._dlSessionReady) return;
    const ctrl = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), DEFAULT_TIMEOUT_MS);
    let setCookies;
    try {
      const res = await fetch(`${DL_BASE_URL}/`, {
        headers: { ...HEADERS, Cookie: this.cookieHeader() },
        dispatcher: this._agent,
        signal: ctrl.signal,
      });
      setCookies = res.headers.getSetCookie?.() ?? [];
      // Drain the body while the abort timer is still armed so a stalled body cannot hang.
      await res.text();
    } finally {
      clearTimeout(timer);
    }
    this._dlCookies = this._mergeDlCookies(setCookies);
    this._dlSessionReady = true;
  }

  /** Cookie header value with the login token plus any dl-host cookies. */
  allCookies() {
    return this._dlCookies ? `${this.cookieHeader()}; ${this._dlCookies}` : this.cookieHeader();
  }

  /** Close the underlying agent. */
  async close() { await this._agent.close(); }
}
|
|
172
|
+
|
|
173
|
+
// Module-wide session instance, created on first use.
let _session = null;

/**
 * Return the shared NanetSession, constructing it the first time it is requested.
 * @returns {NanetSession}
 */
function getSession() {
  _session ??= new NanetSession();
  return _session;
}
|
|
178
|
+
|
|
179
|
+
/**
 * Invalidate the existing shared session when the credentials change
 * (per the original note, called from server.js).
 * A no-op when no session has been created yet.
 */
export function invalidateNanetSession() {
  _session?.invalidate();
}
|
|
183
|
+
|
|
184
|
+
// ── HTML 파싱 ─────────────────────────────────────────────────────────
|
|
185
|
+
|
|
186
|
+
/**
 * Extract downloadable records from a NANET search-result HTML page.
 *
 * A record is recognised by a `searchInnerDetail('<controlNo>' ...)` link that has
 * `li_wrap` somewhere in the 800 characters before it. The text between that link
 * and the next one (or the following 2000 characters) is read as '|'-separated
 * metadata. Records without a matching `downloadDoc(this, '<controlNo>', '<n>')`
 * call, or with a count of 0, are dropped.
 *
 * @param {string} html raw search-result page
 * @returns {{ totalCount: number, items: Array<{ controlNo: string, title: string,
 *   publisher: string, publishYear: string, description: string, pdfCount: number }> }}
 *   `totalCount` comes from the page's total marker when present, otherwise it is
 *   the number of items returned.
 */
function parseSearchHtml(html) {
  // Replace tags with spaces and collapse whitespace.
  const stripTags = s => s.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim();

  const items = [];
  const pattern = /searchInnerDetail\('([A-Z0-9]+)'[^)]*\)\s*"?\s*>\s*([\s\S]*?)<\/a>/gi;

  for (const match of html.matchAll(pattern)) {
    const controlNo = match[1];
    const preceding = html.slice(Math.max(0, match.index - 800), match.index);
    if (!preceding.includes('li_wrap')) continue;

    const title = stripTags(match[2]);
    const afterStart = match.index + match[0].length;
    const nextDetail = html.indexOf("searchInnerDetail('", afterStart);
    const rawChunk = html.slice(afterStart, nextDetail > 0 ? nextDetail : afterStart + 2000);

    // Metadata is taken only from the part before the button/action area, so button
    // labels never leak into it. A marker at offset 0 means there is no metadata at all.
    const btnCut = rawChunk.search(/<button|li_buttom_wrap|btn_bd_blue/);
    const metaChunk = btnCut >= 0 ? rawChunk.slice(0, btnCut) : rawChunk;
    const metaParts = stripTags(metaChunk).split('|').map(s => s.trim()).filter(Boolean);

    // First part that is exactly four digits is the publication year.
    const yearIdx = metaParts.findIndex(p => /^\d{4}$/.test(p));
    const publishYear = yearIdx >= 0 ? metaParts[yearIdx] : '';

    // Publisher: the part right before the year when the year is third or later,
    // otherwise the third part if there is one.
    let publisherIdx = -1;
    if (yearIdx >= 2) publisherIdx = yearIdx - 1;
    else if (metaParts.length >= 3) publisherIdx = 2;
    const publisher = publisherIdx >= 0 ? metaParts[publisherIdx] : '';

    // Everything except the year and the part actually used as publisher (and except
    // page counts, "PDF" and volume markers) becomes the description.
    const description = metaParts
      .filter((p, i) => i !== yearIdx && i !== publisherIdx && !/^\d+p$|^PDF$|^v\.\d/i.test(p))
      .join(' · ');

    // The download button's onclick carries the PDF count; search the full chunk
    // (including the button area). controlNo is alphanumeric only (see `pattern`),
    // so it is safe to embed in the expression.
    const dlBtnMatch = rawChunk.match(new RegExp(`downloadDoc\\(this,\\s*'${controlNo}',\\s*'(\\d+)'`));
    const pdfCount = dlBtnMatch ? parseInt(dlBtnMatch[1], 10) : 0;

    // No download button (or a zero count): nothing to download, skip the record.
    if (pdfCount === 0) continue;

    items.push({ controlNo, title, publisher, publishYear, description, pdfCount });
  }

  const totalMatch = html.match(/전체\s*\(\s*([\d,]+)\s*\)/) || html.match(/총\s*([\d,]+)\s*건/);
  const totalCount = totalMatch ? Number(totalMatch[1].replace(/,/g, '')) : items.length;
  return { totalCount, items };
}
|
|
237
|
+
|
|
238
|
+
// ── API 함수 ──────────────────────────────────────────────────────────
|
|
239
|
+
|
|
240
|
+
/**
 * Fetch the list of PDF files attached to one record.
 *
 * @param {string} controlNo record identifier
 * @returns {Promise<Array<{ itemNo: string, fileName: string, fileSize: number }>>}
 *   empty when the HTTP status is not OK, the body is not JSON, or no array is found
 * @throws {Error} when the server answers with an HTML page (treated as an expired
 *   session; the session is invalidated first)
 */
async function fetchPDFInfoList(controlNo) {
  const session = getSession();
  await session.login();
  await session._throttle();

  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), DEFAULT_TIMEOUT_MS);
  try {
    const res = await fetch(URLS.pdfInfoList, {
      method: 'POST',
      headers: {
        ...JSON_HEADERS,
        Cookie: session.cookieHeader(),
        Referer: `${DL_BASE_URL}/search/searchInnerDetail.do`,
      },
      body: JSON.stringify({ controlNo }),
      dispatcher: session._agent,
      signal: ctrl.signal,
    });
    const text = await res.text();
    if (!res.ok) return [];

    // An HTML body on HTTP 200 means the session has expired. Leading whitespace/BOM
    // and tag case are tolerated so such a page is never mistaken for "no PDF files".
    if (/^\s*<(?:!|html)/i.test(text) || text.includes('loginForm')) {
      session.invalidate();
      throw new Error('NANET 세션 만료: 서버가 HTML 페이지를 반환했습니다');
    }

    try {
      const json = JSON.parse(text);
      // The list may sit under several keys, or be the top-level value itself.
      const list = json.pdfList || json.list || json.data || json;
      if (!Array.isArray(list)) return [];
      return list.map(item => ({
        itemNo: String(item.itemNo || item.ITEM_NO || item.seq || '1'),
        fileName: item.fileName || item.FILE_NM || item.orgFileNm || `${controlNo}.pdf`,
        fileSize: Number(item.fileSize || item.FILE_SZ || item.fileSz || 0),
      }));
    } catch {
      // Unparseable or unexpectedly shaped body: treated as "no files".
      return [];
    }
  } finally {
    clearTimeout(timer);
  }
}
|
|
280
|
+
|
|
281
|
+
/**
 * Run one NANET search request and parse the resulting HTML page.
 *
 * @param {string} query search text
 * @param {{ pageNo?: number }} [options] 1-based page number; page 1 sends an empty `pageNum`
 * @returns {Promise<{ totalCount: number, items: object[] }>} result of parseSearchHtml
 * @throws {Error} on a non-OK HTTP status or a short server error page
 */
async function searchPage(query, { pageNo = 1 } = {}) {
  const session = getSession();
  await session.login();
  await session._throttle();
  await session.ensureDlSession();

  // Mirrors the parameter set of the site's own search URL.
  // NOTE(review): the original comment described searchMehtod=L, but the value sent
  // is 'F' — confirm which one is intended.
  const zone = 'ALL_NI_TOC';
  const queryText = `${query}:${zone}:AND`;
  const params = new URLSearchParams({
    searchType: 'INNER_SEARCH',
    resultType: 'INNER_SEARCH_LIST',
    searchMehtod: 'F',
    searchClass: 'S',
    controlNo: '',
    queryText,
    prevQueryText: queryText,
    zone,
    fieldText: '',
    prevPubYearFieldText: '',
    languageCode: '',
    synonymYn: '',
    refineSearchYn: '',
    ddcPopSearchYn: '',
    pageNum: pageNo <= 1 ? '' : String(pageNo),
    pageSize: String(PAGE_SIZE),
    orderBy: 'WEIGHT',
    topMainMenuCode: '',
    topSubMenuCode: '',
    totalSize: '',
    totalSizeByMenu: '',
    seqNo: '',
    hanjaYn: 'Y',
    knowPub: '',
    isdb: '',
    isdbsvc: '',
    tt1: '',
    down: '',
    frgnLangMtrlYn: '',
    targetLangCode: '',
    checkedDbIdList: '',
    baseDbId: '',
    selectedDbIndexIdList: '',
    caller: '',
    asideState: '',
    dpBranch: 'ALL',
    journalKind: '',
    degreeDiv: '',
    selZone: zone,
    searchQuery: query,
  });

  const requestHeaders = {
    ...HEADERS,
    Cookie: session.allCookies(),
    Referer: URLS.searchInnerList.replace('/searchInnerList.do', '/'),
  };

  const ctrl = new AbortController();
  const timer = setTimeout(() => ctrl.abort(), DEFAULT_TIMEOUT_MS);
  try {
    const res = await fetch(`${URLS.searchInnerList}?${params}`, {
      method: 'GET',
      headers: requestHeaders,
      dispatcher: session._agent,
      signal: ctrl.signal,
    });
    const html = await res.text();
    if (!res.ok) {
      throw new Error(`NANET 검색 HTTP ${res.status}`);
    }
    // A short page containing the error phrase is the server's error page.
    const isErrorPage = html.length < 5000 && html.includes('오류가 발생했습니다');
    if (isErrorPage) {
      throw new Error('NANET 서버 오류 페이지');
    }
    return parseSearchHtml(html);
  } finally {
    clearTimeout(timer);
  }
}
|
|
330
|
+
|
|
331
|
+
// ── Skill 클래스 ──────────────────────────────────────────────────────
|
|
332
|
+
|
|
333
|
+
/**
 * Skill that discovers, searches and downloads documents from NANET.
 */
export class NanetSkill extends Skill {
  static manifest = {
    id: 'nanet',
    name: '국회도서관 (NANET)',
    version: '1.0.0',
    defaultCron: '0 3 * * *',
    defaultKeywords: [
      '도시계획', '도시재생', '주거정책', '교통계획', '환경계획', '스마트시티',
      '도시개발', '토지이용계획', '공간계획', '주택공급',
    ],
    rateLimit: { rps: 1, burst: 1 },
    // Credentials come from the settings store or environment variables (see
    // loadCredentials), so no environment variable is strictly required.
    requiredEnv: [],
    capabilities: ['discover', 'download', 'normalize', 'search'],
  };

  /** Static tuning values exposed by this skill. */
  get meta() {
    return {
      sessionTtlMs: SESSION_TTL_MS,
      minRequestIntervalMs: REQUEST_DELAY_MS,
      earlyStopConsecutiveEmpty: 5,
      supportsRangeRequest: false,
    };
  }

  /** Make sure the shared session is logged in. */
  async ensureSession() {
    await getSession().login();
  }

  /** Force-invalidate the shared session (per the original note, the downloader calls this when it detects an HTML response). */
  invalidateSession() {
    getSession().invalidate();
  }

  /**
   * Search each keyword (first result page only) and expand every hit into one
   * normalized item per attached PDF file.
   *
   * Errors for one keyword are logged and that keyword is abandoned; the remaining
   * keywords are still processed.
   *
   * @param {{ keywords?: string[], limit?: number }} options
   * @returns {Promise<object[]>} at most `limit` items, unique by `docid`
   */
  async discover({ keywords = [], limit = 50 }) {
    const results = [];
    const seen = new Set();

    for (const kw of keywords) {
      if (results.length >= limit) break;
      try {
        const { items } = await searchPage(kw);
        for (const searchItem of items) {
          if (results.length >= limit) break;
          const pdfFiles = await fetchPDFInfoList(searchItem.controlNo);
          // When no file list is available, still emit one item for the record itself.
          const files = pdfFiles.length > 0 ? pdfFiles : [null];
          for (const pdfFile of files) {
            // Re-check here too: a record with several PDFs must not overshoot the limit.
            if (results.length >= limit) break;
            const item = normalizeNanetItem(searchItem, pdfFile, kw);
            if (!seen.has(item.docid)) {
              seen.add(item.docid);
              results.push(item);
            }
          }
        }
      } catch (err) {
        console.warn(`[nanet] discover 오류 (${kw}): ${err.message}`);
      }
    }
    return results;
  }

  /**
   * Single-query search intended for interactive use: returns the normalized
   * search hits without looking up each record's PDF list.
   *
   * @param {string} query
   * @param {{ limit?: number }} [options]
   * @returns {Promise<object[]>}
   */
  async search(query, { limit = 20 } = {}) {
    const { items } = await searchPage(query);
    const seenDocid = new Set();
    const seenFile = new Set();
    return items
      .map(item => normalizeNanetItem(item, null, query))
      .filter(item => {
        // NOTE(review): items are also deduplicated on fileName|fileSize; confirm that
        // normalizeNanetItem yields distinct values here when no pdfFile is supplied.
        const fileKey = `${item.fileName}|${item.fileSize}`;
        if (seenDocid.has(item.docid) || seenFile.has(fileKey)) return false;
        seenDocid.add(item.docid);
        seenFile.add(fileKey);
        return true;
      })
      .slice(0, limit);
  }

  /**
   * Open the download response for one item.
   *
   * Follows the rule described in the original notes for the site's own script:
   * with exactly one PDF the file is requested without `itemNo`; with several, the
   * PDF list is fetched and the first entry's `itemNo` is used.
   *
   * @param {object} item normalized item; reads `controlNo`, `linkSystemId`
   *   (default 'NADL'), `pdfCount` (default 1), `itemNo`, `docid`
   * @param {{ signal?: AbortSignal }} [options] when a signal is given it is used
   *   as-is and no internal timeout is installed; otherwise the request is aborted
   *   after 300 seconds if no response has arrived
   * @returns {Promise<Response>} the fetch response (body not consumed)
   * @throws {Error} `reason: 'link_missing'` with `nonRetryable: true` when there is
   *   nothing to download; `reason: 'session_expired'` on HTTP 401
   */
  async openDownloadResponse(item, { signal } = {}) {
    const session = getSession();
    await session.login();
    await session._throttle();
    await session.ensureDlSession();

    const { controlNo, linkSystemId = 'NADL' } = item;
    if (!controlNo) throw new Error(`NANET 다운로드: controlNo 없음 (docid=${item.docid})`);

    const pdfCount = Number(item.pdfCount ?? 1);
    let itemNo = item.itemNo || '';

    if (pdfCount === 0) {
      throw Object.assign(
        new Error(`NANET 원문 미제공: 실물소장 또는 열람전용 자료입니다 (controlNo=${controlNo})`),
        { nonRetryable: true, reason: 'link_missing' }
      );
    }

    if (!itemNo) {
      if (pdfCount > 1) {
        // Several files: take the first itemNo from the PDF list.
        const pdfFiles = await fetchPDFInfoList(controlNo);
        if (pdfFiles.length > 0) {
          itemNo = pdfFiles[0].itemNo;
        } else {
          throw Object.assign(
            new Error(`NANET 원문 없음: PDF 목록 조회 실패 (controlNo=${controlNo})`),
            { nonRetryable: true, reason: 'link_missing' }
          );
        }
      }
      // Otherwise: download directly without itemNo.
    }

    let url = `${URLS.fileDownload}?linkSystemId=${encodeURIComponent(linkSystemId)}&controlNo=${encodeURIComponent(controlNo)}`;
    if (itemNo) url += `&itemNo=${encodeURIComponent(itemNo)}`;

    const ctrl = signal ? null : new AbortController();
    const timer = ctrl ? setTimeout(() => ctrl.abort(), 300_000) : null;
    try {
      const res = await fetch(url, {
        method: 'GET',
        headers: {
          ...HEADERS,
          Cookie: session.allCookies(),
          Referer: `${DL_BASE_URL}/search/searchInnerDetail.do`,
        },
        dispatcher: session._agent,
        redirect: 'follow',
        signal: signal ?? ctrl.signal,
      });
      if (res.status === 401) {
        session.invalidate();
        // No nonRetryable flag: per the original note the downloader is expected to
        // back off and retry after a session expiry.
        throw Object.assign(new Error('NANET 세션 만료'), { reason: 'session_expired' });
      }
      return res;
    } finally {
      if (timer) clearTimeout(timer);
    }
  }

  /** Thin wrapper around normalizeNanetItem. */
  normalize(searchItem, pdfFile = null, query = '') {
    return normalizeNanetItem(searchItem, pdfFile, query);
  }
}
|
|
484
|
+
|
|
485
|
+
export default NanetSkill;
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PRISM (정책연구정보서비스) 수집 플러그인
|
|
3
|
+
*
|
|
4
|
+
* prism-dl/src/api.js 를 ESM + Skill 클래스로 포팅
|
|
5
|
+
* SSL 인증서 오류 대응: undici Agent { rejectUnauthorized: false }
|
|
6
|
+
*/
|
|
7
|
+
import { Agent, fetch } from 'undici';
|
|
8
|
+
import { Skill } from './base.js';
|
|
9
|
+
import { normalizePrismItem } from './_normalize.js';
|
|
10
|
+
|
|
11
|
+
// PRISM API endpoints.
const SEARCH_URL = 'https://api.prism.go.kr/prism-be-prtl/search/totalSearch.do';
const DOWNLOAD_URL = 'https://api.prism.go.kr/prism-be-asmt/v1/progress/download-file';

// Headers sent with every PRISM request (JSON body, browser-like origin/referer).
const HEADERS = {
  Origin: 'https://www.prism.go.kr',
  Referer: 'https://www.prism.go.kr/homepage/prtl/totalsearch/list',
  'Content-Type': 'application/json',
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
};

// Shared undici agent. Certificate verification is switched off; the file header
// gives SSL certificate errors on the PRISM side as the reason.
const dispatcher = new Agent({
  connect: {
    rejectUnauthorized: false,
    connectTimeout: 15_000,
  },
  connections: 10,
  pipelining: 1,
  keepAliveTimeout: 30_000,
  keepAliveMaxTimeout: 60_000,
});
|
|
27
|
+
|
|
28
|
+
/**
 * POST a JSON payload and return the parsed JSON response.
 *
 * @param {string} url
 * @param {object} payload serialised with JSON.stringify
 * @param {{ timeoutMs?: number, signal?: AbortSignal }} [options] when `signal` is
 *   given it is used as-is and `timeoutMs` is not applied; otherwise the request is
 *   aborted after `timeoutMs`
 * @returns {Promise<any>} parsed response body
 * @throws {Error} on a non-OK status or a body that is not valid JSON (the message
 *   carries the first 200 characters of the body)
 */
async function fetchJson(url, payload, { timeoutMs = 10_000, signal } = {}) {
  let ctrl = null;
  let timer = null;
  if (!signal) {
    ctrl = new AbortController();
    timer = setTimeout(() => ctrl.abort(), timeoutMs);
  }

  try {
    const res = await fetch(url, {
      method: 'POST',
      headers: HEADERS,
      body: JSON.stringify(payload),
      dispatcher,
      signal: signal ?? ctrl.signal,
    });
    const text = await res.text();
    if (!res.ok) {
      throw new Error(`PRISM HTTP ${res.status}: ${text.slice(0, 200)}`);
    }
    try {
      return JSON.parse(text);
    } catch {
      throw new Error(`PRISM JSON 파싱 실패: ${text.slice(0, 200)}`);
    }
  } finally {
    if (timer) clearTimeout(timer);
  }
}
|
|
46
|
+
|
|
47
|
+
/**
 * Query the PRISM total-search endpoint for the `report` collection.
 *
 * @param {string} query
 * @param {{ startCount?: number, listCount?: number }} [options] `listCount` is capped at 1000
 * @returns {Promise<{ totalCount: number, items: object[] }>} zero / empty when the
 *   response has no `report` section or no result list
 */
async function searchPage(query, { startCount = 0, listCount = 100 } = {}) {
  const payload = {
    query,
    collection: 'report',
    startCount,
    listCount: Math.min(listCount, 1000),
    jrsdInstGrntNo: '',
    hghrkFwkClsfSysId: '',
    asmtOtln: '',
    kywdCn: '',
    rptpDtlCn: '',
    thssSmryCn: '',
  };
  const response = await fetchJson(SEARCH_URL, payload);
  const { totalCount, searchResultList } = response.report || {};
  return {
    totalCount: Number(totalCount || 0),
    items: Array.isArray(searchResultList) ? searchResultList : [],
  };
}
|
|
60
|
+
|
|
61
|
+
/**
 * Skill that discovers, searches and downloads reports from PRISM.
 */
export class PrismSkill extends Skill {
  static manifest = {
    id: 'prism',
    name: '정책연구정보서비스 (PRISM)',
    version: '1.0.0',
    defaultCron: '0 2 * * *',
    defaultKeywords: [
      '도시계획', '도시재생', '주거', '교통', '환경', '스마트시티',
      '토지이용', '도시정책', '공간정책', '주택정책',
    ],
    rateLimit: { rps: 1, burst: 3 },
    requiredEnv: [],
    capabilities: ['discover', 'download', 'normalize', 'search'],
  };

  /**
   * Search PRISM for each keyword and return normalized items.
   *
   * Each request asks only for as many rows as are still needed (at most 100).
   * Errors for one keyword are logged and the remaining keywords are still processed.
   *
   * @param {{ keywords?: string[], limit?: number }} options
   * @returns {Promise<object[]>} items unique by `docid`
   */
  async discover({ keywords = [], limit = 50 }) {
    const results = [];
    const seen = new Set();

    for (const kw of keywords) {
      if (results.length >= limit) break;
      try {
        const { items } = await searchPage(kw, { listCount: Math.min(limit - results.length, 100) });
        for (const raw of items) {
          const item = this.normalize(raw, kw);
          if (!seen.has(item.docid)) {
            seen.add(item.docid);
            results.push(item);
          }
        }
      } catch (err) {
        console.warn(`[prism] discover 오류 (${kw}): ${err.message}`);
      }
    }
    return results;
  }

  /**
   * Single-query PRISM search intended for interactive use.
   *
   * @param {string} query
   * @param {{ limit?: number }} [options]
   * @returns {Promise<object[]>} at most `limit` normalized items, unique by
   *   `docid` and by `fileName|fileSize`
   */
  async search(query, { limit = 20 } = {}) {
    const { items } = await searchPage(query, { listCount: Math.min(limit, 100) });
    const seenDocid = new Set();
    const seenFile = new Set();
    return items
      .map(raw => this.normalize(raw, query))
      .filter(item => {
        const fileKey = `${item.fileName}|${item.fileSize}`;
        if (seenDocid.has(item.docid) || seenFile.has(fileKey)) return false;
        seenDocid.add(item.docid);
        seenFile.add(fileKey);
        return true;
      })
      // Enforce the limit locally as well instead of relying on the requested row count.
      .slice(0, limit);
  }

  /**
   * Open the download response for one item.
   *
   * @param {object} item reads `asmtId`, `fileTypeCd`, `fileSn`, `fileWkky` and
   *   `pdfTrsfYn` (default 'Y')
   * @param {{ signal?: AbortSignal }} [options] when a signal is given it is used
   *   as-is and no internal timeout is installed; otherwise the request is aborted
   *   after 120 seconds if no response has arrived
   * @returns {Promise<Response>} the fetch response (status not checked, body not consumed)
   */
  async openDownloadResponse(item, { signal } = {}) {
    const payload = {
      asmtId: item.asmtId,
      fileTypeCd: item.fileTypeCd,
      fileSn: item.fileSn,
      fileWkky: item.fileWkky,
      pdfTrsfYn: item.pdfTrsfYn || 'Y',
    };
    const ctrl = signal ? null : new AbortController();
    const timer = ctrl ? setTimeout(() => ctrl.abort(), 120_000) : null;
    try {
      return await fetch(DOWNLOAD_URL, {
        method: 'POST',
        headers: HEADERS,
        body: JSON.stringify(payload),
        dispatcher,
        signal: signal ?? ctrl.signal,
      });
    } finally {
      if (timer) clearTimeout(timer);
    }
  }

  /** Thin wrapper around normalizePrismItem. */
  normalize(rawItem, query = '') {
    return normalizePrismItem(rawItem, query);
  }
}
|
|
149
|
+
|
|
150
|
+
export default PrismSkill;
|