@nahisaho/katashiro-collector 0.2.2 → 0.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/ActionExecutor.d.ts +85 -0
- package/dist/browser/ActionExecutor.d.ts.map +1 -0
- package/dist/browser/ActionExecutor.js +171 -0
- package/dist/browser/ActionExecutor.js.map +1 -0
- package/dist/browser/BrowserAutomation.d.ts +147 -0
- package/dist/browser/BrowserAutomation.d.ts.map +1 -0
- package/dist/browser/BrowserAutomation.js +463 -0
- package/dist/browser/BrowserAutomation.js.map +1 -0
- package/dist/browser/ContentExtractor.d.ts +54 -0
- package/dist/browser/ContentExtractor.d.ts.map +1 -0
- package/dist/browser/ContentExtractor.js +159 -0
- package/dist/browser/ContentExtractor.js.map +1 -0
- package/dist/browser/SessionManager.d.ts +67 -0
- package/dist/browser/SessionManager.d.ts.map +1 -0
- package/dist/browser/SessionManager.js +173 -0
- package/dist/browser/SessionManager.js.map +1 -0
- package/dist/browser/index.d.ts +17 -0
- package/dist/browser/index.d.ts.map +1 -0
- package/dist/browser/index.js +17 -0
- package/dist/browser/index.js.map +1 -0
- package/dist/browser/types.d.ts +361 -0
- package/dist/browser/types.d.ts.map +1 -0
- package/dist/browser/types.js +23 -0
- package/dist/browser/types.js.map +1 -0
- package/dist/document/DocumentParser.d.ts +91 -0
- package/dist/document/DocumentParser.d.ts.map +1 -0
- package/dist/document/DocumentParser.js +234 -0
- package/dist/document/DocumentParser.js.map +1 -0
- package/dist/document/index.d.ts +11 -0
- package/dist/document/index.d.ts.map +1 -0
- package/dist/document/index.js +10 -0
- package/dist/document/index.js.map +1 -0
- package/dist/document/parsers/DOCXParser.d.ts +63 -0
- package/dist/document/parsers/DOCXParser.d.ts.map +1 -0
- package/dist/document/parsers/DOCXParser.js +362 -0
- package/dist/document/parsers/DOCXParser.js.map +1 -0
- package/dist/document/parsers/PDFParser.d.ts +60 -0
- package/dist/document/parsers/PDFParser.d.ts.map +1 -0
- package/dist/document/parsers/PDFParser.js +338 -0
- package/dist/document/parsers/PDFParser.js.map +1 -0
- package/dist/document/parsers/XLSXParser.d.ts +55 -0
- package/dist/document/parsers/XLSXParser.d.ts.map +1 -0
- package/dist/document/parsers/XLSXParser.js +314 -0
- package/dist/document/parsers/XLSXParser.js.map +1 -0
- package/dist/document/parsers/index.d.ts +10 -0
- package/dist/document/parsers/index.d.ts.map +1 -0
- package/dist/document/parsers/index.js +10 -0
- package/dist/document/parsers/index.js.map +1 -0
- package/dist/document/types.d.ts +251 -0
- package/dist/document/types.d.ts.map +1 -0
- package/dist/document/types.js +13 -0
- package/dist/document/types.js.map +1 -0
- package/dist/index.d.ts +7 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +14 -2
- package/dist/index.js.map +1 -1
- package/dist/research/CoverageAnalyzer.d.ts +50 -0
- package/dist/research/CoverageAnalyzer.d.ts.map +1 -0
- package/dist/research/CoverageAnalyzer.js +169 -0
- package/dist/research/CoverageAnalyzer.js.map +1 -0
- package/dist/research/QueryPlanner.d.ts +57 -0
- package/dist/research/QueryPlanner.d.ts.map +1 -0
- package/dist/research/QueryPlanner.js +102 -0
- package/dist/research/QueryPlanner.js.map +1 -0
- package/dist/research/ResultAggregator.d.ts +39 -0
- package/dist/research/ResultAggregator.d.ts.map +1 -0
- package/dist/research/ResultAggregator.js +85 -0
- package/dist/research/ResultAggregator.js.map +1 -0
- package/dist/research/WideResearchEngine.d.ts +110 -0
- package/dist/research/WideResearchEngine.d.ts.map +1 -0
- package/dist/research/WideResearchEngine.js +330 -0
- package/dist/research/WideResearchEngine.js.map +1 -0
- package/dist/research/agents/AcademicSearchAgent.d.ts +57 -0
- package/dist/research/agents/AcademicSearchAgent.d.ts.map +1 -0
- package/dist/research/agents/AcademicSearchAgent.js +180 -0
- package/dist/research/agents/AcademicSearchAgent.js.map +1 -0
- package/dist/research/agents/EncyclopediaAgent.d.ts +49 -0
- package/dist/research/agents/EncyclopediaAgent.d.ts.map +1 -0
- package/dist/research/agents/EncyclopediaAgent.js +153 -0
- package/dist/research/agents/EncyclopediaAgent.js.map +1 -0
- package/dist/research/agents/NewsSearchAgent.d.ts +38 -0
- package/dist/research/agents/NewsSearchAgent.d.ts.map +1 -0
- package/dist/research/agents/NewsSearchAgent.js +146 -0
- package/dist/research/agents/NewsSearchAgent.js.map +1 -0
- package/dist/research/agents/WebSearchAgent.d.ts +45 -0
- package/dist/research/agents/WebSearchAgent.d.ts.map +1 -0
- package/dist/research/agents/WebSearchAgent.js +135 -0
- package/dist/research/agents/WebSearchAgent.js.map +1 -0
- package/dist/research/agents/index.d.ts +13 -0
- package/dist/research/agents/index.d.ts.map +1 -0
- package/dist/research/agents/index.js +12 -0
- package/dist/research/agents/index.js.map +1 -0
- package/dist/research/agents/types.d.ts +60 -0
- package/dist/research/agents/types.d.ts.map +1 -0
- package/dist/research/agents/types.js +9 -0
- package/dist/research/agents/types.js.map +1 -0
- package/dist/research/index.d.ts +16 -0
- package/dist/research/index.d.ts.map +1 -0
- package/dist/research/index.js +17 -0
- package/dist/research/index.js.map +1 -0
- package/dist/research/types.d.ts +206 -0
- package/dist/research/types.d.ts.map +1 -0
- package/dist/research/types.js +33 -0
- package/dist/research/types.js.map +1 -0
- package/package.json +1 -1
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ContentExtractor - extracts content from a page
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-COLLECT-009
|
|
5
|
+
* @design DES-COLLECT-009-BrowserAutomation
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Extracts content from a browser page.
 *
 * Every method receives a `page` object and relies only on the methods it
 * calls on it: `evaluate(script)`, `$eval(selector, fn, ...args)` and
 * `$$eval(selector, fn, ...args)`.
 */
export class ContentExtractor {
    /**
     * Collect the main text, full HTML, links, images, metadata and (optionally)
     * custom selector-based data from the page.
     *
     * The individual reads do not depend on one another, so they are started
     * together instead of being awaited one after the other.
     *
     * @param {object} page - Page handle exposing `evaluate`, `$eval` and `$$eval`.
     * @param {Array<object>} [extractors] - Custom extractor configs; see
     *   `extractCustomData` for the fields that are read.
     * @returns {Promise<object>} `{ content, html, extractedData, links, images, metadata }`;
     *   `extractedData` is `undefined` when no extractors were supplied.
     */
    async extract(page, extractors) {
        const hasExtractors = Boolean(extractors) && extractors.length > 0;
        const [content, html, links, images, metadata, extractedData] = await Promise.all([
            this.extractMainContent(page),
            this.extractHtml(page),
            this.extractLinks(page),
            this.extractImages(page),
            this.extractMetadata(page),
            hasExtractors ? this.extractCustomData(page, extractors) : undefined,
        ]);
        return {
            content,
            html,
            extractedData,
            links,
            images,
            metadata,
        };
    }

    /**
     * Extract the page's main text: the body text with script/style/navigation
     * style elements removed and runs of whitespace collapsed to single spaces.
     *
     * @param {object} page - Page handle exposing `evaluate`.
     * @returns {Promise<string>} The cleaned text, or `''` when the document has
     *   no body or no text.
     */
    async extractMainContent(page) {
        // The script works on a clone so the live DOM is never modified.
        // `\\s` is escaped because this is a template literal: the page receives `\s`.
        return page.evaluate(`
            (() => {
                if (!document.body) {
                    return '';
                }
                const clone = document.body.cloneNode(true);
                const removeSelectors = ['script', 'style', 'nav', 'footer', 'header', 'aside', 'noscript'];
                removeSelectors.forEach(sel => {
                    clone.querySelectorAll(sel).forEach(el => el.remove());
                });
                return clone.textContent?.replace(/\\s+/g, ' ').trim() || '';
            })()
        `);
    }

    /**
     * Get the HTML of the whole document.
     *
     * @param {object} page - Page handle exposing `evaluate`.
     * @returns {Promise<string>} `document.documentElement.outerHTML`.
     */
    async extractHtml(page) {
        return page.evaluate('document.documentElement.outerHTML');
    }

    /**
     * Extract every anchor that has an `href` attribute.
     *
     * @param {object} page - Page handle exposing `evaluate`.
     * @returns {Promise<Array<object>>} One `{ href, text, rel }` entry per anchor;
     *   `rel` is `undefined` when empty.
     */
    async extractLinks(page) {
        return page.evaluate(`
            (() => {
                const anchors = document.querySelectorAll('a[href]');
                return Array.from(anchors).map(a => ({
                    href: a.href,
                    text: a.textContent?.trim() || '',
                    rel: a.rel || undefined,
                }));
            })()
        `);
    }

    /**
     * Extract every image that has a `src` attribute.
     *
     * @param {object} page - Page handle exposing `evaluate`.
     * @returns {Promise<Array<object>>} One `{ src, alt, width, height }` entry per
     *   image; empty or zero values are reported as `undefined`.
     */
    async extractImages(page) {
        return page.evaluate(`
            (() => {
                const imgs = document.querySelectorAll('img[src]');
                return Array.from(imgs).map(img => ({
                    src: img.src,
                    alt: img.alt || undefined,
                    width: img.naturalWidth || undefined,
                    height: img.naturalHeight || undefined,
                }));
            })()
        `);
    }

    /**
     * Extract document metadata: description, keywords, Open Graph properties
     * and the document language.
     *
     * @param {object} page - Page handle exposing `evaluate`.
     * @returns {Promise<object>} `{ description, keywords, ogp, language }`; each
     *   field is `undefined` when the page does not provide it. `ogp` keys have
     *   the `og:` prefix removed.
     */
    async extractMetadata(page) {
        return page.evaluate(`
            (() => {
                const getMeta = (name) => {
                    const el = document.querySelector('meta[name="' + name + '"], meta[property="' + name + '"]');
                    return el?.getAttribute('content') || undefined;
                };

                const ogp = {};
                document.querySelectorAll('meta[property^="og:"]').forEach(el => {
                    const property = el.getAttribute('property');
                    const content = el.getAttribute('content');
                    if (property && content) {
                        ogp[property.replace('og:', '')] = content;
                    }
                });

                return {
                    description: getMeta('description'),
                    keywords: getMeta('keywords')?.split(',').map(k => k.trim()),
                    ogp: Object.keys(ogp).length > 0 ? ogp : undefined,
                    language: document.documentElement.lang || undefined,
                };
            })()
        `);
    }

    /**
     * Run caller-supplied extractors against the page.
     *
     * For each extractor the fields `name`, `selector`, `attribute` and
     * `multiple` are read. With `multiple` set, every match is read through
     * `$$eval`; otherwise the first match is read through `$eval`. The named
     * attribute is returned when `attribute` is given, the trimmed text content
     * otherwise.
     *
     * @param {object} page - Page handle exposing `$eval` and `$$eval`.
     * @param {Array<object>} extractors - Extractor configs, processed in order.
     * @returns {Promise<object>} Extracted values keyed by extractor name. An
     *   extractor whose lookup throws yields `null` instead of failing the run.
     */
    async extractCustomData(page, extractors) {
        const entries = [];
        for (const extractor of extractors) {
            let value;
            try {
                // The callbacks must stay self-contained: they are handed to the
                // page API and cannot rely on variables from this scope.
                if (extractor.multiple) {
                    value = await page.$$eval(extractor.selector, (els, attr) => els.map((el) => attr ? el.getAttribute(attr) : el.textContent?.trim()), extractor.attribute);
                }
                else {
                    value = await page.$eval(extractor.selector, (el, attr) => attr ? el.getAttribute(attr) : el.textContent?.trim(), extractor.attribute);
                }
            }
            catch {
                // Deliberate best effort: one failing extractor must not abort the rest.
                value = null;
            }
            entries.push([extractor.name, value]);
        }
        // Object.fromEntries defines own properties, so a name such as
        // "__proto__" is stored as data instead of changing the result's prototype.
        return Object.fromEntries(entries);
    }

    /**
     * Extract the trimmed text of the first element matching `selector`.
     *
     * @param {object} page - Page handle exposing `$eval`.
     * @param {string} selector - Selector passed to `$eval`.
     * @returns {Promise<string>} The trimmed text, or `''` when it is empty.
     */
    async extractText(page, selector) {
        return page.$eval(selector, (el) => el.textContent?.trim() || '');
    }

    /**
     * Extract one attribute of the first element matching `selector`.
     *
     * @param {object} page - Page handle exposing `$eval`.
     * @param {string} selector - Selector passed to `$eval`.
     * @param {string} attribute - Attribute name passed to `getAttribute`.
     * @returns {Promise<*>} Whatever `getAttribute` returns for that element.
     */
    async extractAttribute(page, selector, attribute) {
        return page.$eval(selector, (el, attr) => el.getAttribute(attr), attribute);
    }

    /**
     * Extract the trimmed text of every element matching `selector`.
     *
     * @param {object} page - Page handle exposing `$$eval`.
     * @param {string} selector - Selector passed to `$$eval`.
     * @returns {Promise<Array<string>>} One entry per match; empty text becomes `''`.
     */
    async extractAllText(page, selector) {
        return page.$$eval(selector, (els) => els.map((el) => el.textContent?.trim() || ''));
    }
}
|
|
159
|
+
//# sourceMappingURL=ContentExtractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ContentExtractor.js","sourceRoot":"","sources":["../../src/browser/ContentExtractor.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAWH;;GAEG;AACH,MAAM,OAAO,gBAAgB;IAC3B;;OAEG;IACH,KAAK,CAAC,OAAO,CACX,IAAiB,EACjB,UAA8B;QAE9B,cAAc;QACd,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,CAAC;QAEpD,YAAY;QACZ,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,CAAC;QAE1C,SAAS;QACT,MAAM,KAAK,GAAG,MAAM,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC;QAE5C,QAAQ;QACR,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;QAE9C,WAAW;QACX,MAAM,QAAQ,GAAG,MAAM,IAAI,CAAC,eAAe,CAAC,IAAI,CAAC,CAAC;QAElD,SAAS;QACT,IAAI,aAAkD,CAAC;QACvD,IAAI,UAAU,IAAI,UAAU,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxC,aAAa,GAAG,MAAM,IAAI,CAAC,iBAAiB,CAAC,IAAI,EAAE,UAAU,CAAC,CAAC;QACjE,CAAC;QAED,OAAO;YACL,OAAO;YACP,IAAI;YACJ,aAAa;YACb,KAAK;YACL,MAAM;YACN,QAAQ;SACT,CAAC;IACJ,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,kBAAkB,CAAC,IAAiB;QAChD,OAAO,IAAI,CAAC,QAAQ,CAAC;;;;;;;;;KASpB,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,WAAW,CAAC,IAAiB;QACzC,OAAO,IAAI,CAAC,QAAQ,CAAC,oCAAoC,CAAC,CAAC;IAC7D,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,YAAY,CAAC,IAAiB;QAC1C,OAAO,IAAI,CAAC,QAAQ,CAAC;;;;;;;;;KASpB,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,aAAa,CAAC,IAAiB;QAC3C,OAAO,IAAI,CAAC,QAAQ,CAAC;;;;;;;;;;KAUpB,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,eAAe,CAAC,IAAiB;QAC7C,OAAO,IAAI,CAAC,QAAQ,CAAC;;;;;;;;;;;;;;;;;;;;;;;KAuBpB,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,iBAAiB,CAC7B,IAAiB,EACjB,UAA6B;QAE7B,MAAM,aAAa,GAA4B,EAAE,CAAC;QAElD,KAAK,MAAM,SAAS,IAAI,UAAU,EAAE,CAAC;YACnC,IAAI,CAAC;gBACH,IAAI,SAAS,CAAC,QAAQ,EAAE,CAAC;oBACvB,aAAa,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,MAAM,IAAI,CAAC,MAAM,CAC/C,SAAS,CAAC,QAAQ,EAClB,CAAC,GAAU,EAAE,IAAS,EAAE,EAAE,CACxB,GAAG,CAAC,GAAG,CAAC,CAAC,EAAO,EAAE,EAAE,CAClB,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,CACtD,EACH,SAAS,CAAC,SAAS,CACpB,CAAC;gBACJ,CAAC;qBAAM,CAAC;oBACN,aAAa,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,MAAM,IAAI,CAAC,KAAK,CAC9C,SAAS,CAAC,QAAQ,EAClB,CAAC,EAAO
,EAAE,IAAS,EAAE,EAAE,CACrB,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,EACvD,SAAS,CAAC,SAAS,CACpB,CAAC;gBACJ,CAAC;YACH,CAAC;YAAC,MAAM,CAAC;gBACP,aAAa,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC;YACvC,CAAC;QACH,CAAC;QAED,OAAO,aAAa,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW,CAAC,IAAiB,EAAE,QAAgB;QACnD,OAAO,IAAI,CAAC,KAAK,CACf,QAAQ,EACR,CAAC,EAAO,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAC1C,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CACpB,IAAiB,EACjB,QAAgB,EAChB,SAAiB;QAEjB,OAAO,IAAI,CAAC,KAAK,CACf,QAAQ,EACR,CAAC,EAAO,EAAE,IAAS,EAAE,EAAE,CAAC,EAAE,CAAC,YAAY,CAAC,IAAI,CAAC,EAC7C,SAAS,CACV,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,cAAc,CAAC,IAAiB,EAAE,QAAgB;QACtD,OAAO,IAAI,CAAC,MAAM,CAChB,QAAQ,EACR,CAAC,GAAU,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,EAAO,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CACnE,CAAC;IACJ,CAAC;CACF"}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SessionManager - manages browser sessions
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-COLLECT-009
|
|
5
|
+
* @design DES-COLLECT-009-BrowserAutomation
|
|
6
|
+
*/
|
|
7
|
+
import type { SessionInfo } from './types.js';
|
|
8
|
+
import type { BrowserPage } from './ActionExecutor.js';
|
|
9
|
+
/**
|
|
10
|
+
* Manages browser sessions
|
|
11
|
+
*/
|
|
12
|
+
export declare class SessionManager {
|
|
13
|
+
private sessions;
|
|
14
|
+
/**
|
|
15
|
+
* Save a session
|
|
16
|
+
*/
|
|
17
|
+
save(page: BrowserPage, name: string): Promise<SessionInfo>;
|
|
18
|
+
/**
|
|
19
|
+
* Restore a session
|
|
20
|
+
*/
|
|
21
|
+
load(page: BrowserPage, name: string): Promise<void>;
|
|
22
|
+
/**
|
|
23
|
+
* Delete a session
|
|
24
|
+
*/
|
|
25
|
+
delete(name: string): boolean;
|
|
26
|
+
/**
|
|
27
|
+
* Get the list of sessions
|
|
28
|
+
*/
|
|
29
|
+
list(): string[];
|
|
30
|
+
/**
|
|
31
|
+
* Check whether a session exists
|
|
32
|
+
*/
|
|
33
|
+
has(name: string): boolean;
|
|
34
|
+
/**
|
|
35
|
+
* Clear all sessions
|
|
36
|
+
*/
|
|
37
|
+
clear(): void;
|
|
38
|
+
/**
|
|
39
|
+
* Get a session
|
|
40
|
+
*/
|
|
41
|
+
get(name: string): SessionInfo | undefined;
|
|
42
|
+
/**
|
|
43
|
+
* Export a session
|
|
44
|
+
*/
|
|
45
|
+
export(name: string): string;
|
|
46
|
+
/**
|
|
47
|
+
* Import a session
|
|
48
|
+
*/
|
|
49
|
+
import(name: string, data: string): void;
|
|
50
|
+
/**
|
|
51
|
+
* Get cookies
|
|
52
|
+
*/
|
|
53
|
+
private getCookies;
|
|
54
|
+
/**
|
|
55
|
+
* Set cookies
|
|
56
|
+
*/
|
|
57
|
+
private setCookies;
|
|
58
|
+
/**
|
|
59
|
+
* Get storage
|
|
60
|
+
*/
|
|
61
|
+
private getStorage;
|
|
62
|
+
/**
|
|
63
|
+
* Set storage
|
|
64
|
+
*/
|
|
65
|
+
private setStorage;
|
|
66
|
+
}
|
|
67
|
+
//# sourceMappingURL=SessionManager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SessionManager.d.ts","sourceRoot":"","sources":["../../src/browser/SessionManager.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAU,MAAM,YAAY,CAAC;AACtD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAUvD;;GAEG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,QAAQ,CAAuC;IAEvD;;OAEG;IACG,IAAI,CAAC,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAoBjE;;OAEG;IACG,IAAI,CAAC,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAe1D;;OAEG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAI7B;;OAEG;IACH,IAAI,IAAI,MAAM,EAAE;IAIhB;;OAEG;IACH,GAAG,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAI1B;;OAEG;IACH,KAAK,IAAI,IAAI;IAIb;;OAEG;IACH,GAAG,CAAC,IAAI,EAAE,MAAM,GAAG,WAAW,GAAG,SAAS;IAI1C;;OAEG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAQ5B;;OAEG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,GAAG,IAAI;IAKxC;;OAEG;YACW,UAAU;IAWxB;;OAEG;YACW,UAAU;IAUxB;;OAEG;YACW,UAAU;IA4BxB;;OAEG;YACW,UAAU;CA2BzB"}
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SessionManager - manages browser sessions
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-COLLECT-009
|
|
5
|
+
* @design DES-COLLECT-009-BrowserAutomation
|
|
6
|
+
*/
|
|
7
|
+
/**
 * Keeps named snapshots of a browser page's session state (cookies,
 * localStorage and sessionStorage) in memory and can re-apply them to a page.
 *
 * Cookies are read and written through `page.cookies()` / `page.setCookie()`
 * when the page provides those functions; storage is read and written by
 * evaluating a script in the page through `page.evaluate()`.
 */
export class SessionManager {
    /** Saved sessions, keyed by the caller-supplied name. */
    sessions = new Map();

    /**
     * Capture the page's cookies and storage and store them under `name`,
     * replacing any session already saved under that name.
     *
     * @param {object} page - Page handle exposing `evaluate` and, optionally, `cookies`.
     * @param {string} name - Key under which the session is stored.
     * @returns {Promise<object>} The stored session
     *   `{ id, cookies, localStorage, sessionStorage }`; `id` is `name` plus a timestamp.
     */
    async save(page, name) {
        const cookies = await this.getCookies(page);
        const { localStorage, sessionStorage } = await this.getStorage(page);
        const session = {
            id: `${name}-${Date.now()}`,
            cookies,
            localStorage,
            sessionStorage,
        };
        this.sessions.set(name, session);
        return session;
    }

    /**
     * Apply the session saved under `name` to the page.
     *
     * @param {object} page - Page handle exposing `evaluate` and, optionally, `setCookie`.
     * @param {string} name - Key of the session to restore.
     * @returns {Promise<void>}
     * @throws {Error} When no session is stored under `name`.
     */
    async load(page, name) {
        const session = this.sessions.get(name);
        if (!session) {
            throw new Error(`Session not found: ${name}`);
        }
        await this.setCookies(page, session.cookies);
        await this.setStorage(page, session.localStorage, session.sessionStorage);
    }

    /**
     * Remove the session stored under `name`.
     *
     * @param {string} name - Key of the session to remove.
     * @returns {boolean} `true` when a session existed and was removed.
     */
    delete(name) {
        return this.sessions.delete(name);
    }

    /**
     * List the names of all stored sessions.
     *
     * @returns {string[]} Session names in insertion order.
     */
    list() {
        return [...this.sessions.keys()];
    }

    /**
     * Check whether a session is stored under `name`.
     *
     * @param {string} name - Key to look up.
     * @returns {boolean}
     */
    has(name) {
        return this.sessions.has(name);
    }

    /**
     * Remove every stored session.
     *
     * @returns {void}
     */
    clear() {
        this.sessions.clear();
    }

    /**
     * Get the session stored under `name`.
     *
     * @param {string} name - Key to look up.
     * @returns {object|undefined} The stored session, or `undefined` when absent.
     */
    get(name) {
        return this.sessions.get(name);
    }

    /**
     * Serialise the session stored under `name` to a JSON string.
     *
     * @param {string} name - Key of the session to export.
     * @returns {string} JSON text accepted by `import`.
     * @throws {Error} When no session is stored under `name`.
     */
    export(name) {
        const session = this.sessions.get(name);
        if (!session) {
            throw new Error(`Session not found: ${name}`);
        }
        return JSON.stringify(session);
    }

    /**
     * Store a session parsed from JSON text under `name`.
     *
     * The parsed value must be a plain object. Missing or malformed `cookies`,
     * `localStorage` and `sessionStorage` fields are replaced by empty defaults
     * so that a later `load` always receives usable data.
     *
     * @param {string} name - Key under which the session is stored.
     * @param {string} data - JSON text, normally produced by `export`.
     * @returns {void}
     * @throws {SyntaxError} When `data` is not valid JSON.
     * @throws {TypeError} When `data` does not describe an object.
     */
    import(name, data) {
        const isRecord = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);
        const parsed = JSON.parse(data);
        if (!isRecord(parsed)) {
            // Storing e.g. `null` would make has() report true while load() reports "not found".
            throw new TypeError(`Invalid session data: ${name}`);
        }
        this.sessions.set(name, {
            ...parsed,
            cookies: Array.isArray(parsed.cookies) ? parsed.cookies : [],
            localStorage: isRecord(parsed.localStorage) ? parsed.localStorage : {},
            sessionStorage: isRecord(parsed.sessionStorage) ? parsed.sessionStorage : {},
        });
    }

    /**
     * Read the page's cookies.
     *
     * @param {object} page - Page handle; `cookies` is used only when it is a function.
     * @returns {Promise<Array>} The cookies, or `[]` when the page has no
     *   `cookies` function or the call fails.
     */
    async getCookies(page) {
        try {
            if (typeof page.cookies === 'function') {
                return await page.cookies();
            }
            return [];
        }
        catch {
            // Best effort: a page that cannot report cookies yields an empty list.
            return [];
        }
    }

    /**
     * Write cookies to the page.
     *
     * @param {object} page - Page handle; `setCookie` is used only when it is a function.
     * @param {Array} cookies - Cookies to set; nothing is done for an empty list.
     * @returns {Promise<void>}
     */
    async setCookies(page, cookies) {
        try {
            if (typeof page.setCookie === 'function' && cookies.length > 0) {
                await page.setCookie(...cookies);
            }
        }
        catch {
            // Best effort: a failure to set cookies is deliberately ignored.
        }
    }

    /**
     * Read every entry of the page's localStorage and sessionStorage.
     *
     * @param {object} page - Page handle exposing `evaluate`.
     * @returns {Promise<object>} `{ localStorage, sessionStorage }`, each a
     *   key-to-string map; both are empty when the evaluation fails.
     */
    async getStorage(page) {
        try {
            // Inside the script: `key(i)` is compared against null (not tested for
            // truthiness) so that an empty-string key is kept, and the map is built
            // with Object.fromEntries so a key such as "__proto__" is stored as data.
            return await page.evaluate(`
                (() => {
                    const getStorageData = (storage) => {
                        const entries = [];
                        for (let i = 0; i < storage.length; i++) {
                            const key = storage.key(i);
                            if (key !== null) {
                                entries.push([key, storage.getItem(key) ?? '']);
                            }
                        }
                        return Object.fromEntries(entries);
                    };
                    return {
                        localStorage: getStorageData(localStorage),
                        sessionStorage: getStorageData(sessionStorage),
                    };
                })()
            `);
        }
        catch {
            // Best effort: a page without accessible storage yields empty maps.
            return { localStorage: {}, sessionStorage: {} };
        }
    }

    /**
     * Write entries into the page's localStorage and sessionStorage.
     *
     * @param {object} page - Page handle exposing `evaluate`.
     * @param {object} localStorage - Key-to-string map written to localStorage.
     * @param {object} sessionStorage - Key-to-string map written to sessionStorage.
     * @returns {Promise<void>}
     */
    async setStorage(page, localStorage, sessionStorage) {
        try {
            // The data is embedded as a JSON *string literal* and parsed inside the
            // page. Interpolating the JSON as source would treat a "__proto__" key
            // as prototype syntax and silently drop that entry.
            const localStorageLiteral = JSON.stringify(JSON.stringify(localStorage ?? {}));
            const sessionStorageLiteral = JSON.stringify(JSON.stringify(sessionStorage ?? {}));
            await page.evaluate(`
                (() => {
                    const localData = JSON.parse(${localStorageLiteral});
                    const sessionData = JSON.parse(${sessionStorageLiteral});

                    Object.entries(localData).forEach(([key, value]) => {
                        localStorage.setItem(key, value);
                    });

                    Object.entries(sessionData).forEach(([key, value]) => {
                        sessionStorage.setItem(key, value);
                    });
                })()
            `);
        }
        catch {
            // Best effort: a failure to restore storage is deliberately ignored.
        }
    }
}
|
|
173
|
+
//# sourceMappingURL=SessionManager.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SessionManager.js","sourceRoot":"","sources":["../../src/browser/SessionManager.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAaH;;GAEG;AACH,MAAM,OAAO,cAAc;IACjB,QAAQ,GAA6B,IAAI,GAAG,EAAE,CAAC;IAEvD;;OAEG;IACH,KAAK,CAAC,IAAI,CAAC,IAAiB,EAAE,IAAY;QACxC,MAAM,eAAe,GAAG,IAAuB,CAAC;QAEhD,YAAY;QACZ,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,eAAe,CAAC,CAAC;QAEvD,WAAW;QACX,MAAM,EAAE,YAAY,EAAE,cAAc,EAAE,GAAG,MAAM,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;QAErE,MAAM,OAAO,GAAgB;YAC3B,EAAE,EAAE,GAAG,IAAI,IAAI,IAAI,CAAC,GAAG,EAAE,EAAE;YAC3B,OAAO;YACP,YAAY;YACZ,cAAc;SACf,CAAC;QAEF,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACjC,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,IAAI,CAAC,IAAiB,EAAE,IAAY;QACxC,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,sBAAsB,IAAI,EAAE,CAAC,CAAC;QAChD,CAAC;QAED,MAAM,eAAe,GAAG,IAAuB,CAAC;QAEhD,YAAY;QACZ,MAAM,IAAI,CAAC,UAAU,CAAC,eAAe,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;QAExD,WAAW;QACX,MAAM,IAAI,CAAC,UAAU,CAAC,IAAI,EAAE,OAAO,CAAC,YAAY,EAAE,OAAO,CAAC,cAAc,CAAC,CAAC;IAC5E,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,IAAY;QACjB,OAAO,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC;IACpC,CAAC;IAED;;OAEG;IACH,IAAI;QACF,OAAO,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;IACnC,CAAC;IAED;;OAEG;IACH,GAAG,CAAC,IAAY;QACd,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IACjC,CAAC;IAED;;OAEG;IACH,KAAK;QACH,IAAI,CAAC,QAAQ,CAAC,KAAK,EAAE,CAAC;IACxB,CAAC;IAED;;OAEG;IACH,GAAG,CAAC,IAAY;QACd,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;IACjC,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,IAAY;QACjB,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QACxC,IAAI,CAAC,OAAO,EAAE,CAAC;YACb,MAAM,IAAI,KAAK,CAAC,sBAAsB,IAAI,EAAE,CAAC,CAAC;QAChD,CAAC;QACD,OAAO,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC;IACjC,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,IAAY,EAAE,IAAY;QAC/B,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAgB,CAAC;QAChD,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACnC,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,UAAU,C
AAC,IAAqB;QAC5C,IAAI,CAAC;YACH,IAAI,OAAO,IAAI,CAAC,OAAO,KAAK,UAAU,EAAE,CAAC;gBACvC,OAAO,MAAM,IAAI,CAAC,OAAO,EAAE,CAAC;YAC9B,CAAC;YACD,OAAO,EAAE,CAAC;QACZ,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,UAAU,CAAC,IAAqB,EAAE,OAAiB;QAC/D,IAAI,CAAC;YACH,IAAI,OAAO,IAAI,CAAC,SAAS,KAAK,UAAU,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC/D,MAAM,IAAI,CAAC,SAAS,CAAC,GAAG,OAAO,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;QAAC,MAAM,CAAC;YACP,gBAAgB;QAClB,CAAC;IACH,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,UAAU,CAAC,IAAiB;QAIxC,IAAI,CAAC;YACH,OAAO,MAAM,IAAI,CAAC,QAAQ,CAAC;;;;;;;;;;;;;;;;;OAiB1B,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,YAAY,EAAE,EAAE,EAAE,cAAc,EAAE,EAAE,EAAE,CAAC;QAClD,CAAC;IACH,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,UAAU,CACtB,IAAiB,EACjB,YAAoC,EACpC,cAAsC;QAEtC,IAAI,CAAC;YACH,MAAM,gBAAgB,GAAG,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC,CAAC;YACtD,MAAM,kBAAkB,GAAG,IAAI,CAAC,SAAS,CAAC,cAAc,CAAC,CAAC;YAE1D,MAAM,IAAI,CAAC,QAAQ,CAAC;;8BAEI,gBAAgB;gCACd,kBAAkB;;;;;;;;;;OAU3C,CAAC,CAAC;QACL,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Browser Automation module
|
|
3
|
+
*
|
|
4
|
+
* @module @nahisaho/katashiro-collector/browser
|
|
5
|
+
* @requirement REQ-COLLECT-009
|
|
6
|
+
* @design DES-COLLECT-009-BrowserAutomation
|
|
7
|
+
*/
|
|
8
|
+
export type { BrowserConfig, Viewport, ProxyConfig, ResourceLimits, NavigationOptions, WaitUntilOption, ClickOptions, TypeOptions, ScrollOptions, WaitForSelectorOptions, ScreenshotOptions, PdfOptions, BrowserAction, NavigateAction, ClickAction, TypeAction, WaitAction, ScrollAction, SelectAction, HoverAction, ScreenshotAction, PdfAction, EvaluateAction, WaitForSelectorAction, ExtractAction, ActionResult, PageScrapeResult, PageLink, PageImage, PageMetadata, Cookie, SessionInfo, AuthCredentials, LoginSelectors, BrowserScript, ExtractorConfig, ExtractionResult, } from './types.js';
|
|
9
|
+
export { DEFAULT_BROWSER_CONFIG } from './types.js';
|
|
10
|
+
export { ActionExecutor } from './ActionExecutor.js';
|
|
11
|
+
export type { BrowserPage } from './ActionExecutor.js';
|
|
12
|
+
export { ContentExtractor } from './ContentExtractor.js';
|
|
13
|
+
export { SessionManager } from './SessionManager.js';
|
|
14
|
+
export { BrowserAutomation, BrowserAutomationError, } from './BrowserAutomation.js';
|
|
15
|
+
export type { Browser, PuppeteerLauncher, BrowserAutomationErrorCode, } from './BrowserAutomation.js';
|
|
16
|
+
export { BrowserAutomation as default } from './BrowserAutomation.js';
|
|
17
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/browser/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAGH,YAAY,EACV,aAAa,EACb,QAAQ,EACR,WAAW,EACX,cAAc,EACd,iBAAiB,EACjB,eAAe,EACf,YAAY,EACZ,WAAW,EACX,aAAa,EACb,sBAAsB,EACtB,iBAAiB,EACjB,UAAU,EACV,aAAa,EACb,cAAc,EACd,WAAW,EACX,UAAU,EACV,UAAU,EACV,YAAY,EACZ,YAAY,EACZ,WAAW,EACX,gBAAgB,EAChB,SAAS,EACT,cAAc,EACd,qBAAqB,EACrB,aAAa,EACb,YAAY,EACZ,gBAAgB,EAChB,QAAQ,EACR,SAAS,EACT,YAAY,EACZ,MAAM,EACN,WAAW,EACX,eAAe,EACf,cAAc,EACd,aAAa,EACb,eAAe,EACf,gBAAgB,GACjB,MAAM,YAAY,CAAC;AAGpB,OAAO,EAAE,sBAAsB,EAAE,MAAM,YAAY,CAAC;AAGpD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AACrD,YAAY,EAAE,WAAW,EAAE,MAAM,qBAAqB,CAAC;AAEvD,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAEzD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAErD,OAAO,EACL,iBAAiB,EACjB,sBAAsB,GACvB,MAAM,wBAAwB,CAAC;AAChC,YAAY,EACV,OAAO,EACP,iBAAiB,EACjB,0BAA0B,GAC3B,MAAM,wBAAwB,CAAC;AAGhC,OAAO,EAAE,iBAAiB,IAAI,OAAO,EAAE,MAAM,wBAAwB,CAAC"}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Browser Automation module
|
|
3
|
+
*
|
|
4
|
+
* @module @nahisaho/katashiro-collector/browser
|
|
5
|
+
* @requirement REQ-COLLECT-009
|
|
6
|
+
* @design DES-COLLECT-009-BrowserAutomation
|
|
7
|
+
*/
|
|
8
|
+
// Constant exports
|
|
9
|
+
export { DEFAULT_BROWSER_CONFIG } from './types.js';
|
|
10
|
+
// Class exports
|
|
11
|
+
export { ActionExecutor } from './ActionExecutor.js';
|
|
12
|
+
export { ContentExtractor } from './ContentExtractor.js';
|
|
13
|
+
export { SessionManager } from './SessionManager.js';
|
|
14
|
+
export { BrowserAutomation, BrowserAutomationError, } from './BrowserAutomation.js';
|
|
15
|
+
// Default export
|
|
16
|
+
export { BrowserAutomation as default } from './BrowserAutomation.js';
|
|
17
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/browser/index.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AA2CH,YAAY;AACZ,OAAO,EAAE,sBAAsB,EAAE,MAAM,YAAY,CAAC;AAEpD,aAAa;AACb,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAGrD,OAAO,EAAE,gBAAgB,EAAE,MAAM,uBAAuB,CAAC;AAEzD,OAAO,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAErD,OAAO,EACL,iBAAiB,EACjB,sBAAsB,GACvB,MAAM,wBAAwB,CAAC;AAOhC,cAAc;AACd,OAAO,EAAE,iBAAiB,IAAI,OAAO,EAAE,MAAM,wBAAwB,CAAC"}
|