@nahisaho/katashiro-collector 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/api/api-client.d.ts +70 -0
- package/dist/api/api-client.d.ts.map +1 -0
- package/dist/api/api-client.js +132 -0
- package/dist/api/api-client.js.map +1 -0
- package/dist/api/index.d.ts +5 -0
- package/dist/api/index.d.ts.map +1 -0
- package/dist/api/index.js +5 -0
- package/dist/api/index.js.map +1 -0
- package/dist/feed/feed-reader.d.ts +70 -0
- package/dist/feed/feed-reader.d.ts.map +1 -0
- package/dist/feed/feed-reader.js +272 -0
- package/dist/feed/feed-reader.js.map +1 -0
- package/dist/feed/index.d.ts +5 -0
- package/dist/feed/index.d.ts.map +1 -0
- package/dist/feed/index.js +5 -0
- package/dist/feed/index.js.map +1 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +16 -0
- package/dist/index.js.map +1 -0
- package/dist/interfaces.d.ts +53 -0
- package/dist/interfaces.d.ts.map +1 -0
- package/dist/interfaces.js +9 -0
- package/dist/interfaces.js.map +1 -0
- package/dist/media/index.d.ts +5 -0
- package/dist/media/index.d.ts.map +1 -0
- package/dist/media/index.js +5 -0
- package/dist/media/index.js.map +1 -0
- package/dist/media/media-extractor.d.ts +74 -0
- package/dist/media/media-extractor.d.ts.map +1 -0
- package/dist/media/media-extractor.js +287 -0
- package/dist/media/media-extractor.js.map +1 -0
- package/dist/scraper/index.d.ts +5 -0
- package/dist/scraper/index.d.ts.map +1 -0
- package/dist/scraper/index.js +5 -0
- package/dist/scraper/index.js.map +1 -0
- package/dist/scraper/web-scraper.d.ts +48 -0
- package/dist/scraper/web-scraper.d.ts.map +1 -0
- package/dist/scraper/web-scraper.js +144 -0
- package/dist/scraper/web-scraper.js.map +1 -0
- package/dist/types.d.ts +82 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +9 -0
- package/dist/types.js.map +1 -0
- package/dist/web-search/index.d.ts +8 -0
- package/dist/web-search/index.d.ts.map +1 -0
- package/dist/web-search/index.js +8 -0
- package/dist/web-search/index.js.map +1 -0
- package/dist/web-search/web-search-client.d.ts +44 -0
- package/dist/web-search/web-search-client.d.ts.map +1 -0
- package/dist/web-search/web-search-client.js +131 -0
- package/dist/web-search/web-search-client.js.map +1 -0
- package/dist/youtube/index.d.ts +5 -0
- package/dist/youtube/index.d.ts.map +1 -0
- package/dist/youtube/index.js +5 -0
- package/dist/youtube/index.js.map +1 -0
- package/dist/youtube/youtube-transcript.d.ts +57 -0
- package/dist/youtube/youtube-transcript.d.ts.map +1 -0
- package/dist/youtube/youtube-transcript.js +228 -0
- package/dist/youtube/youtube-transcript.js.map +1 -0
- package/package.json +44 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Collectorインターフェース定義
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-COLLECT-001 ~ REQ-COLLECT-009
|
|
5
|
+
* @design DES-KATASHIRO-001 §2.2 Collector Container
|
|
6
|
+
* @task TSK-010 ~ TSK-015
|
|
7
|
+
*/
|
|
8
|
+
import type { Result, SearchResult, SearchQuery } from '@nahisaho/katashiro-core';
|
|
9
|
+
import type { WebSearchOptions, ScrapingOptions, ScrapingResult, FeedItem, TranscriptSegment, MediaMetadata } from './types.js';
|
|
10
|
+
/**
|
|
11
|
+
* Web search client interface.
* search() takes a SearchQuery plus optional WebSearchOptions and returns a
* Promise of Result<SearchResult[], Error>.
|
|
12
|
+
* @requirement REQ-COLLECT-001
|
|
13
|
+
*/
|
|
14
|
+
export interface IWebSearchClient {
|
|
15
|
+
search(query: SearchQuery, options?: WebSearchOptions): Promise<Result<SearchResult[], Error>>;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Web scraper interface.
* scrape() takes a URL string plus optional ScrapingOptions and returns a
* Promise of Result<ScrapingResult, Error>.
|
|
19
|
+
* @requirement REQ-COLLECT-002
|
|
20
|
+
*/
|
|
21
|
+
export interface IWebScraper {
|
|
22
|
+
scrape(url: string, options?: ScrapingOptions): Promise<Result<ScrapingResult, Error>>;
|
|
23
|
+
}
|
|
24
|
+
/**
|
|
25
|
+
* Feed reader interface.
* fetch() takes a feed URL string and returns a Promise of
* Result<FeedItem[], Error>.
|
|
26
|
+
* @requirement REQ-COLLECT-004
|
|
27
|
+
*/
|
|
28
|
+
export interface IFeedReader {
|
|
29
|
+
fetch(feedUrl: string): Promise<Result<FeedItem[], Error>>;
|
|
30
|
+
}
|
|
31
|
+
/**
|
|
32
|
+
* API client interface.
* get<T>() takes an endpoint and an optional string-to-string params record;
* post<T>() takes an endpoint and a body of unknown type. Both return a
* Promise of Result<T, Error>.
|
|
33
|
+
* @requirement REQ-COLLECT-005
|
|
34
|
+
*/
|
|
35
|
+
export interface IAPIClient {
|
|
36
|
+
get<T>(endpoint: string, params?: Record<string, string>): Promise<Result<T, Error>>;
|
|
37
|
+
post<T>(endpoint: string, body: unknown): Promise<Result<T, Error>>;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* YouTube transcript interface.
* getTranscript() takes a video ID string and returns a Promise of
* Result<TranscriptSegment[], Error>.
|
|
41
|
+
* @requirement REQ-COLLECT-003
|
|
42
|
+
*/
|
|
43
|
+
export interface IYouTubeTranscript {
|
|
44
|
+
getTranscript(videoId: string): Promise<Result<TranscriptSegment[], Error>>;
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Media extraction interface.
* extractMetadata() takes a URL string and returns a Promise of
* Result<MediaMetadata, Error>.
|
|
48
|
+
* @requirement REQ-COLLECT-006
|
|
49
|
+
*/
|
|
50
|
+
export interface IMediaExtractor {
|
|
51
|
+
extractMetadata(url: string): Promise<Result<MediaMetadata, Error>>;
|
|
52
|
+
}
|
|
53
|
+
//# sourceMappingURL=interfaces.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"interfaces.d.ts","sourceRoot":"","sources":["../src/interfaces.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,0BAA0B,CAAC;AAClF,OAAO,KAAK,EACV,gBAAgB,EAChB,eAAe,EACf,cAAc,EACd,QAAQ,EACR,iBAAiB,EACjB,aAAa,EACd,MAAM,YAAY,CAAC;AAEpB;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,MAAM,CACJ,KAAK,EAAE,WAAW,EAClB,OAAO,CAAC,EAAE,gBAAgB,GACzB,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,EAAE,KAAK,CAAC,CAAC,CAAC;CAC3C;AAED;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC1B,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,eAAe,GAAG,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,KAAK,CAAC,CAAC,CAAC;CACxF;AAED;;;GAGG;AACH,MAAM,WAAW,WAAW;IAC1B,KAAK,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,QAAQ,EAAE,EAAE,KAAK,CAAC,CAAC,CAAC;CAC5D;AAED;;;GAGG;AACH,MAAM,WAAW,UAAU;IACzB,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;IACrF,IAAI,CAAC,CAAC,EAAE,QAAQ,EAAE,MAAM,EAAE,IAAI,EAAE,OAAO,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;CACrE;AAED;;;GAGG;AACH,MAAM,WAAW,kBAAkB;IACjC,aAAa,CAAC,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,iBAAiB,EAAE,EAAE,KAAK,CAAC,CAAC,CAAC;CAC7E;AAED;;;GAGG;AACH,MAAM,WAAW,eAAe;IAC9B,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC,CAAC;CACrE"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"interfaces.js","sourceRoot":"","sources":["../src/interfaces.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/media/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,EAAE,cAAc,EAAE,KAAK,cAAc,EAAE,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/media/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,EAAE,cAAc,EAAuB,MAAM,sBAAsB,CAAC"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MediaExtractor - メディア抽出・メタデータ取得
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-COLLECT-006
|
|
5
|
+
* @design DES-KATASHIRO-001 §2.2 Collector Container
|
|
6
|
+
* @task TSK-015
|
|
7
|
+
*/
|
|
8
|
+
import { type Result } from '@nahisaho/katashiro-core';
|
|
9
|
+
import type { IMediaExtractor, MediaMetadata } from '../index.js';
|
|
10
|
+
/** The three media kinds returned by MediaExtractor.detectMediaType(). */
type MediaType = 'image' | 'video' | 'audio';
|
|
11
|
+
/**
|
|
12
|
+
* Media extracted from HTML: three string arrays, one each for images,
* videos and audio (the return type of MediaExtractor.extractFromHtml()).
|
|
13
|
+
*/
|
|
14
|
+
export interface ExtractedMedia {
|
|
15
|
+
readonly images: string[];
|
|
16
|
+
readonly videos: string[];
|
|
17
|
+
readonly audio: string[];
|
|
18
|
+
}
|
|
19
|
+
/**
|
|
20
|
+
* Media extraction and metadata retrieval implementation (implements IMediaExtractor).
|
|
21
|
+
*/
|
|
22
|
+
export declare class MediaExtractor implements IMediaExtractor {
|
|
23
|
+
private readonly userAgent;
|
|
24
|
+
/**
|
|
25
|
+
* Extract media metadata from a URL.
|
|
26
|
+
*/
|
|
27
|
+
extractMetadata(url: string): Promise<Result<MediaMetadata, Error>>;
|
|
28
|
+
/**
|
|
29
|
+
* Extract metadata with a GET request (used when HEAD fails).
|
|
30
|
+
*/
|
|
31
|
+
private extractMetadataWithGet;
|
|
32
|
+
/**
|
|
33
|
+
* Parse metadata from a response.
|
|
34
|
+
*/
|
|
35
|
+
private parseMetadataFromResponse;
|
|
36
|
+
/**
|
|
37
|
+
* Detect the media type from a Content-Type value; null when none is detected.
|
|
38
|
+
*/
|
|
39
|
+
detectMediaType(contentType: string): MediaType | null;
|
|
40
|
+
/**
|
|
41
|
+
* Detect the media type from a URL.
|
|
42
|
+
*/
|
|
43
|
+
private detectMediaTypeFromUrl;
|
|
44
|
+
/**
|
|
45
|
+
* Extract the format from a Content-Type value; null when none is found.
|
|
46
|
+
*/
|
|
47
|
+
extractFormat(contentType: string): string | null;
|
|
48
|
+
/**
|
|
49
|
+
* Extract the file extension from a URL; null when none is found.
|
|
50
|
+
*/
|
|
51
|
+
extractFormatFromUrl(url: string): string | null;
|
|
52
|
+
/**
|
|
53
|
+
* Extract media URLs from HTML.
|
|
54
|
+
*/
|
|
55
|
+
extractFromHtml(html: string, baseUrl: string): ExtractedMedia;
|
|
56
|
+
/**
|
|
57
|
+
* Extract image URLs from HTML.
|
|
58
|
+
*/
|
|
59
|
+
private extractImages;
|
|
60
|
+
/**
|
|
61
|
+
* Extract video URLs from HTML.
|
|
62
|
+
*/
|
|
63
|
+
private extractVideos;
|
|
64
|
+
/**
|
|
65
|
+
* Extract audio URLs from HTML.
|
|
66
|
+
*/
|
|
67
|
+
private extractAudio;
|
|
68
|
+
/**
|
|
69
|
+
* Convert a relative URL to an absolute URL.
|
|
70
|
+
*/
|
|
71
|
+
private resolveUrl;
|
|
72
|
+
}
|
|
73
|
+
export {};
|
|
74
|
+
//# sourceMappingURL=media-extractor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"media-extractor.d.ts","sourceRoot":"","sources":["../../src/media/media-extractor.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EACL,KAAK,MAAM,EAMZ,MAAM,0BAA0B,CAAC;AAClC,OAAO,KAAK,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AAElE,KAAK,SAAS,GAAG,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAE7C;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,QAAQ,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC;IAC1B,QAAQ,CAAC,MAAM,EAAE,MAAM,EAAE,CAAC;IAC1B,QAAQ,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC;CAC1B;AAED;;GAEG;AACH,qBAAa,cAAe,YAAW,eAAe;IACpD,OAAO,CAAC,QAAQ,CAAC,SAAS,CAA+C;IAEzE;;OAEG;IACG,eAAe,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,aAAa,EAAE,KAAK,CAAC,CAAC;IA0BzE;;OAEG;YACW,sBAAsB;IAsBpC;;OAEG;IACH,OAAO,CAAC,yBAAyB;IA4BjC;;OAEG;IACH,eAAe,CAAC,WAAW,EAAE,MAAM,GAAG,SAAS,GAAG,IAAI;IAgBtD;;OAEG;IACH,OAAO,CAAC,sBAAsB;IAe9B;;OAEG;IACH,aAAa,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI;IAKjD;;OAEG;IACH,oBAAoB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI;IAUhD;;OAEG;IACH,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,cAAc;IAQ9D;;OAEG;IACH,OAAO,CAAC,aAAa;IAkErB;;OAEG;IACH,OAAO,CAAC,aAAa;IAgCrB;;OAEG;IACH,OAAO,CAAC,YAAY;IAgCpB;;OAEG;IACH,OAAO,CAAC,UAAU;CAUnB"}
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MediaExtractor - メディア抽出・メタデータ取得
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-COLLECT-006
|
|
5
|
+
* @design DES-KATASHIRO-001 §2.2 Collector Container
|
|
6
|
+
* @task TSK-015
|
|
7
|
+
*/
|
|
8
|
+
import { ok, err, formatTimestamp, validateUrl, isErr, } from '@nahisaho/katashiro-core';
|
|
9
|
+
/** File extensions (lower-case, without the dot) recognised for each media kind. */
const IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'gif', 'webp', 'svg', 'bmp', 'ico'];
const VIDEO_EXTENSIONS = ['mp4', 'webm', 'mov', 'avi', 'mkv', 'flv', 'wmv'];
const AUDIO_EXTENSIONS = ['mp3', 'wav', 'ogg', 'flac', 'aac', 'm4a', 'wma'];

/** Quoted `src` attribute of a `<source>` tag; group 1 is the URL. */
const SOURCE_SRC_PATTERN = /<source[^>]+src=["']([^"']+)["']/gi;

/**
 * Yields capture group 1 of every match of a global regex, skipping empty captures.
 * `matchAll` works on a copy of the regex, so shared patterns keep no `lastIndex` state.
 */
function* captureAll(pattern, text) {
    for (const match of text.matchAll(pattern)) {
        if (match[1]) {
            yield match[1];
        }
    }
}

/** Removes duplicates while keeping first-occurrence order. */
function dedupe(urls) {
    return [...new Set(urls)];
}

/**
 * Collects raw media URLs for one element kind (`video` or `audio`):
 * first every `src` attribute on the tag itself, then every nested `<source src>`.
 *
 * Nested sources are searched only inside the element's own content, so a
 * `<source>` belonging to a later element of another kind is never attributed
 * to this one, and all sources of an element are returned, not just the first.
 */
function collectElementSources(html, tag) {
    const ownSrc = new RegExp(`<${tag}[^>]+src=["']([^"']+)["']`, 'gi');
    // Content runs to the closing tag, or to end of input when the tag is never closed.
    const element = new RegExp(`<${tag}\\b[^>]*>([\\s\\S]*?)(?:</${tag}\\s*>|$)`, 'gi');
    const found = [...captureAll(ownSrc, html)];
    for (const inner of captureAll(element, html)) {
        found.push(...captureAll(SOURCE_SRC_PATTERN, inner));
    }
    return found;
}

/**
 * Derives the resource size in bytes from response headers.
 *
 * For a 206 (partial) response, Content-Length only describes the returned
 * slice, so the total is taken from `Content-Range: bytes a-b/total` instead.
 * Returns undefined when the size is absent, unknown (`*`) or not a number.
 */
function readContentSize(response) {
    if (response.status === 206) {
        const range = response.headers.get('content-range') ?? '';
        const total = /^bytes\s+\d+-\d+\/(\d+)$/i.exec(range.trim())?.[1];
        return total ? Number.parseInt(total, 10) : undefined;
    }
    const length = Number.parseInt(response.headers.get('content-length') ?? '', 10);
    return Number.isNaN(length) ? undefined : length;
}

/**
 * Media extraction and metadata retrieval.
 *
 * - `extractMetadata` inspects a remote resource through its HTTP headers.
 * - `extractFromHtml` collects image, video and audio URLs from an HTML string
 *   using regular expressions (no DOM parser is involved).
 */
export class MediaExtractor {
    /** User-Agent header sent with every metadata request. */
    userAgent = 'Mozilla/5.0 (compatible; KATASHIRO/0.1.0)';

    /**
     * Extracts media metadata for a URL.
     *
     * Sends a HEAD request first and falls back to a ranged GET when the HEAD
     * response is not OK. Failures are returned as `err(...)`, never thrown.
     *
     * @param url URL of the media resource; checked with `validateUrl` first.
     * @returns `ok(metadata)` or `err(Error)`.
     */
    async extractMetadata(url) {
        const urlValidation = validateUrl(url);
        if (isErr(urlValidation)) {
            return err(new Error(`Invalid URL: ${urlValidation.error}`));
        }
        try {
            const response = await fetch(url, {
                method: 'HEAD',
                headers: {
                    'User-Agent': this.userAgent,
                },
            });
            if (!response.ok) {
                // Try GET if HEAD fails
                return await this.extractMetadataWithGet(url);
            }
            return this.parseMetadataFromResponse(url, response);
        }
        catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error';
            return err(new Error(`Metadata extraction error: ${message}`));
        }
    }

    /**
     * Extracts metadata with a GET request (used when HEAD fails).
     *
     * Asks for a single byte via `Range` to minimise the download; only the
     * headers are read and the body is cancelled afterwards.
     */
    async extractMetadataWithGet(url) {
        try {
            const response = await fetch(url, {
                headers: {
                    'User-Agent': this.userAgent,
                    Range: 'bytes=0-0', // Minimize download
                },
            });
            try {
                // response.ok already covers 206 Partial Content.
                if (!response.ok) {
                    return err(new Error(`HTTP error: ${response.status}`));
                }
                return this.parseMetadataFromResponse(url, response);
            }
            finally {
                // A server that ignores Range sends the whole body; drop it unread.
                await response.body?.cancel().catch(() => undefined);
            }
        }
        catch (error) {
            const message = error instanceof Error ? error.message : 'Unknown error';
            return err(new Error(`Metadata extraction error: ${message}`));
        }
    }

    /**
     * Builds the metadata result from response headers.
     *
     * The media type comes from Content-Type, falling back to the URL's file
     * extension; when neither identifies a media type an `err` is returned.
     */
    parseMetadataFromResponse(url, response) {
        const contentType = response.headers.get('content-type') ?? '';
        const mediaType = this.detectMediaType(contentType) ?? this.detectMediaTypeFromUrl(url);
        if (!mediaType) {
            return err(new Error('Unable to determine media type'));
        }
        const format = this.extractFormat(contentType) ?? this.extractFormatFromUrl(url);
        return ok({
            url,
            type: mediaType,
            format: format ?? undefined,
            size: readContentSize(response),
            fetchedAt: formatTimestamp(),
        });
    }

    /**
     * Detects the media type from a Content-Type value (case-insensitive).
     *
     * @returns 'image', 'video' or 'audio', or null for any other type.
     */
    detectMediaType(contentType) {
        const lower = contentType.toLowerCase();
        if (lower.startsWith('image/')) {
            return 'image';
        }
        if (lower.startsWith('video/')) {
            return 'video';
        }
        if (lower.startsWith('audio/')) {
            return 'audio';
        }
        return null;
    }

    /**
     * Detects the media type from the URL's file extension.
     *
     * @returns 'image', 'video' or 'audio', or null for unknown or missing extensions.
     */
    detectMediaTypeFromUrl(url) {
        const ext = this.extractFormatFromUrl(url)?.toLowerCase();
        if (!ext) {
            return null;
        }
        if (IMAGE_EXTENSIONS.includes(ext)) {
            return 'image';
        }
        if (VIDEO_EXTENSIONS.includes(ext)) {
            return 'video';
        }
        if (AUDIO_EXTENSIONS.includes(ext)) {
            return 'audio';
        }
        return null;
    }

    /**
     * Extracts the format (MIME subtype) from a Content-Type value.
     *
     * Matching is case-insensitive, like detectMediaType; the subtype is
     * returned trimmed and lower-cased with any `;parameters` removed.
     *
     * @returns e.g. 'png' for 'Image/PNG; charset=binary', or null for non-media types.
     */
    extractFormat(contentType) {
        const subtype = /^(?:image|video|audio)\/([^;]+)/i
            .exec(contentType.trim())?.[1]
            .trim()
            .toLowerCase();
        return subtype || null;
    }

    /**
     * Extracts the file extension from the URL's path (query and hash ignored).
     *
     * @returns the extension without the dot, in its original case, or null
     *          when the URL cannot be parsed or its path has no extension.
     */
    extractFormatFromUrl(url) {
        try {
            const pathname = new URL(url).pathname;
            const match = pathname.match(/\.([a-zA-Z0-9]+)$/);
            return match?.[1] ?? null;
        }
        catch {
            return null;
        }
    }

    /**
     * Extracts media URLs from HTML.
     *
     * @param html HTML text to scan.
     * @param baseUrl base used to resolve relative URLs.
     * @returns de-duplicated URL lists for images, videos and audio.
     */
    extractFromHtml(html, baseUrl) {
        const images = this.extractImages(html, baseUrl);
        const videos = this.extractVideos(html, baseUrl);
        const audio = this.extractAudio(html, baseUrl);
        return { images, videos, audio };
    }

    /**
     * Extracts image URLs from HTML, in this order: `<img src>`, every
     * `<img srcset>` candidate, the first `<source srcset>` candidate, and CSS
     * `background` / `background-image` `url(...)` values.
     */
    extractImages(html, baseUrl) {
        // A srcset candidate is "<url> <descriptor>"; keep only the URL part.
        const candidateUrl = (candidate) => candidate.trim().split(/\s+/)[0];
        const raw = [...captureAll(/<img[^>]+src=["']([^"']+)["']/gi, html)];
        for (const srcset of captureAll(/<img[^>]+srcset=["']([^"']+)["']/gi, html)) {
            raw.push(...srcset.split(',').map(candidateUrl));
        }
        for (const srcset of captureAll(/<source[^>]+srcset=["']([^"']+)["']/gi, html)) {
            raw.push(candidateUrl(srcset.split(',')[0] ?? ''));
        }
        raw.push(...captureAll(/background(?:-image)?:\s*url\(['"]?([^'")\s]+)['"]?\)/gi, html));
        return dedupe(raw.filter(Boolean).map((src) => this.resolveUrl(src, baseUrl)));
    }

    /**
     * Extracts video URLs from HTML: `<video src>` attributes first, then every
     * `<source src>` nested inside a `<video>` element.
     */
    extractVideos(html, baseUrl) {
        return dedupe(collectElementSources(html, 'video').map((src) => this.resolveUrl(src, baseUrl)));
    }

    /**
     * Extracts audio URLs from HTML: `<audio src>` attributes first, then every
     * `<source src>` nested inside an `<audio>` element.
     */
    extractAudio(html, baseUrl) {
        return dedupe(collectElementSources(html, 'audio').map((src) => this.resolveUrl(src, baseUrl)));
    }

    /**
     * Converts a relative URL to an absolute one against `baseUrl`.
     * Absolute http(s) URLs, and values that cannot be resolved, are returned unchanged.
     */
    resolveUrl(url, baseUrl) {
        if (url.startsWith('http://') || url.startsWith('https://')) {
            return url;
        }
        try {
            return new URL(url, baseUrl).href;
        }
        catch {
            return url;
        }
    }
}
|
|
287
|
+
//# sourceMappingURL=media-extractor.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"media-extractor.js","sourceRoot":"","sources":["../../src/media/media-extractor.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAEL,EAAE,EACF,GAAG,EACH,eAAe,EACf,WAAW,EACX,KAAK,GACN,MAAM,0BAA0B,CAAC;AAclC;;GAEG;AACH,MAAM,OAAO,cAAc;IACR,SAAS,GAAG,2CAA2C,CAAC;IAEzE;;OAEG;IACH,KAAK,CAAC,eAAe,CAAC,GAAW;QAC/B,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC;QACvC,IAAI,KAAK,CAAC,aAAa,CAAC,EAAE,CAAC;YACzB,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,gBAAgB,aAAa,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;QAC/D,CAAC;QAED,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,MAAM,EAAE,MAAM;gBACd,OAAO,EAAE;oBACP,YAAY,EAAE,IAAI,CAAC,SAAS;iBAC7B;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,EAAE,CAAC;gBACjB,wBAAwB;gBACxB,OAAO,IAAI,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC;YAC1C,CAAC;YAED,OAAO,IAAI,CAAC,yBAAyB,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;QACvD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;YACzE,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC,CAAC;QACjE,CAAC;IACH,CAAC;IAED;;OAEG;IACK,KAAK,CAAC,sBAAsB,CAClC,GAAW;QAEX,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,MAAM,KAAK,CAAC,GAAG,EAAE;gBAChC,OAAO,EAAE;oBACP,YAAY,EAAE,IAAI,CAAC,SAAS;oBAC5B,KAAK,EAAE,WAAW,EAAE,oBAAoB;iBACzC;aACF,CAAC,CAAC;YAEH,IAAI,CAAC,QAAQ,CAAC,EAAE,IAAI,QAAQ,CAAC,MAAM,KAAK,GAAG,EAAE,CAAC;gBAC5C,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,eAAe,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;YAC1D,CAAC;YAED,OAAO,IAAI,CAAC,yBAAyB,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;QACvD,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,CAAC;YACzE,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,8BAA8B,OAAO,EAAE,CAAC,CAAC,CAAC;QACjE,CAAC;IACH,CAAC;IAED;;OAEG;IACK,yBAAyB,CAC/B,GAAW,EACX,QAAkB;QAElB,MAAM,WAAW,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;QAC/D,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,CAAC,CAAC;QAE7D,MAAM,SAAS,GAAG,IAAI,CAAC,eAAe,CAAC,WAAW,CAAC,CAAC;QACpD,IAAI,CAAC,SAAS,EAAE,CAAC;YACf,yBAAyB;YACzB,MAAM,OAAO,GAAG,IAAI,CAAC,sBA
AsB,CAAC,GAAG,CAAC,CAAC;YACjD,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,OAAO,GAAG,CAAC,IAAI,KAAK,CAAC,gCAAgC,CAAC,CAAC,CAAC;YAC1D,CAAC;QACH,CAAC;QAED,MAAM,MAAM,GACV,IAAI,CAAC,aAAa,CAAC,WAAW,CAAC,IAAI,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,CAAC;QAEpE,OAAO,EAAE,CAAC;YACR,GAAG;YACH,IAAI,EAAE,SAAS,IAAI,IAAI,CAAC,sBAAsB,CAAC,GAAG,CAAC,IAAI,OAAO;YAC9D,MAAM,EAAE,MAAM,IAAI,SAAS;YAC3B,IAAI,EAAE,aAAa,CAAC,CAAC,CAAC,QAAQ,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,SAAS;YAC7D,SAAS,EAAE,eAAe,EAAE;SAC7B,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,WAAmB;QACjC,MAAM,KAAK,GAAG,WAAW,CAAC,WAAW,EAAE,CAAC;QAExC,IAAI,KAAK,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC/B,OAAO,OAAO,CAAC;QACjB,CAAC;QACD,IAAI,KAAK,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC/B,OAAO,OAAO,CAAC;QACjB,CAAC;QACD,IAAI,KAAK,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC/B,OAAO,OAAO,CAAC;QACjB,CAAC;QAED,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACK,sBAAsB,CAAC,GAAW;QACxC,MAAM,GAAG,GAAG,IAAI,CAAC,oBAAoB,CAAC,GAAG,CAAC,EAAE,WAAW,EAAE,CAAC;QAC1D,IAAI,CAAC,GAAG;YAAE,OAAO,IAAI,CAAC;QAEtB,MAAM,eAAe,GAAG,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;QACnF,MAAM,eAAe,GAAG,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;QAC3E,MAAM,eAAe,GAAG,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC;QAE3E,IAAI,eAAe,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,OAAO,CAAC;QAClD,IAAI,eAAe,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,OAAO,CAAC;QAClD,IAAI,eAAe,CAAC,QAAQ,CAAC,GAAG,CAAC;YAAE,OAAO,OAAO,CAAC;QAElD,OAAO,IAAI,CAAC;IACd,CAAC;IAED;;OAEG;IACH,aAAa,CAAC,WAAmB;QAC/B,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,CAAC,iCAAiC,CAAC,CAAC;QACnE,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;IAC5B,CAAC;IAED;;OAEG;IACH,oBAAoB,CAAC,GAAW;QAC9B,IAAI,CAAC;YACH,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC;YACvC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;YAClD,OAAO,KAAK,EAAE,CAAC,CAAC,CAAC,IAAI,IAAI,CAAC;QAC5B,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,IAAI,CAAC;QACd,CAAC;IACH,CAAC;IAED;;OAEG;IACH,eAAe,CAAC,IAAY,EAAE,OAAe;Q
AC3C,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACjD,MAAM,MAAM,GAAG,IAAI,CAAC,aAAa,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QACjD,MAAM,KAAK,GAAG,IAAI,CAAC,YAAY,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAE/C,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC;IACnC,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY,EAAE,OAAe;QACjD,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAE/B,UAAU;QACV,MAAM,WAAW,GAAG,iCAAiC,CAAC;QACtD,IAAI,KAAK,CAAC;QACV,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACjD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;gBACb,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBAC/C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACd,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACnB,CAAC;YACH,CAAC;QACH,CAAC;QAED,aAAa;QACb,MAAM,WAAW,GAAG,oCAAoC,CAAC;QACzD,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACjD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;gBACb,MAAM,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC;gBACxB,MAAM,IAAI,GAAG,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;gBACpE,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;oBACvB,IAAI,GAAG,EAAE,CAAC;wBACR,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;wBAC1C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;4BACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;4BACd,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;wBACnB,CAAC;oBACH,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,iBAAiB;QACjB,MAAM,WAAW,GAAG,uCAAuC,CAAC;QAC5D,OAAO,CAAC,KAAK,GAAG,WAAW,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACjD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;gBACb,MAAM,WAAW,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;gBAChE,MAAM,GAAG,GAAG,WAAW,EAAE,CAAC,CAAC,CAAC,CAAC;gBAC7B,IAAI,GAAG,EAAE,CAAC;oBACR,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;oBAC1C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE
,CAAC;wBACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;wBACd,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;oBACnB,CAAC;gBACH,CAAC;YACH,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,MAAM,OAAO,GAAG,yDAAyD,CAAC;QAC1E,OAAO,CAAC,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC7C,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;gBACb,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBAC/C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACd,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACnB,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,IAAY,EAAE,OAAe;QACjD,MAAM,MAAM,GAAa,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAE/B,YAAY;QACZ,MAAM,aAAa,GAAG,mCAAmC,CAAC;QAC1D,IAAI,KAAK,CAAC;QACV,OAAO,CAAC,KAAK,GAAG,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACnD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;gBACb,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBAC/C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACd,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACnB,CAAC;YACH,CAAC;QACH,CAAC;QAED,sBAAsB;QACtB,MAAM,gBAAgB,GAAG,wDAAwD,CAAC;QAClF,OAAO,CAAC,KAAK,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACtD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;gBACb,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBAC/C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACd,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBACnB,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,MAAM,CAAC;IAChB,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,IAAY,EAAE,OAAe;QAChD,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;QAE/B,YAAY;QACZ,MAAM,aAAa,GAAG,mCAAmC,CAAC;QAC1D,IAAI,KAAK,CAAC;QACV,OAAO,CAAC,KAAK,GAAG,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACnD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;gBACb,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBAC/C,IAAI,CAAC,IAAI,CAAC,GAAG,
CAAC,GAAG,CAAC,EAAE,CAAC;oBACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACd,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAClB,CAAC;YACH,CAAC;QACH,CAAC;QAED,sBAAsB;QACtB,MAAM,gBAAgB,GAAG,wDAAwD,CAAC;QAClF,OAAO,CAAC,KAAK,GAAG,gBAAgB,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YACtD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC;gBACb,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBAC/C,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;oBACnB,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;oBACd,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;gBAClB,CAAC;YACH,CAAC;QACH,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACK,UAAU,CAAC,GAAW,EAAE,OAAe;QAC7C,IAAI,GAAG,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,GAAG,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;YAC5D,OAAO,GAAG,CAAC;QACb,CAAC;QACD,IAAI,CAAC;YACH,OAAO,IAAI,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;QACpC,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,GAAG,CAAC;QACb,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/scraper/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/scraper/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAC"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebScraper - Webスクレイパー
|
|
3
|
+
*
|
|
4
|
+
* @requirement REQ-COLLECT-002
|
|
5
|
+
* @design DES-KATASHIRO-001 §2.2 Collector Container
|
|
6
|
+
* @task TSK-011
|
|
7
|
+
*/
|
|
8
|
+
import { type Result } from '@nahisaho/katashiro-core';
|
|
9
|
+
import type { IWebScraper, ScrapingOptions, ScrapingResult } from '../index.js';
|
|
10
|
+
/**
|
|
11
|
+
* Web scraper implementation (implements IWebScraper).
|
|
12
|
+
* Note: using Playwright is recommended in production environments.
|
|
13
|
+
*/
|
|
14
|
+
export declare class WebScraper implements IWebScraper {
|
|
15
|
+
/**
|
|
16
|
+
* Scrape content from a URL.
|
|
17
|
+
*/
|
|
18
|
+
scrape(url: string, options?: ScrapingOptions): Promise<Result<ScrapingResult, Error>>;
|
|
19
|
+
/**
|
|
20
|
+
* Fetch the page.
|
|
21
|
+
*/
|
|
22
|
+
private fetchPage;
|
|
23
|
+
/**
|
|
24
|
+
* Parse the HTML.
|
|
25
|
+
*/
|
|
26
|
+
private parseHtml;
|
|
27
|
+
/**
|
|
28
|
+
* Extract the title.
|
|
29
|
+
*/
|
|
30
|
+
private extractTitle;
|
|
31
|
+
/**
|
|
32
|
+
* Extract the content (scripts and styles are removed).
|
|
33
|
+
*/
|
|
34
|
+
private extractContent;
|
|
35
|
+
/**
|
|
36
|
+
* Extract image URLs.
|
|
37
|
+
*/
|
|
38
|
+
private extractImages;
|
|
39
|
+
/**
|
|
40
|
+
* Extract links.
|
|
41
|
+
*/
|
|
42
|
+
private extractLinks;
|
|
43
|
+
/**
|
|
44
|
+
* Convert a relative URL to an absolute URL.
|
|
45
|
+
*/
|
|
46
|
+
private resolveUrl;
|
|
47
|
+
}
|
|
48
|
+
//# sourceMappingURL=web-scraper.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"web-scraper.d.ts","sourceRoot":"","sources":["../../src/scraper/web-scraper.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EACL,KAAK,MAAM,EAMZ,MAAM,0BAA0B,CAAC;AAClC,OAAO,KAAK,EAAE,WAAW,EAAE,eAAe,EAAE,cAAc,EAAE,MAAM,aAAa,CAAC;AAEhF;;;GAGG;AACH,qBAAa,UAAW,YAAW,WAAW;IAC5C;;OAEG;IACG,MAAM,CACV,GAAG,EAAE,MAAM,EACX,OAAO,CAAC,EAAE,eAAe,GACxB,OAAO,CAAC,MAAM,CAAC,cAAc,EAAE,KAAK,CAAC,CAAC;IAiBzC;;OAEG;YACW,SAAS;IAqBvB;;OAEG;IACH,OAAO,CAAC,SAAS;IAqBjB;;OAEG;IACH,OAAO,CAAC,YAAY;IAKpB;;OAEG;IACH,OAAO,CAAC,cAAc;IAqBtB;;OAEG;IACH,OAAO,CAAC,aAAa;IAgBrB;;OAEG;IACH,OAAO,CAAC,YAAY;IAgBpB;;OAEG;IACH,OAAO,CAAC,UAAU;CAUnB"}
|