mcp-docs-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +357 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +20 -0
- package/dist/index.js.map +1 -0
- package/dist/server.d.ts +6 -0
- package/dist/server.d.ts.map +1 -0
- package/dist/server.js +231 -0
- package/dist/server.js.map +1 -0
- package/dist/services/cache-manager.d.ts +100 -0
- package/dist/services/cache-manager.d.ts.map +1 -0
- package/dist/services/cache-manager.js +212 -0
- package/dist/services/cache-manager.js.map +1 -0
- package/dist/services/content-cleaner.d.ts +48 -0
- package/dist/services/content-cleaner.d.ts.map +1 -0
- package/dist/services/content-cleaner.js +295 -0
- package/dist/services/content-cleaner.js.map +1 -0
- package/dist/services/github-detector.d.ts +49 -0
- package/dist/services/github-detector.d.ts.map +1 -0
- package/dist/services/github-detector.js +276 -0
- package/dist/services/github-detector.js.map +1 -0
- package/dist/services/github-fetcher.d.ts +94 -0
- package/dist/services/github-fetcher.d.ts.map +1 -0
- package/dist/services/github-fetcher.js +393 -0
- package/dist/services/github-fetcher.js.map +1 -0
- package/dist/services/search-index.d.ts +106 -0
- package/dist/services/search-index.d.ts.map +1 -0
- package/dist/services/search-index.js +210 -0
- package/dist/services/search-index.js.map +1 -0
- package/dist/services/web-scraper.d.ts +88 -0
- package/dist/services/web-scraper.d.ts.map +1 -0
- package/dist/services/web-scraper.js +244 -0
- package/dist/services/web-scraper.js.map +1 -0
- package/dist/tools/clear-cache.d.ts +24 -0
- package/dist/tools/clear-cache.d.ts.map +1 -0
- package/dist/tools/clear-cache.js +29 -0
- package/dist/tools/clear-cache.js.map +1 -0
- package/dist/tools/detect-github.d.ts +21 -0
- package/dist/tools/detect-github.d.ts.map +1 -0
- package/dist/tools/detect-github.js +18 -0
- package/dist/tools/detect-github.js.map +1 -0
- package/dist/tools/get-content.d.ts +43 -0
- package/dist/tools/get-content.d.ts.map +1 -0
- package/dist/tools/get-content.js +84 -0
- package/dist/tools/get-content.js.map +1 -0
- package/dist/tools/get-tree.d.ts +31 -0
- package/dist/tools/get-tree.d.ts.map +1 -0
- package/dist/tools/get-tree.js +102 -0
- package/dist/tools/get-tree.js.map +1 -0
- package/dist/tools/index-docs.d.ts +63 -0
- package/dist/tools/index-docs.d.ts.map +1 -0
- package/dist/tools/index-docs.js +371 -0
- package/dist/tools/index-docs.js.map +1 -0
- package/dist/tools/index.d.ts +11 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +11 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/tools/list-cached.d.ts +19 -0
- package/dist/tools/list-cached.d.ts.map +1 -0
- package/dist/tools/list-cached.js +20 -0
- package/dist/tools/list-cached.js.map +1 -0
- package/dist/tools/search-docs.d.ts +31 -0
- package/dist/tools/search-docs.d.ts.map +1 -0
- package/dist/tools/search-docs.js +64 -0
- package/dist/tools/search-docs.js.map +1 -0
- package/dist/types/cache.d.ts +53 -0
- package/dist/types/cache.d.ts.map +1 -0
- package/dist/types/cache.js +2 -0
- package/dist/types/cache.js.map +1 -0
- package/dist/types/errors.d.ts +102 -0
- package/dist/types/errors.d.ts.map +1 -0
- package/dist/types/errors.js +216 -0
- package/dist/types/errors.js.map +1 -0
- package/dist/types/index.d.ts +6 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +5 -0
- package/dist/types/index.js.map +1 -0
- package/dist/utils/fs.d.ts +45 -0
- package/dist/utils/fs.d.ts.map +1 -0
- package/dist/utils/fs.js +113 -0
- package/dist/utils/fs.js.map +1 -0
- package/dist/utils/rate-limit.d.ts +55 -0
- package/dist/utils/rate-limit.d.ts.map +1 -0
- package/dist/utils/rate-limit.js +89 -0
- package/dist/utils/rate-limit.js.map +1 -0
- package/dist/utils/url.d.ts +69 -0
- package/dist/utils/url.d.ts.map +1 -0
- package/dist/utils/url.js +251 -0
- package/dist/utils/url.js.map +1 -0
- package/package.json +58 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import type { CacheMeta, CacheEntrySummary } from "../types/cache.js";
|
|
2
|
+
/**
|
|
3
|
+
* Manages the local documentation cache.
|
|
4
|
+
* Handles storing, retrieving, listing, and clearing cached docs.
*
* Each entry lives under `<cacheDir>/<source>/<id>/`, which holds `meta.json`,
* `search-index.json` and a `content/` directory with the cached files.
|
|
5
|
+
*/
|
|
6
|
+
export declare class CacheManager {
|
|
7
|
+
private readonly cacheDir;
|
|
8
|
+
/** Creates a manager rooted at `cacheDir`; the implementation falls back to `CACHE_DIR` from utils/fs when omitted. */
constructor(cacheDir?: string);
|
|
9
|
+
/**
|
|
10
|
+
* Gets the root cache directory path.
|
|
11
|
+
*/
|
|
12
|
+
getCacheDir(): string;
|
|
13
|
+
/**
|
|
14
|
+
* Initializes the cache directory structure.
* Ensures the `github` and `scraped` sub-directories exist under the cache root.
|
|
15
|
+
*/
|
|
16
|
+
initialize(): Promise<void>;
|
|
17
|
+
/**
|
|
18
|
+
* Gets the directory path for a specific docs entry.
* The path is `<cacheDir>/<source>/<id>`.
|
|
19
|
+
*/
|
|
20
|
+
private getEntryDir;
|
|
21
|
+
/**
|
|
22
|
+
* Gets the meta.json file path for a docs entry.
|
|
23
|
+
*/
|
|
24
|
+
private getMetaPath;
|
|
25
|
+
/**
|
|
26
|
+
* Gets the content directory path for a docs entry.
* This is the `content` directory inside the entry directory.
|
|
27
|
+
*/
|
|
28
|
+
private getContentDir;
|
|
29
|
+
/**
|
|
30
|
+
* Gets the search index file path for a docs entry.
* This is `search-index.json` inside the entry directory.
|
|
31
|
+
*/
|
|
32
|
+
private getSearchIndexPath;
|
|
33
|
+
/**
|
|
34
|
+
* Generates an expiration timestamp based on source type.
* GitHub entries get a 7-day TTL, every other source 24 hours; the result is an ISO-8601 string.
|
|
35
|
+
*/
|
|
36
|
+
private getExpiresAt;
|
|
37
|
+
/**
|
|
38
|
+
* Stores metadata for a docs entry.
* When `expires_at` is omitted it is derived from `meta.source`.
* The file is written to the entry addressed by `meta.source` and `meta.id`.
|
|
39
|
+
*/
|
|
40
|
+
storeMeta(meta: Omit<CacheMeta, "expires_at"> & {
|
|
41
|
+
expires_at?: string;
|
|
42
|
+
}): Promise<void>;
|
|
43
|
+
/**
|
|
44
|
+
* Retrieves metadata for a docs entry.
|
|
45
|
+
* Returns null if not found.
|
|
46
|
+
*/
|
|
47
|
+
getMeta(source: "github" | "scraped", id: string): Promise<CacheMeta | null>;
|
|
48
|
+
/**
|
|
49
|
+
* Stores content for a specific file path within a docs entry.
* `filePath` is joined onto the entry's content directory.
|
|
50
|
+
*/
|
|
51
|
+
storeContent(source: "github" | "scraped", id: string, filePath: string, content: string): Promise<void>;
|
|
52
|
+
/**
|
|
53
|
+
* Retrieves content for a specific file path within a docs entry.
|
|
54
|
+
* Returns null if not found.
|
|
55
|
+
*/
|
|
56
|
+
getContent(source: "github" | "scraped", id: string, filePath: string): Promise<string | null>;
|
|
57
|
+
/**
|
|
58
|
+
* Stores a search index for a docs entry.
* `indexJson` is written as text, exactly as given.
|
|
59
|
+
*/
|
|
60
|
+
storeSearchIndex(source: "github" | "scraped", id: string, indexJson: string): Promise<void>;
|
|
61
|
+
/**
|
|
62
|
+
* Retrieves a search index for a docs entry.
|
|
63
|
+
* Returns null if not found.
|
|
64
|
+
*/
|
|
65
|
+
getSearchIndex(source: "github" | "scraped", id: string): Promise<string | null>;
|
|
66
|
+
/**
|
|
67
|
+
* Checks if a search index exists for a docs entry.
|
|
68
|
+
*/
|
|
69
|
+
hasSearchIndex(source: "github" | "scraped", id: string): Promise<boolean>;
|
|
70
|
+
/**
|
|
71
|
+
* Checks if a docs entry exists in the cache.
* An entry counts as present when its meta.json exists.
|
|
72
|
+
*/
|
|
73
|
+
hasEntry(source: "github" | "scraped", id: string): Promise<boolean>;
|
|
74
|
+
/**
|
|
75
|
+
* Lists all cached docs entries.
* GitHub entries come first, then scraped ones; directories without readable metadata are skipped.
|
|
76
|
+
*/
|
|
77
|
+
listEntries(): Promise<CacheEntrySummary[]>;
|
|
78
|
+
/**
|
|
79
|
+
* Clears a specific docs entry from the cache.
* Returns true when the entry directory existed and was removed, false otherwise.
|
|
80
|
+
*/
|
|
81
|
+
clearEntry(source: "github" | "scraped", id: string): Promise<boolean>;
|
|
82
|
+
/**
|
|
83
|
+
* Clears all entries from the cache.
* Returns the ids of the entry directories that were processed (GitHub first, then scraped).
|
|
84
|
+
*/
|
|
85
|
+
clearAll(): Promise<string[]>;
|
|
86
|
+
/**
|
|
87
|
+
* Checks if a cache entry has expired.
* True when `meta.expires_at` lies before the current time.
|
|
88
|
+
*/
|
|
89
|
+
isExpired(meta: CacheMeta): boolean;
|
|
90
|
+
/**
|
|
91
|
+
* Finds a docs entry by ID, checking both github and scraped sources.
* The GitHub source is checked first and wins when both contain the id.
|
|
92
|
+
* Returns null if not found.
|
|
93
|
+
*/
|
|
94
|
+
findById(id: string): Promise<CacheMeta | null>;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Default cache manager instance.
|
|
98
|
+
*/
|
|
99
|
+
export declare const cacheManager: CacheManager;
|
|
100
|
+
//# sourceMappingURL=cache-manager.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache-manager.d.ts","sourceRoot":"","sources":["../../src/services/cache-manager.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,SAAS,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AAmBtE;;;GAGG;AACH,qBAAa,YAAY;IACvB,OAAO,CAAC,QAAQ,CAAC,QAAQ,CAAS;gBAEtB,QAAQ,GAAE,MAAkB;IAIxC;;OAEG;IACH,WAAW,IAAI,MAAM;IAIrB;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAKjC;;OAEG;IACH,OAAO,CAAC,WAAW;IAInB;;OAEG;IACH,OAAO,CAAC,WAAW;IAInB;;OAEG;IACH,OAAO,CAAC,aAAa;IAIrB;;OAEG;IACH,OAAO,CAAC,kBAAkB;IAI1B;;OAEG;IACH,OAAO,CAAC,YAAY;IAKpB;;OAEG;IACG,SAAS,CAAC,IAAI,EAAE,IAAI,CAAC,SAAS,EAAE,YAAY,CAAC,GAAG;QAAE,UAAU,CAAC,EAAE,MAAM,CAAA;KAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAQ7F;;;OAGG;IACG,OAAO,CAAC,MAAM,EAAE,QAAQ,GAAG,SAAS,EAAE,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC;IAIlF;;OAEG;IACG,YAAY,CAChB,MAAM,EAAE,QAAQ,GAAG,SAAS,EAC5B,EAAE,EAAE,MAAM,EACV,QAAQ,EAAE,MAAM,EAChB,OAAO,EAAE,MAAM,GACd,OAAO,CAAC,IAAI,CAAC;IAKhB;;;OAGG;IACG,UAAU,CACd,MAAM,EAAE,QAAQ,GAAG,SAAS,EAC5B,EAAE,EAAE,MAAM,EACV,QAAQ,EAAE,MAAM,GACf,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;IAKzB;;OAEG;IACG,gBAAgB,CACpB,MAAM,EAAE,QAAQ,GAAG,SAAS,EAC5B,EAAE,EAAE,MAAM,EACV,SAAS,EAAE,MAAM,GAChB,OAAO,CAAC,IAAI,CAAC;IAKhB;;;OAGG;IACG,cAAc,CAClB,MAAM,EAAE,QAAQ,GAAG,SAAS,EAC5B,EAAE,EAAE,MAAM,GACT,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;IAKzB;;OAEG;IACG,cAAc,CAClB,MAAM,EAAE,QAAQ,GAAG,SAAS,EAC5B,EAAE,EAAE,MAAM,GACT,OAAO,CAAC,OAAO,CAAC;IAInB;;OAEG;IACG,QAAQ,CAAC,MAAM,EAAE,QAAQ,GAAG,SAAS,EAAE,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAI1E;;OAEG;IACG,WAAW,IAAI,OAAO,CAAC,iBAAiB,EAAE,CAAC;IAsCjD;;OAEG;IACG,UAAU,CAAC,MAAM,EAAE,QAAQ,GAAG,SAAS,EAAE,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC;IAS5E;;OAEG;IACG,QAAQ,IAAI,OAAO,CAAC,MAAM,EAAE,CAAC;IAoBnC;;OAEG;IACH,SAAS,CAAC,IAAI,EAAE,SAAS,GAAG,OAAO;IAInC;;;OAGG;IACG,QAAQ,CAAC,EAAE,EAAE,MAAM,GAAG,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC;CAWtD;AAED;;GAEG;AACH,eAAO,MAAM,YAAY,cAAqB,CAAC"}
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
import { join, sep } from "node:path";
|
|
2
|
+
import { CACHE_DIR, ensureDir, readJson, writeJson, readText, writeText, remove, listDirectories, exists, } from "../utils/fs.js";
|
|
3
|
+
/** Default TTL for GitHub-sourced docs (7 days), expressed in milliseconds */
|
|
4
|
+
const GITHUB_TTL_MS = 7 * 24 * 60 * 60 * 1000;
|
|
5
|
+
/** Default TTL for scraped docs (24 hours), expressed in milliseconds */
|
|
6
|
+
const SCRAPED_TTL_MS = 24 * 60 * 60 * 1000;
|
|
7
|
+
/**
 * Manages the local documentation cache.
 * Handles storing, retrieving, listing, and clearing cached docs.
 *
 * On-disk layout: `<cacheDir>/<source>/<id>/` containing `meta.json`,
 * `search-index.json` and a `content/` directory with the cached files.
 *
 * Ids and file paths that reach `storeContent`, `getContent` and `clearEntry`
 * are checked lexically so that `..` segments cannot address anything outside
 * the entry they name (symlinks are not resolved).
 */
export class CacheManager {
    cacheDir;
    /**
     * @param cacheDir Root directory of the cache; defaults to `CACHE_DIR`.
     */
    constructor(cacheDir = CACHE_DIR) {
        this.cacheDir = cacheDir;
    }
    /**
     * Gets the root cache directory path.
     */
    getCacheDir() {
        return this.cacheDir;
    }
    /**
     * Initializes the cache directory structure by ensuring the `github`
     * and `scraped` sub-directories exist.
     */
    async initialize() {
        await ensureDir(join(this.cacheDir, "github"));
        await ensureDir(join(this.cacheDir, "scraped"));
    }
    /**
     * Gets the directory path for a specific docs entry.
     */
    getEntryDir(source, id) {
        return join(this.cacheDir, source, id);
    }
    /**
     * Gets the meta.json file path for a docs entry.
     */
    getMetaPath(source, id) {
        return join(this.getEntryDir(source, id), "meta.json");
    }
    /**
     * Gets the content directory path for a docs entry.
     */
    getContentDir(source, id) {
        return join(this.getEntryDir(source, id), "content");
    }
    /**
     * Gets the search index file path for a docs entry.
     */
    getSearchIndexPath(source, id) {
        return join(this.getEntryDir(source, id), "search-index.json");
    }
    /**
     * Generates an ISO-8601 expiration timestamp based on source type
     * (GitHub TTL for "github", scraped TTL for everything else).
     */
    getExpiresAt(source) {
        const ttl = source === "github" ? GITHUB_TTL_MS : SCRAPED_TTL_MS;
        return new Date(Date.now() + ttl).toISOString();
    }
    /**
     * Resolves the on-disk path of a content file.
     *
     * @returns The path, or null when `id` escapes the source directory or
     *          `filePath` escapes the entry's content directory.
     */
    resolveContentPath(source, id, filePath) {
        const entryDir = resolveWithin(join(this.cacheDir, source), id);
        if (entryDir === null) {
            return null;
        }
        return resolveWithin(join(entryDir, "content"), filePath);
    }
    /**
     * Stores metadata for a docs entry.
     * When `meta.expires_at` is missing it is derived from `meta.source`.
     */
    async storeMeta(meta) {
        const fullMeta = {
            ...meta,
            expires_at: meta.expires_at ?? this.getExpiresAt(meta.source),
        };
        await writeJson(this.getMetaPath(meta.source, meta.id), fullMeta);
    }
    /**
     * Retrieves metadata for a docs entry.
     * Returns null if not found.
     */
    async getMeta(source, id) {
        return readJson(this.getMetaPath(source, id));
    }
    /**
     * Stores content for a specific file path within a docs entry.
     *
     * @throws Error when `id` or `filePath` would place the file outside the
     *         entry's content directory (for example via `..` segments).
     */
    async storeContent(source, id, filePath, content) {
        const contentPath = this.resolveContentPath(source, id, filePath);
        if (contentPath === null) {
            throw new Error(`Refusing to write outside the cache entry: ${source}/${id}/${filePath}`);
        }
        await writeText(contentPath, content);
    }
    /**
     * Retrieves content for a specific file path within a docs entry.
     * Returns null if not found, or when the path escapes the entry.
     */
    async getContent(source, id, filePath) {
        const contentPath = this.resolveContentPath(source, id, filePath);
        if (contentPath === null) {
            return null;
        }
        return readText(contentPath);
    }
    /**
     * Stores a search index (already serialized as JSON text) for a docs entry.
     */
    async storeSearchIndex(source, id, indexJson) {
        const indexPath = this.getSearchIndexPath(source, id);
        await writeText(indexPath, indexJson);
    }
    /**
     * Retrieves a search index for a docs entry.
     * Returns null if not found.
     */
    async getSearchIndex(source, id) {
        const indexPath = this.getSearchIndexPath(source, id);
        return readText(indexPath);
    }
    /**
     * Checks if a search index exists for a docs entry.
     */
    async hasSearchIndex(source, id) {
        return exists(this.getSearchIndexPath(source, id));
    }
    /**
     * Checks if a docs entry exists in the cache (its meta.json is present).
     */
    async hasEntry(source, id) {
        return exists(this.getMetaPath(source, id));
    }
    /**
     * Lists all cached docs entries, GitHub entries first, then scraped ones.
     * Directories without readable metadata are skipped.
     */
    async listEntries() {
        const entries = [];
        for (const source of ["github", "scraped"]) {
            const ids = await listDirectories(join(this.cacheDir, source));
            for (const id of ids) {
                const meta = await this.getMeta(source, id);
                if (!meta) {
                    continue;
                }
                // GitHub summaries carry the repository, scraped ones the base URL.
                const origin = source === "github"
                    ? { repo: meta.repo }
                    : { base_url: meta.base_url };
                entries.push({
                    id: meta.id,
                    source: meta.source,
                    ...origin,
                    indexed_at: meta.indexed_at,
                    page_count: meta.page_count,
                    total_size_bytes: meta.total_size_bytes,
                });
            }
        }
        return entries;
    }
    /**
     * Clears a specific docs entry from the cache.
     *
     * @returns true when the entry directory existed and was removed; false
     *          when it did not exist or `id` does not name a directory
     *          strictly inside the source directory. The latter guards against
     *          ids such as `..` or an empty id wiping the whole cache.
     */
    async clearEntry(source, id) {
        const entryDir = resolveWithin(join(this.cacheDir, source), id);
        if (entryDir === null) {
            return false;
        }
        if (await exists(entryDir)) {
            await remove(entryDir);
            return true;
        }
        return false;
    }
    /**
     * Clears all entries from the cache.
     *
     * @returns The ids of the entry directories that were processed,
     *          GitHub entries first, then scraped ones.
     */
    async clearAll() {
        const cleared = [];
        for (const source of ["github", "scraped"]) {
            const ids = await listDirectories(join(this.cacheDir, source));
            for (const id of ids) {
                await this.clearEntry(source, id);
                cleared.push(id);
            }
        }
        return cleared;
    }
    /**
     * Checks if a cache entry has expired.
     * A missing or unparseable `expires_at` is treated as expired so that a
     * corrupt entry gets refreshed instead of living forever.
     */
    isExpired(meta) {
        const expiresAt = new Date(meta.expires_at).getTime();
        return Number.isNaN(expiresAt) || expiresAt < Date.now();
    }
    /**
     * Finds a docs entry by ID, checking the github source first and then
     * the scraped source.
     * Returns null if not found.
     */
    async findById(id) {
        const githubMeta = await this.getMeta("github", id);
        if (githubMeta) {
            return githubMeta;
        }
        const scrapedMeta = await this.getMeta("scraped", id);
        if (scrapedMeta) {
            return scrapedMeta;
        }
        return null;
    }
}
/**
 * Joins `relativePath` onto `baseDir` and returns the normalized result only
 * when it lies strictly inside `baseDir`; returns null otherwise.
 * The check is lexical (after `..` normalization) and does not follow symlinks.
 */
function resolveWithin(baseDir, relativePath) {
    const root = join(baseDir);
    const target = join(root, relativePath);
    // Compare against "<root><sep>" so that a sibling such as "<root>-evil" is not accepted.
    const prefix = root.endsWith(sep) ? root : root + sep;
    return target.startsWith(prefix) ? target : null;
}
|
|
208
|
+
/**
|
|
209
|
+
* Default cache manager instance.
|
|
210
|
+
*/
|
|
211
|
+
export const cacheManager = new CacheManager();
|
|
212
|
+
//# sourceMappingURL=cache-manager.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache-manager.js","sourceRoot":"","sources":["../../src/services/cache-manager.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,OAAO,EACL,SAAS,EACT,SAAS,EACT,QAAQ,EACR,SAAS,EACT,QAAQ,EACR,SAAS,EACT,MAAM,EACN,eAAe,EACf,MAAM,GACP,MAAM,gBAAgB,CAAC;AAExB,mDAAmD;AACnD,MAAM,aAAa,GAAG,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;AAE9C,8CAA8C;AAC9C,MAAM,cAAc,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC;AAE3C;;;GAGG;AACH,MAAM,OAAO,YAAY;IACN,QAAQ,CAAS;IAElC,YAAY,WAAmB,SAAS;QACtC,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IAED;;OAEG;IACH,WAAW;QACT,OAAO,IAAI,CAAC,QAAQ,CAAC;IACvB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU;QACd,MAAM,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;QAC/C,MAAM,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC,CAAC;IAClD,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,MAA4B,EAAE,EAAU;QAC1D,OAAO,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC,CAAC;IACzC,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,MAA4B,EAAE,EAAU;QAC1D,OAAO,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,WAAW,CAAC,CAAC;IACzD,CAAC;IAED;;OAEG;IACK,aAAa,CAAC,MAA4B,EAAE,EAAU;QAC5D,OAAO,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,SAAS,CAAC,CAAC;IACvD,CAAC;IAED;;OAEG;IACK,kBAAkB,CAAC,MAA4B,EAAE,EAAU;QACjE,OAAO,IAAI,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,mBAAmB,CAAC,CAAC;IACjE,CAAC;IAED;;OAEG;IACK,YAAY,CAAC,MAA4B;QAC/C,MAAM,GAAG,GAAG,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,cAAc,CAAC;QACjE,OAAO,IAAI,IAAI,CAAC,IAAI,CAAC,GAAG,EAAE,GAAG,GAAG,CAAC,CAAC,WAAW,EAAE,CAAC;IAClD,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,SAAS,CAAC,IAA6D;QAC3E,MAAM,QAAQ,GAAc;YAC1B,GAAG,IAAI;YACP,UAAU,EAAE,IAAI,CAAC,UAAU,IAAI,IAAI,CAAC,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC;SAC9D,CAAC;QACF,MAAM,SAAS,CAAC,IAAI,CAAC,WAAW,CAAC,IAAI,CAAC,MAAM,EAAE,IAAI,CAAC,EAAE,CAAC,EAAE,QAAQ,CAAC,CAAC;IACpE,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,OAAO,CAAC,MAA4B,EAAE,EAAU;QACpD,OAAO,QAAQ,CAAY,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,CAAC;IAC3D,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,YAAY,CAChB,MAA4B,EAC5B
,EAAU,EACV,QAAgB,EAChB,OAAe;QAEf,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,QAAQ,CAAC,CAAC;QACnE,MAAM,SAAS,CAAC,WAAW,EAAE,OAAO,CAAC,CAAC;IACxC,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,UAAU,CACd,MAA4B,EAC5B,EAAU,EACV,QAAgB;QAEhB,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,MAAM,EAAE,EAAE,CAAC,EAAE,QAAQ,CAAC,CAAC;QACnE,OAAO,QAAQ,CAAC,WAAW,CAAC,CAAC;IAC/B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,gBAAgB,CACpB,MAA4B,EAC5B,EAAU,EACV,SAAiB;QAEjB,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QACtD,MAAM,SAAS,CAAC,SAAS,EAAE,SAAS,CAAC,CAAC;IACxC,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,cAAc,CAClB,MAA4B,EAC5B,EAAU;QAEV,MAAM,SAAS,GAAG,IAAI,CAAC,kBAAkB,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QACtD,OAAO,QAAQ,CAAC,SAAS,CAAC,CAAC;IAC7B,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,cAAc,CAClB,MAA4B,EAC5B,EAAU;QAEV,OAAO,MAAM,CAAC,IAAI,CAAC,kBAAkB,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,CAAC;IACrD,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,MAA4B,EAAE,EAAU;QACrD,OAAO,MAAM,CAAC,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,WAAW;QACf,MAAM,OAAO,GAAwB,EAAE,CAAC;QAExC,sBAAsB;QACtB,MAAM,UAAU,GAAG,MAAM,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;QACxE,KAAK,MAAM,EAAE,IAAI,UAAU,EAAE,CAAC;YAC5B,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;YAC9C,IAAI,IAAI,EAAE,CAAC;gBACT,OAAO,CAAC,IAAI,CAAC;oBACX,EAAE,EAAE,IAAI,CAAC,EAAE;oBACX,MAAM,EAAE,IAAI,CAAC,MAAM;oBACnB,IAAI,EAAE,IAAI,CAAC,IAAI;oBACf,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;iBACxC,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,uBAAuB;QACvB,MAAM,WAAW,GAAG,MAAM,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC,CAAC;QAC1E,KAAK,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;YAC7B,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;YAC/C,IAAI,IAAI,EAAE,CAAC;gBACT,OAAO,CAAC,IAAI,CAAC;oBACX,EAAE,EAAE,IAAI,CAAC,EAAE;oBACX,MAAM,EAAE,IAAI,CAAC,MAAM;oBACnB,QAAQ,EAAE,IAAI,CAAC,QAAQ;oBACvB,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,gBAAgB,E
AAE,IAAI,CAAC,gBAAgB;iBACxC,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,UAAU,CAAC,MAA4B,EAAE,EAAU;QACvD,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,EAAE,EAAE,CAAC,CAAC;QAC9C,IAAI,MAAM,MAAM,CAAC,QAAQ,CAAC,EAAE,CAAC;YAC3B,MAAM,MAAM,CAAC,QAAQ,CAAC,CAAC;YACvB,OAAO,IAAI,CAAC;QACd,CAAC;QACD,OAAO,KAAK,CAAC;IACf,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ;QACZ,MAAM,OAAO,GAAa,EAAE,CAAC;QAE7B,uBAAuB;QACvB,MAAM,UAAU,GAAG,MAAM,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;QACxE,KAAK,MAAM,EAAE,IAAI,UAAU,EAAE,CAAC;YAC5B,MAAM,IAAI,CAAC,UAAU,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;YACpC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnB,CAAC;QAED,wBAAwB;QACxB,MAAM,WAAW,GAAG,MAAM,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC,CAAC;QAC1E,KAAK,MAAM,EAAE,IAAI,WAAW,EAAE,CAAC;YAC7B,MAAM,IAAI,CAAC,UAAU,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;YACrC,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnB,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAED;;OAEG;IACH,SAAS,CAAC,IAAe;QACvB,OAAO,IAAI,IAAI,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,IAAI,IAAI,EAAE,CAAC;IAChD,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,QAAQ,CAAC,EAAU;QACvB,mBAAmB;QACnB,MAAM,UAAU,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC;QACpD,IAAI,UAAU;YAAE,OAAO,UAAU,CAAC;QAElC,cAAc;QACd,MAAM,WAAW,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,EAAE,CAAC,CAAC;QACtD,IAAI,WAAW;YAAE,OAAO,WAAW,CAAC;QAEpC,OAAO,IAAI,CAAC;IACd,CAAC;CACF;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,IAAI,YAAY,EAAE,CAAC"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content Cleaner Service - Converts HTML to clean Markdown.
|
|
3
|
+
*
|
|
4
|
+
* Responsibilities:
|
|
5
|
+
* - Remove unwanted elements (nav, script, style, etc.)
|
|
6
|
+
* - Extract main content from the page
|
|
7
|
+
* - Convert HTML to Markdown with Turndown
|
|
8
|
+
* - Preserve code blocks with language hints
|
|
9
|
+
* - Convert relative URLs to absolute
|
|
10
|
+
*/
|
|
11
|
+
/**
|
|
12
|
+
* Options for the content cleaner.
* Both fields are optional.
|
|
13
|
+
*/
|
|
14
|
+
export interface ContentCleanerOptions {
|
|
15
|
+
/** Base URL for resolving relative links (presumably also image sources — confirm in cleanHtml) to absolute URLs */
|
|
16
|
+
baseUrl?: string;
|
|
17
|
+
/** Whether to extract only the main content area (default not visible in this declaration — confirm in cleanHtml) */
|
|
18
|
+
extractMainContent?: boolean;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Result of cleaning HTML content.
|
|
22
|
+
*/
|
|
23
|
+
export interface CleanedContent {
|
|
24
|
+
/** The cleaned Markdown content */
|
|
25
|
+
markdown: string;
|
|
26
|
+
/** Extracted title from the page; absent when no title could be determined */
|
|
27
|
+
title?: string;
|
|
28
|
+
/** List of headings found in the content, each with its level and text (presumably level 1-6 in document order — confirm in cleanHtml) */
|
|
29
|
+
headings: Array<{
|
|
30
|
+
level: number;
|
|
31
|
+
text: string;
|
|
32
|
+
}>;
|
|
33
|
+
}
|
|
34
|
+
/**
|
|
35
|
+
* Cleans HTML content and converts it to Markdown.
|
|
36
|
+
*
|
|
37
|
+
* @param html The raw HTML content
|
|
38
|
+
* @param options Cleaning options (optional; see ContentCleanerOptions)
|
|
39
|
+
* @returns Cleaned markdown content with metadata (markdown, optional title, headings)
|
|
40
|
+
*/
|
|
41
|
+
export declare function cleanHtml(html: string, options?: ContentCleanerOptions): CleanedContent;
|
|
42
|
+
/**
|
|
43
|
+
* Content cleaner singleton for convenience.
* Its `clean` member has the same signature as `cleanHtml`.
|
|
44
|
+
*/
|
|
45
|
+
export declare const contentCleaner: {
|
|
46
|
+
clean: typeof cleanHtml;
|
|
47
|
+
};
|
|
48
|
+
//# sourceMappingURL=content-cleaner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-cleaner.d.ts","sourceRoot":"","sources":["../../src/services/content-cleaner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAKH;;GAEG;AACH,MAAM,WAAW,qBAAqB;IACpC,4CAA4C;IAC5C,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,oDAAoD;IACpD,kBAAkB,CAAC,EAAE,OAAO,CAAC;CAC9B;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,mCAAmC;IACnC,QAAQ,EAAE,MAAM,CAAC;IACjB,oCAAoC;IACpC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,4CAA4C;IAC5C,QAAQ,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;CAClD;AA0QD;;;;;;GAMG;AACH,wBAAgB,SAAS,CACvB,IAAI,EAAE,MAAM,EACZ,OAAO,GAAE,qBAA0B,GAClC,cAAc,CA4ChB;AAED;;GAEG;AACH,eAAO,MAAM,cAAc;;CAE1B,CAAC"}
|
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Content Cleaner Service - Converts HTML to clean Markdown.
|
|
3
|
+
*
|
|
4
|
+
* Responsibilities:
|
|
5
|
+
* - Remove unwanted elements (nav, script, style, etc.)
|
|
6
|
+
* - Extract main content from the page
|
|
7
|
+
* - Convert HTML to Markdown with Turndown
|
|
8
|
+
* - Preserve code blocks with language hints
|
|
9
|
+
* - Convert relative URLs to absolute
|
|
10
|
+
*/
|
|
11
|
+
import TurndownService from "turndown";
|
|
12
|
+
import * as cheerio from "cheerio";
|
|
13
|
+
/**
 * CSS selectors whose matches are stripped from the DOM entirely.
 * Order is kept as-is: tag names first, then class names, then ARIA roles.
 */
const ELEMENTS_TO_REMOVE = [
    // Non-content and embedded elements
    "script", "style", "noscript", "iframe",
    // Page chrome tags
    "nav", "header", "footer", "aside",
    // Form controls
    "form", "button", "input", "select", "textarea",
    // Navigation / layout class names
    ".nav", ".navbar", ".navigation", ".sidebar", ".menu", ".footer", ".header",
    // Ads, sharing widgets and comments
    ".ads", ".advertisement", ".social-share", ".comments", ".comment",
    // Breadcrumbs and paging
    ".breadcrumb", ".breadcrumbs", ".pagination",
    // ARIA landmark roles
    "[role='navigation']", "[role='banner']", "[role='complementary']", "[role='contentinfo']",
];
|
|
50
|
+
/**
 * Candidate selectors for the main content area.
 * They are tried from first to last, so earlier entries take priority.
 */
const MAIN_CONTENT_SELECTORS = [
    // Semantic containers
    "main", "article", "[role='main']",
    // Documentation / article class names
    ".content", ".documentation", ".docs", ".doc-content",
    ".markdown-body", ".post-content", ".article-content",
    // Conventional ids and a last class-name fallback
    "#content", "#main", "#main-content", ".main-content",
];
|
|
69
|
+
/**
 * Creates a configured Turndown service for HTML to Markdown conversion.
 *
 * The service uses ATX headings, fenced code blocks, `-` bullets, `*`
 * emphasis and `**` strong text, and registers custom rules for:
 * - fenced code blocks that keep the `language-xxx` / `lang-xxx` hint,
 * - inline code whose content may itself contain backticks,
 * - links and images, rewritten to absolute URLs (only when `baseUrl` is given).
 *
 * @param baseUrl Base URL used to resolve relative `href` / `src` values.
 *                When falsy, no link or image rule is registered.
 * @returns The configured TurndownService instance.
 */
function createTurndownService(baseUrl) {
    const turndown = new TurndownService({
        headingStyle: "atx", // # style headings
        codeBlockStyle: "fenced", // ``` code blocks
        bulletListMarker: "-",
        emDelimiter: "*",
        strongDelimiter: "**",
    });
    // Remove unwanted elements completely
    turndown.remove(["script", "style", "noscript", "iframe"]);
    // Preserve code block language hints
    turndown.addRule("fencedCodeBlock", {
        filter: (node) => {
            return (node.nodeName === "PRE" &&
                node.firstChild !== null &&
                node.firstChild.nodeName === "CODE");
        },
        replacement: (_content, node) => {
            const code = node.firstChild;
            if (!code)
                return "";
            // Try to extract language from class
            const classList = code.getAttribute?.("class") ?? "";
            const langMatch = classList.match(/(?:language-|lang-)(\w+)/);
            const lang = langMatch ? langMatch[1] : "";
            // Use the raw text so markup inside the code is not converted
            const text = code.textContent || "";
            // The fence must be longer than any backtick run in the code,
            // otherwise a ``` inside the code would close the block early.
            let fenceSize = 3;
            for (const run of text.match(/`{3,}/g) ?? []) {
                if (run.length >= fenceSize) {
                    fenceSize = run.length + 1;
                }
            }
            const fence = "`".repeat(fenceSize);
            return `\n\n${fence}${lang}\n${text}\n${fence}\n\n`;
        },
    });
    // Handle inline code
    turndown.addRule("inlineCode", {
        filter: (node) => {
            return (node.nodeName === "CODE" &&
                node.parentNode !== null &&
                node.parentNode.nodeName !== "PRE");
        },
        replacement: (content) => {
            if (!content)
                return "";
            // Backslash escapes are literal inside a code span, so pick a
            // delimiter that differs in length from every backtick run in
            // the content instead of escaping.
            const runs = content.match(/`+/g) ?? [];
            let delimiter = "`";
            while (runs.includes(delimiter)) {
                delimiter += "`";
            }
            // A leading/trailing backtick would merge with the delimiter.
            const pad = /^`|`$/.test(content) ? " " : "";
            return `${delimiter}${pad}${content}${pad}${delimiter}`;
        },
    });
    if (baseUrl) {
        // Convert relative URLs to absolute for links
        turndown.addRule("absoluteLinks", {
            filter: "a",
            replacement: (content, node) => {
                const href = node.getAttribute?.("href") ?? "";
                // Without a target or visible text there is nothing to link.
                if (!href || !content.trim()) {
                    return content;
                }
                // Anchor-only links stay relative to the current page
                if (href.startsWith("#")) {
                    return `[${content}](${href})`;
                }
                try {
                    const absoluteUrl = new URL(href, baseUrl).href;
                    return `[${content}](${absoluteUrl})`;
                }
                catch {
                    // If URL is invalid, return as-is
                    return `[${content}](${href})`;
                }
            },
        });
        // Convert relative URLs to absolute for images
        turndown.addRule("absoluteImages", {
            filter: "img",
            replacement: (_content, node) => {
                const src = node.getAttribute?.("src") ?? "";
                const alt = node.getAttribute?.("alt") ?? "";
                // An image without a source cannot be rendered; drop it.
                if (!src) {
                    return "";
                }
                try {
                    const absoluteUrl = new URL(src, baseUrl).href;
                    return `![${alt}](${absoluteUrl})`;
                }
                catch {
                    // If URL is invalid, keep the original source
                    return `![${alt}](${src})`;
                }
            },
        });
    }
    return turndown;
}
|
|
167
|
+
/**
 * Locates the main content area of a parsed HTML document.
 *
 * The known content selectors are tried in priority order and the first
 * element of the first selector that matches anything is returned. When no
 * selector matches, the `div` or `section` with the longest trimmed text is
 * returned instead; the result is null when there is no such element with text.
 */
function extractMainContent($) {
    let index = 0;
    while (index < MAIN_CONTENT_SELECTORS.length) {
        const match = $(MAIN_CONTENT_SELECTORS[index]).first();
        if (match.length > 0) {
            return match;
        }
        index += 1;
    }
    // No known container: fall back to the block holding the most text.
    const fallback = { node: null, size: 0 };
    $("div, section").each((_position, raw) => {
        const wrapped = $(raw);
        const size = wrapped.text().trim().length;
        if (size > fallback.size) {
            fallback.size = size;
            fallback.node = wrapped;
        }
    });
    return fallback.node;
}
|
|
191
|
+
/**
|
|
192
|
+
* Extracts the page title from HTML.
|
|
193
|
+
*/
|
|
194
|
+
function extractTitle($) {
|
|
195
|
+
// Try <title> tag first
|
|
196
|
+
const titleTag = $("title").first().text().trim();
|
|
197
|
+
if (titleTag) {
|
|
198
|
+
// Clean up common suffixes
|
|
199
|
+
const cleaned = titleTag
|
|
200
|
+
.replace(/\s*[|\-–—]\s*.+$/, "") // Remove " | Site Name" or " - Site Name"
|
|
201
|
+
.trim();
|
|
202
|
+
if (cleaned)
|
|
203
|
+
return cleaned;
|
|
204
|
+
}
|
|
205
|
+
// Try <h1> tag
|
|
206
|
+
const h1 = $("h1").first().text().trim();
|
|
207
|
+
if (h1)
|
|
208
|
+
return h1;
|
|
209
|
+
// Try og:title meta tag
|
|
210
|
+
const ogTitle = $('meta[property="og:title"]').attr("content");
|
|
211
|
+
if (ogTitle)
|
|
212
|
+
return ogTitle.trim();
|
|
213
|
+
return undefined;
|
|
214
|
+
}
|
|
215
|
+
/**
 * Extracts ATX-style headings (`#` through `######`) from Markdown content.
 *
 * Lines inside fenced code blocks are ignored, so shell or Python comments
 * such as `# install deps` are not reported as headings. Both LF and CRLF
 * line endings are accepted.
 *
 * @param markdown The Markdown text to scan
 * @returns One `{ level, text }` entry per heading, in document order
 */
function extractHeadings(markdown) {
    const headings = [];
    // Marker (e.g. "```" or "~~~~") of the fence currently open; "" when outside.
    let openFence = "";
    for (const line of markdown.split(/\r?\n/)) {
        const fence = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
        if (openFence) {
            // A fence is closed by the same character repeated at least as many
            // times as the opener, with nothing but whitespace after it.
            if (fence &&
                fence[1][0] === openFence[0] &&
                fence[1].length >= openFence.length &&
                fence[2].trim() === "") {
                openFence = "";
            }
            continue;
        }
        if (fence) {
            openFence = fence[1];
            continue;
        }
        const match = line.match(/^(#{1,6})\s+(.+)$/);
        if (match) {
            headings.push({
                level: match[1].length,
                text: match[2].trim(),
            });
        }
    }
    return headings;
}
|
|
232
|
+
/**
 * Normalizes Markdown output for cleaner results.
 *
 * Steps, in order:
 *   1. convert CRLF / lone CR line endings to LF,
 *   2. strip trailing spaces and tabs from every line,
 *   3. collapse runs of blank lines to a single blank line,
 *   4. trim the whole text and terminate it with exactly one newline.
 *
 * Trailing whitespace is stripped before blank lines are collapsed so that
 * whitespace-only lines count as blank and cannot leave extra empty lines.
 *
 * @param markdown The Markdown text to normalize
 * @returns The normalized text, always ending in a single "\n"
 */
function normalizeMarkdown(markdown) {
    return (markdown
        // Unify line endings so the patterns below see plain "\n"
        .replace(/\r\n?/g, "\n")
        // Remove trailing whitespace from lines
        .replace(/[ \t]+$/gm, "")
        // Collapse two or more consecutive blank lines into one
        .replace(/\n{3,}/g, "\n\n")
        // Ensure single newline at end
        .trim() + "\n");
}
|
|
244
|
+
/**
 * Turns raw HTML into normalized Markdown plus basic metadata.
 *
 * The title is read first, while the document is still intact. Elements
 * matching ELEMENTS_TO_REMOVE are then deleted, and either the detected main
 * content or the whole body is converted to Markdown.
 *
 * @param html The raw HTML content
 * @param options Cleaning options: `baseUrl` is forwarded to the Markdown
 *   converter; `extractMainContent` (default true) enables main-content
 *   detection
 * @returns An object with the `markdown` text, the page `title` and the
 *   `headings` found in the final Markdown
 */
export function cleanHtml(html, options = {}) {
    const { baseUrl, extractMainContent: wantsMainOnly = true } = options;
    const $ = cheerio.load(html);
    // Must happen before pruning: the title may live in an element removed below.
    const title = extractTitle($);
    for (const unwanted of ELEMENTS_TO_REMOVE) {
        $(unwanted).remove();
    }
    // Main-content detection is optional and may find nothing; both cases
    // fall back to the document body (or the raw input if the body is empty).
    const mainElement = wantsMainOnly ? extractMainContent($) : null;
    const contentHtml = mainElement
        ? mainElement.html() || ""
        : $("body").html() || html;
    const converter = createTurndownService(baseUrl);
    const markdown = normalizeMarkdown(converter.turndown(contentHtml));
    // Headings are read from the normalized Markdown.
    return {
        markdown,
        title,
        headings: extractHeadings(markdown),
    };
}
|
|
289
|
+
/**
 * Convenience object exposing the cleaner under a single name:
 * `contentCleaner.clean` is the same function as `cleanHtml`.
 */
export const contentCleaner = { clean: cleanHtml };
|
|
295
|
+
//# sourceMappingURL=content-cleaner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"content-cleaner.js","sourceRoot":"","sources":["../../src/services/content-cleaner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAEH,OAAO,eAAyB,MAAM,UAAU,CAAC;AACjD,OAAO,KAAK,OAAO,MAAM,SAAS,CAAC;AAwBnC;;GAEG;AACH,MAAM,kBAAkB,GAAG;IACzB,QAAQ;IACR,OAAO;IACP,UAAU;IACV,QAAQ;IACR,KAAK;IACL,QAAQ;IACR,QAAQ;IACR,OAAO;IACP,MAAM;IACN,QAAQ;IACR,OAAO;IACP,QAAQ;IACR,UAAU;IACV,MAAM;IACN,SAAS;IACT,aAAa;IACb,UAAU;IACV,OAAO;IACP,SAAS;IACT,SAAS;IACT,MAAM;IACN,gBAAgB;IAChB,eAAe;IACf,WAAW;IACX,UAAU;IACV,aAAa;IACb,cAAc;IACd,aAAa;IACb,qBAAqB;IACrB,iBAAiB;IACjB,wBAAwB;IACxB,sBAAsB;CACvB,CAAC;AAEF;;GAEG;AACH,MAAM,sBAAsB,GAAG;IAC7B,MAAM;IACN,SAAS;IACT,eAAe;IACf,UAAU;IACV,gBAAgB;IAChB,OAAO;IACP,cAAc;IACd,gBAAgB;IAChB,eAAe;IACf,kBAAkB;IAClB,UAAU;IACV,OAAO;IACP,eAAe;IACf,eAAe;CAChB,CAAC;AAEF;;GAEG;AACH,SAAS,qBAAqB,CAAC,OAAgB;IAC7C,MAAM,QAAQ,GAAG,IAAI,eAAe,CAAC;QACnC,YAAY,EAAE,KAAK,EAAE,mBAAmB;QACxC,cAAc,EAAE,QAAQ,EAAE,kBAAkB;QAC5C,gBAAgB,EAAE,GAAG;QACrB,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;KACtB,CAAC,CAAC;IAEH,sCAAsC;IACtC,QAAQ,CAAC,MAAM,CAAC,CAAC,QAAQ,EAAE,OAAO,EAAE,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAC;IAE3D,qCAAqC;IACrC,QAAQ,CAAC,OAAO,CAAC,iBAAiB,EAAE;QAClC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;YACf,OAAO,CACL,IAAI,CAAC,QAAQ,KAAK,KAAK;gBACvB,IAAI,CAAC,UAAU,KAAK,IAAI;gBACxB,IAAI,CAAC,UAAU,CAAC,QAAQ,KAAK,MAAM,CACpC,CAAC;QACJ,CAAC;QACD,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;YAC9B,MAAM,IAAI,GAAG,IAAI,CAAC,UAAyB,CAAC;YAC5C,IAAI,CAAC,IAAI;gBAAE,OAAO,EAAE,CAAC;YAErB,qCAAqC;YACrC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;YACrD,MAAM,SAAS,GAAG,SAAS,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAC;YAC9D,MAAM,IAAI,GAAG,SAAS,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YAE3C,uCAAuC;YACvC,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,IAAI,EAAE,CAAC;YAEpC,OAAO,aAAa,IAAI,KAAK,IAAI,cAAc,CAAC;QAClD,CAAC;KACF,CAAC,CAAC;IAEH,qBAAqB;IACrB,QAAQ,CAAC,OAAO,CAAC,YAAY,EAAE;QAC7B,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE;YACf,OAAO,CACL,IAAI,CAAC,QAAQ,KAAK,MAAM;gBACxB,IAAI,CAAC,UAAU,KAAK,IAAI;gBACxB,IAAI,CAAC,UAAU,CAAC,
QAAQ,KAAK,KAAK,CACnC,CAAC;QACJ,CAAC;QACD,WAAW,EAAE,CAAC,OAAO,EAAE,EAAE;YACvB,IAAI,CAAC,OAAO;gBAAE,OAAO,EAAE,CAAC;YACxB,kCAAkC;YAClC,MAAM,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;YAC7C,OAAO,KAAK,OAAO,IAAI,CAAC;QAC1B,CAAC;KACF,CAAC,CAAC;IAEH,8CAA8C;IAC9C,IAAI,OAAO,EAAE,CAAC;QACZ,QAAQ,CAAC,OAAO,CAAC,eAAe,EAAE;YAChC,MAAM,EAAE,GAAG;YACX,WAAW,EAAE,CAAC,OAAO,EAAE,IAAI,EAAE,EAAE;gBAC7B,MAAM,OAAO,GAAG,IAAY,CAAC;gBAC7B,MAAM,IAAI,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,MAAM,CAAC,IAAI,EAAE,CAAC;gBAElD,IAAI,CAAC,IAAI,IAAI,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,CAAC;oBAC7B,OAAO,OAAO,CAAC;gBACjB,CAAC;gBAED,yBAAyB;gBACzB,IAAI,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;oBACzB,OAAO,IAAI,OAAO,KAAK,IAAI,GAAG,CAAC;gBACjC,CAAC;gBAED,oCAAoC;gBACpC,IAAI,CAAC;oBACH,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,IAAI,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;oBAChD,OAAO,IAAI,OAAO,KAAK,WAAW,GAAG,CAAC;gBACxC,CAAC;gBAAC,MAAM,CAAC;oBACP,kCAAkC;oBAClC,OAAO,IAAI,OAAO,KAAK,IAAI,GAAG,CAAC;gBACjC,CAAC;YACH,CAAC;SACF,CAAC,CAAC;IACL,CAAC;IAED,+CAA+C;IAC/C,IAAI,OAAO,EAAE,CAAC;QACZ,QAAQ,CAAC,OAAO,CAAC,gBAAgB,EAAE;YACjC,MAAM,EAAE,KAAK;YACb,WAAW,EAAE,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE;gBAC9B,MAAM,OAAO,GAAG,IAAY,CAAC;gBAC7B,MAAM,GAAG,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBAChD,MAAM,GAAG,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;gBAEhD,IAAI,CAAC,GAAG,EAAE,CAAC;oBACT,OAAO,EAAE,CAAC;gBACZ,CAAC;gBAED,oCAAoC;gBACpC,IAAI,CAAC;oBACH,MAAM,WAAW,GAAG,IAAI,GAAG,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,IAAI,CAAC;oBAC/C,OAAO,KAAK,GAAG,KAAK,WAAW,GAAG,CAAC;gBACrC,CAAC;gBAAC,MAAM,CAAC;oBACP,OAAO,KAAK,GAAG,KAAK,GAAG,GAAG,CAAC;gBAC7B,CAAC;YACH,CAAC;SACF,CAAC,CAAC;IACL,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CACzB,CAAqB;IAErB,sCAAsC;IACtC,KAAK,MAAM,QAAQ,IAAI,sBAAsB,EAAE,CAAC;QAC9C,MAAM,OAAO,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAE,CAAC;QACpC,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,OAAO,OAAO,CAAC;QACjB,CAAC;IACH,CAAC;IAED,wDAAwD;IACxD,IAAI,WAAW,GAA0C,IAAI,CAAC;IAC9D,IAAI,aAAa,GAAG,CAAC,CAAC;IAEtB,CAAC,CAAC,cAAc,CAAC,CAA
C,IAAI,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,EAAE;QACjC,MAAM,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC;QACtB,MAAM,UAAU,GAAG,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC;QAE9C,IAAI,UAAU,GAAG,aAAa,EAAE,CAAC;YAC/B,aAAa,GAAG,UAAU,CAAC;YAC3B,WAAW,GAAG,KAAK,CAAC;QACtB,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,WAAW,CAAC;AACrB,CAAC;AAED;;GAEG;AACH,SAAS,YAAY,CAAC,CAAqB;IACzC,wBAAwB;IACxB,MAAM,QAAQ,GAAG,CAAC,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IAClD,IAAI,QAAQ,EAAE,CAAC;QACb,2BAA2B;QAC3B,MAAM,OAAO,GAAG,QAAQ;aACrB,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC,0CAA0C;aAC1E,IAAI,EAAE,CAAC;QACV,IAAI,OAAO;YAAE,OAAO,OAAO,CAAC;IAC9B,CAAC;IAED,eAAe;IACf,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC;IACzC,IAAI,EAAE;QAAE,OAAO,EAAE,CAAC;IAElB,wBAAwB;IACxB,MAAM,OAAO,GAAG,CAAC,CAAC,2BAA2B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;IAC/D,IAAI,OAAO;QAAE,OAAO,OAAO,CAAC,IAAI,EAAE,CAAC;IAEnC,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CACtB,QAAgB;IAEhB,MAAM,QAAQ,GAA2C,EAAE,CAAC;IAC5D,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEnC,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,mBAAmB,CAAC,CAAC;QAC9C,IAAI,KAAK,EAAE,CAAC;YACV,QAAQ,CAAC,IAAI,CAAC;gBACZ,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM;gBACtB,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;aACtB,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,SAAS,iBAAiB,CAAC,QAAgB;IACzC,OAAO,CACL,QAAQ;QACN,sDAAsD;SACrD,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC;QAC3B,wCAAwC;SACvC,OAAO,CAAC,WAAW,EAAE,EAAE,CAAC;QACzB,+BAA+B;SAC9B,IAAI,EAAE,GAAG,IAAI,CACjB,CAAC;AACJ,CAAC;AAED;;;;;;GAMG;AACH,MAAM,UAAU,SAAS,CACvB,IAAY,EACZ,UAAiC,EAAE;IAEnC,MAAM,EAAE,OAAO,EAAE,kBAAkB,EAAE,iBAAiB,GAAG,IAAI,EAAE,GAAG,OAAO,CAAC;IAE1E,0BAA0B;IAC1B,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAE7B,yCAAyC;IACzC,MAAM,KAAK,GAAG,YAAY,CAAC,CAAC,CAAC,CAAC;IAE9B,2BAA2B;IAC3B,KAAK,MAAM,QAAQ,IAAI,kBAAkB,EAAE,CAAC;QAC1C,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,EAAE,CAAC;IACvB,CAAC;IAED,6BAA6B;IAC7B,IAAI,WAAmB
,CAAC;IAExB,IAAI,iBAAiB,EAAE,CAAC;QACtB,MAAM,WAAW,GAAG,kBAAkB,CAAC,CAAC,CAAC,CAAC;QAC1C,IAAI,WAAW,EAAE,CAAC;YAChB,WAAW,GAAG,WAAW,CAAC,IAAI,EAAE,IAAI,EAAE,CAAC;QACzC,CAAC;aAAM,CAAC;YACN,2BAA2B;YAC3B,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;QACzC,CAAC;IACH,CAAC;SAAM,CAAC;QACN,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,EAAE,IAAI,IAAI,CAAC;IACzC,CAAC;IAED,sBAAsB;IACtB,MAAM,QAAQ,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;IAChD,IAAI,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC;IAE9C,uBAAuB;IACvB,QAAQ,GAAG,iBAAiB,CAAC,QAAQ,CAAC,CAAC;IAEvC,2CAA2C;IAC3C,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,CAAC;IAE3C,OAAO;QACL,QAAQ;QACR,KAAK;QACL,QAAQ;KACT,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,cAAc,GAAG;IAC5B,KAAK,EAAE,SAAS;CACjB,CAAC"}
|