@humbletoes/google-search 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +7 -0
- package/README.md +339 -0
- package/bin/google-search +3 -0
- package/bin/google-search-mcp +3 -0
- package/bin/google-search-mcp.cmd +2 -0
- package/bin/google-search.cmd +2 -0
- package/dist/browser-config.d.ts +41 -0
- package/dist/browser-config.js +96 -0
- package/dist/browser-config.js.map +1 -0
- package/dist/browser-pool.d.ts +13 -0
- package/dist/browser-pool.js +37 -0
- package/dist/browser-pool.js.map +1 -0
- package/dist/cache.d.ts +48 -0
- package/dist/cache.js +111 -0
- package/dist/cache.js.map +1 -0
- package/dist/errors.d.ts +26 -0
- package/dist/errors.js +48 -0
- package/dist/errors.js.map +1 -0
- package/dist/filters.d.ts +48 -0
- package/dist/filters.js +192 -0
- package/dist/filters.js.map +1 -0
- package/dist/html-cleaner.d.ts +62 -0
- package/dist/html-cleaner.js +236 -0
- package/dist/html-cleaner.js.map +1 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +59 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +2 -0
- package/dist/logger.js +41 -0
- package/dist/logger.js.map +1 -0
- package/dist/mcp-server.d.ts +9 -0
- package/dist/mcp-server.js +822 -0
- package/dist/mcp-server.js.map +1 -0
- package/dist/search.d.ts +18 -0
- package/dist/search.js +1080 -0
- package/dist/search.js.map +1 -0
- package/dist/types.d.ts +67 -0
- package/dist/types.js +2 -0
- package/dist/types.js.map +1 -0
- package/dist/validation.d.ts +6 -0
- package/dist/validation.js +23 -0
- package/dist/validation.js.map +1 -0
- package/dist/web-fetcher.d.ts +10 -0
- package/dist/web-fetcher.js +179 -0
- package/dist/web-fetcher.js.map +1 -0
- package/package.json +67 -0
- package/scripts/setup.js +53 -0
package/dist/cache.js
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simple LRU cache implementation for search results
|
|
3
|
+
* Reduces redundant searches and improves performance
|
|
4
|
+
*/
|
|
5
|
+
import logger from "./logger.js";
|
|
6
|
+
export class SearchCache {
|
|
7
|
+
constructor(maxSize = 100, ttl = 300000) {
|
|
8
|
+
this.totalHits = 0;
|
|
9
|
+
this.totalMisses = 0;
|
|
10
|
+
this.cache = new Map();
|
|
11
|
+
this.maxSize = maxSize;
|
|
12
|
+
this.ttl = ttl;
|
|
13
|
+
}
|
|
14
|
+
/**
|
|
15
|
+
* Generate cache key from query and options
|
|
16
|
+
*/
|
|
17
|
+
generateKey(query, limit) {
|
|
18
|
+
return `${query.toLowerCase().trim()}:${limit || 10}`;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Get cached result if valid
|
|
22
|
+
*/
|
|
23
|
+
get(query, limit, ttl) {
|
|
24
|
+
const key = this.generateKey(query, limit);
|
|
25
|
+
const entry = this.cache.get(key);
|
|
26
|
+
if (!entry) {
|
|
27
|
+
this.totalMisses++;
|
|
28
|
+
logger.debug({ query }, "Cache miss");
|
|
29
|
+
return null;
|
|
30
|
+
}
|
|
31
|
+
// Check if entry is expired
|
|
32
|
+
const age = Date.now() - entry.timestamp;
|
|
33
|
+
const effectiveTtl = ttl || this.ttl;
|
|
34
|
+
if (age > effectiveTtl) {
|
|
35
|
+
this.totalMisses++;
|
|
36
|
+
logger.debug({ query, age, effectiveTtl }, "Cache entry expired");
|
|
37
|
+
this.cache.delete(key);
|
|
38
|
+
return null;
|
|
39
|
+
}
|
|
40
|
+
// Update hit count
|
|
41
|
+
entry.hits++;
|
|
42
|
+
this.totalHits++;
|
|
43
|
+
logger.info({ query, age, hits: entry.hits }, "Cache hit");
|
|
44
|
+
return entry.data;
|
|
45
|
+
}
|
|
46
|
+
/**
|
|
47
|
+
* Store result in cache
|
|
48
|
+
*/
|
|
49
|
+
set(query, data, limit, ttl) {
|
|
50
|
+
const key = this.generateKey(query, limit);
|
|
51
|
+
// If cache is full, remove least recently used entry
|
|
52
|
+
if (this.cache.size >= this.maxSize && !this.cache.has(key)) {
|
|
53
|
+
const firstKey = this.cache.keys().next().value;
|
|
54
|
+
if (firstKey) {
|
|
55
|
+
this.cache.delete(firstKey);
|
|
56
|
+
logger.debug({ removedKey: firstKey }, "Cache eviction");
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
this.cache.set(key, {
|
|
60
|
+
data,
|
|
61
|
+
timestamp: Date.now(),
|
|
62
|
+
hits: 0,
|
|
63
|
+
});
|
|
64
|
+
logger.debug({ query, cacheSize: this.cache.size }, "Cached search result");
|
|
65
|
+
}
|
|
66
|
+
/**
|
|
67
|
+
* Clear all cache entries
|
|
68
|
+
*/
|
|
69
|
+
clear() {
|
|
70
|
+
const size = this.cache.size;
|
|
71
|
+
this.cache.clear();
|
|
72
|
+
logger.info({ clearedEntries: size }, "Cache cleared");
|
|
73
|
+
}
|
|
74
|
+
/**
|
|
75
|
+
* Remove expired entries
|
|
76
|
+
*/
|
|
77
|
+
cleanup() {
|
|
78
|
+
const now = Date.now();
|
|
79
|
+
let removed = 0;
|
|
80
|
+
for (const [key, entry] of this.cache.entries()) {
|
|
81
|
+
if (now - entry.timestamp > this.ttl) {
|
|
82
|
+
this.cache.delete(key);
|
|
83
|
+
removed++;
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
if (removed > 0) {
|
|
87
|
+
logger.info({ removed, remaining: this.cache.size }, "Cache cleanup completed");
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
/**
|
|
91
|
+
* Get cache statistics
|
|
92
|
+
*/
|
|
93
|
+
getStats() {
|
|
94
|
+
const now = Date.now();
|
|
95
|
+
const entries = Array.from(this.cache.entries()).map(([key, entry]) => ({
|
|
96
|
+
key,
|
|
97
|
+
age: now - entry.timestamp,
|
|
98
|
+
hits: entry.hits,
|
|
99
|
+
}));
|
|
100
|
+
return {
|
|
101
|
+
size: this.cache.size,
|
|
102
|
+
maxSize: this.maxSize,
|
|
103
|
+
ttl: this.ttl,
|
|
104
|
+
hits: this.totalHits,
|
|
105
|
+
misses: this.totalMisses,
|
|
106
|
+
entries,
|
|
107
|
+
};
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
// Note: No singleton instance, create per use with desired TTL
|
|
111
|
+
//# sourceMappingURL=cache.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache.js","sourceRoot":"","sources":["../src/cache.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,MAAM,MAAM,aAAa,CAAC;AAQjC,MAAM,OAAO,WAAW;IAOtB,YAAY,UAAkB,GAAG,EAAE,MAAc,MAAM;QAH/C,cAAS,GAAW,CAAC,CAAC;QACtB,gBAAW,GAAW,CAAC,CAAC;QAG9B,IAAI,CAAC,KAAK,GAAG,IAAI,GAAG,EAAE,CAAC;QACvB,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC;IACjB,CAAC;IAED;;OAEG;IACK,WAAW,CAAC,KAAa,EAAE,KAAc;QAC/C,OAAO,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC,IAAI,EAAE,IAAI,KAAK,IAAI,EAAE,EAAE,CAAC;IACxD,CAAC;IAED;;QAEI;IACH,GAAG,CAAC,KAAa,EAAE,KAAc,EAAE,GAAY;QAC9C,MAAM,GAAG,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAC3C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAElC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,IAAI,CAAC,WAAW,EAAE,CAAC;YACnB,MAAM,CAAC,KAAK,CAAC,EAAE,KAAK,EAAE,EAAE,YAAY,CAAC,CAAC;YACtC,OAAO,IAAI,CAAC;QACd,CAAC;QAED,4BAA4B;QAC5B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,SAAS,CAAC;QACzC,MAAM,YAAY,GAAG,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC;QACrC,IAAI,GAAG,GAAG,YAAY,EAAE,CAAC;YACvB,IAAI,CAAC,WAAW,EAAE,CAAC;YACnB,MAAM,CAAC,KAAK,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,YAAY,EAAE,EAAE,qBAAqB,CAAC,CAAC;YAClE,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACvB,OAAO,IAAI,CAAC;QACd,CAAC;QAED,mBAAmB;QACnB,KAAK,CAAC,IAAI,EAAE,CAAC;QACb,IAAI,CAAC,SAAS,EAAE,CAAC;QACjB,MAAM,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,EAAE,WAAW,CAAC,CAAC;QAC3D,OAAO,KAAK,CAAC,IAAI,CAAC;IACpB,CAAC;IAED;;QAEI;IACH,GAAG,CAAC,KAAa,EAAE,IAAoB,EAAE,KAAc,EAAE,GAAY;QACpE,MAAM,GAAG,GAAG,IAAI,CAAC,WAAW,CAAC,KAAK,EAAE,KAAK,CAAC,CAAC;QAE3C,qDAAqD;QACrD,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YAC5D,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC;YAChD,IAAI,QAAQ,EAAE,CAAC;gBACb,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;gBAC5B,MAAM,CAAC,KAAK,CAAC,EAAE,UAAU,EAAE,QAAQ,EAAE,EAAE,gBAAgB,CAAC,CAAC;YAC3D,CAAC;QACH,CAAC;QAED,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE;YAClB,IAAI;YACJ,S
AAS,EAAE,IAAI,CAAC,GAAG,EAAE;YACrB,IAAI,EAAE,CAAC;SACR,CAAC,CAAC;QAEH,MAAM,CAAC,KAAK,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,sBAAsB,CAAC,CAAC;IAC9E,CAAC;IAED;;OAEG;IACH,KAAK;QACH,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;QAC7B,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;QACnB,MAAM,CAAC,IAAI,CAAC,EAAE,cAAc,EAAE,IAAI,EAAE,EAAE,eAAe,CAAC,CAAC;IACzD,CAAC;IAED;;OAEG;IACH,OAAO;QACL,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,IAAI,OAAO,GAAG,CAAC,CAAC;QAEhB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;YAChD,IAAI,GAAG,GAAG,KAAK,CAAC,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;gBACrC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;gBACvB,OAAO,EAAE,CAAC;YACZ,CAAC;QACH,CAAC;QAED,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;YAChB,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,yBAAyB,CAAC,CAAC;QAClF,CAAC;IACH,CAAC;IAED;;OAEG;IACH,QAAQ;QAQN,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,MAAM,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;YACtE,GAAG;YACH,GAAG,EAAE,GAAG,GAAG,KAAK,CAAC,SAAS;YAC1B,IAAI,EAAE,KAAK,CAAC,IAAI;SACjB,CAAC,CAAC,CAAC;QAEJ,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACrB,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,GAAG,EAAE,IAAI,CAAC,GAAG;YACb,IAAI,EAAE,IAAI,CAAC,SAAS;YACpB,MAAM,EAAE,IAAI,CAAC,WAAW;YACxB,OAAO;SACR,CAAC;IACJ,CAAC;CACF;AAED,+DAA+D"}
|
package/dist/errors.d.ts
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
/**
 * Base error class for search operations.
 *
 * Extends the built-in `Error` with a machine-readable `code` and a
 * `retryable` flag.
 */
export declare class SearchError extends Error {
    /** Error code passed to the constructor. */
    code: string;
    /** Retry flag passed to the constructor; `RetryManager` in errors.js only retries errors where this is truthy. */
    retryable: boolean;
    /**
     * @param message - Human-readable error message forwarded to `Error`.
     * @param code - Machine-readable error code.
     * @param retryable - Optional; the compiled implementation defaults it to `false`.
     */
    constructor(message: string, code: string, retryable?: boolean);
}
|
|
9
|
+
/**
 * Error thrown when CAPTCHA verification is required.
 *
 * The compiled implementation constructs it with code `'CAPTCHA_REQUIRED'`
 * and `retryable` set to `true`.
 */
export declare class CaptchaError extends SearchError {
    /**
     * @param message - Human-readable error message.
     */
    constructor(message: string);
}
|
|
15
|
+
/**
 * Error thrown when network issues occur.
 *
 * The compiled implementation constructs it with code `'NETWORK_ERROR'`
 * and `retryable` set to `true`.
 */
export declare class NetworkError extends SearchError {
    /**
     * @param message - Human-readable error message.
     */
    constructor(message: string);
}
|
|
21
|
+
/**
 * Manages retry logic for operations that may fail.
 */
export declare class RetryManager {
    /**
     * Runs `operation` and resolves with its result, retrying after failures.
     *
     * In the compiled implementation (errors.js) a failure is retried only when
     * the thrown value is a `SearchError` whose `retryable` flag is truthy; any
     * other failure, or a failure on the last allowed attempt, is rethrown.
     * The wait before retry number `attempt` (0-based) is `baseDelay * 2^attempt` ms.
     *
     * @param operation - Async function to execute.
     * @param maxRetries - Number of retries after the first attempt; defaults to 3 in errors.js.
     * @param baseDelay - Base backoff delay in milliseconds; defaults to 1000 in errors.js.
     * @returns The value `operation` resolves with.
     */
    executeWithRetry<T>(operation: () => Promise<T>, maxRetries?: number, baseDelay?: number): Promise<T>;
}
|
package/dist/errors.js
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Base error class for search operations
|
|
3
|
+
*/
|
|
4
|
+
export class SearchError extends Error {
|
|
5
|
+
constructor(message, code, retryable = false) {
|
|
6
|
+
super(message);
|
|
7
|
+
this.code = code;
|
|
8
|
+
this.retryable = retryable;
|
|
9
|
+
this.name = 'SearchError';
|
|
10
|
+
}
|
|
11
|
+
}
|
|
12
|
+
/**
 * Raised when a search is blocked by a CAPTCHA challenge.
 *
 * Always carries the code 'CAPTCHA_REQUIRED' and is flagged as retryable.
 */
export class CaptchaError extends SearchError {
    /**
     * @param message - Human-readable error message.
     */
    constructor(message) {
        const code = 'CAPTCHA_REQUIRED';
        const retryable = true;
        super(message, code, retryable);
    }
}
|
|
20
|
+
/**
 * Raised when a search fails because of a network problem.
 *
 * Always carries the code 'NETWORK_ERROR' and is flagged as retryable.
 */
export class NetworkError extends SearchError {
    /**
     * @param message - Human-readable error message.
     */
    constructor(message) {
        const code = 'NETWORK_ERROR';
        const retryable = true;
        super(message, code, retryable);
    }
}
|
|
28
|
+
/**
|
|
29
|
+
* Manages retry logic for operations that may fail
|
|
30
|
+
*/
|
|
31
|
+
export class RetryManager {
|
|
32
|
+
async executeWithRetry(operation, maxRetries = 3, baseDelay = 1000) {
|
|
33
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
34
|
+
try {
|
|
35
|
+
return await operation();
|
|
36
|
+
}
|
|
37
|
+
catch (error) {
|
|
38
|
+
if (attempt === maxRetries || !(error instanceof SearchError) || !error.retryable) {
|
|
39
|
+
throw error;
|
|
40
|
+
}
|
|
41
|
+
const delay = baseDelay * Math.pow(2, attempt);
|
|
42
|
+
await new Promise(resolve => setTimeout(resolve, delay));
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
throw new Error('Retry logic error: should not reach here');
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
//# sourceMappingURL=errors.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"errors.js","sourceRoot":"","sources":["../src/errors.ts"],"names":[],"mappings":"AAAA;;GAEG;AACH,MAAM,OAAO,WAAY,SAAQ,KAAK;IACpC,YAAY,OAAe,EAAS,IAAY,EAAS,YAAqB,KAAK;QACjF,KAAK,CAAC,OAAO,CAAC,CAAC;QADmB,SAAI,GAAJ,IAAI,CAAQ;QAAS,cAAS,GAAT,SAAS,CAAiB;QAEjF,IAAI,CAAC,IAAI,GAAG,aAAa,CAAC;IAC5B,CAAC;CACF;AAED;;GAEG;AACH,MAAM,OAAO,YAAa,SAAQ,WAAW;IAC3C,YAAY,OAAe;QACzB,KAAK,CAAC,OAAO,EAAE,kBAAkB,EAAE,IAAI,CAAC,CAAC;IAC3C,CAAC;CACF;AAED;;GAEG;AACH,MAAM,OAAO,YAAa,SAAQ,WAAW;IAC3C,YAAY,OAAe;QACzB,KAAK,CAAC,OAAO,EAAE,eAAe,EAAE,IAAI,CAAC,CAAC;IACxC,CAAC;CACF;AAED;;GAEG;AACH,MAAM,OAAO,YAAY;IACvB,KAAK,CAAC,gBAAgB,CACpB,SAA2B,EAC3B,aAAqB,CAAC,EACtB,YAAoB,IAAI;QAExB,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,UAAU,EAAE,OAAO,EAAE,EAAE,CAAC;YACvD,IAAI,CAAC;gBACH,OAAO,MAAM,SAAS,EAAE,CAAC;YAC3B,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,IAAI,OAAO,KAAK,UAAU,IAAI,CAAC,CAAC,KAAK,YAAY,WAAW,CAAC,IAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;oBAClF,MAAM,KAAK,CAAC;gBACd,CAAC;gBACD,MAAM,KAAK,GAAG,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC;gBAC/C,MAAM,IAAI,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,UAAU,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC,CAAC;YAC3D,CAAC;QACH,CAAC;QACD,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;IAC9D,CAAC;CACF"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { SearchResult } from './types.js';
|
|
2
|
+
/**
 * Criteria accepted by `ResultFilter.filter`. Every field is optional; a
 * missing field disables the corresponding filter.
 */
export interface FilterOptions {
    /** Keep only results whose `domain` matches one of these entries (case-insensitive). Results without a domain are dropped when this list is non-empty. */
    includeDomains?: string[];
    /** Drop results whose `domain` matches one of these entries (case-insensitive). Results without a domain are kept. */
    excludeDomains?: string[];
    /** Minimum `snippetLength` (inclusive); a missing length counts as 0. */
    minSnippetLength?: number;
    /** Maximum `snippetLength` (inclusive); a missing length counts as 0. */
    maxSnippetLength?: number;
    /** When truthy, keep only results with a truthy `hasRichSnippet`. */
    onlyRichSnippets?: boolean;
    /** Keep at most this many results; applied last and only when greater than 0. */
    topN?: number;
    /** Skip this many leading results; applied after the other filters and before `topN`, only when greater than 0. */
    skipFirst?: number;
    /** Keywords that must all appear in the title or snippet (case-insensitive). */
    mustInclude?: string[];
    /** Keywords of which none may appear in the title or snippet (case-insensitive). */
    mustExclude?: string[];
    /** Content type to keep, detected from the result URL; `'all'` disables the filter. */
    contentType?: 'all' | 'html' | 'pdf' | 'doc' | 'video' | 'image';
    /** NOTE(review): not read by the `ResultFilter.filter` implementation in filters.js - confirm whether it is applied elsewhere. */
    dateRange?: {
        from?: Date;
        to?: Date;
    };
    /** NOTE(review): not read by the `ResultFilter.filter` implementation in filters.js - confirm whether it is applied elsewhere. */
    minRelevanceScore?: number;
}
|
|
19
|
+
/**
 * Summary produced by `ResultFilter.getStats` for a list of results.
 */
export interface FilterStats {
    /** Number of results passed to `getStats`. */
    totalResults: number;
    /** Set to the same value as `totalResults` by `getStats` in filters.js. */
    filteredResults: number;
    /** Result count per `domain`; results without a domain are counted under `'unknown'`. */
    domainDistribution: Record<string, number>;
    /** Result count per URL-detected content type; `getStats` initialises the keys `html`, `pdf`, `doc`, `video`, `image` and `other`. */
    contentTypeDistribution: Record<string, number>;
    /** Number of results with a truthy `hasRichSnippet`. */
    richSnippetCount: number;
    /** Mean `snippetLength` (missing lengths count as 0); 0 for an empty list. */
    averageSnippetLength: number;
    /** NOTE(review): not populated by `getStats` in filters.js - confirm whether any producer sets it. */
    dateRange?: {
        earliest?: string;
        latest?: string;
    };
}
|
|
31
|
+
/**
 * Static helpers for filtering, sorting, summarising and scoring search results.
 * None of the methods in filters.js modify the array they are given.
 */
export declare class ResultFilter {
    /**
     * Filter search results based on various criteria.
     *
     * @param results - Results to filter.
     * @param options - Criteria to apply; see `FilterOptions`.
     * @returns A new array with the results that passed every active filter.
     */
    static filter(results: SearchResult[], options: FilterOptions): SearchResult[];
    /**
     * Sort search results by various criteria.
     *
     * @param results - Results to sort.
     * @param sortBy - Field to compare; missing numeric fields compare as 0 and missing domains as `''`.
     * @param order - Sort direction; the implementation defaults to `'asc'`.
     * @returns A sorted copy of `results`.
     */
    static sort(results: SearchResult[], sortBy: 'position' | 'snippetLength' | 'domain', order?: 'asc' | 'desc'): SearchResult[];
    /**
     * Get comprehensive statistics about filtered results.
     *
     * @param results - Results to summarise.
     */
    static getStats(results: SearchResult[]): FilterStats;
    /**
     * Add relevance scoring to results (basic implementation).
     *
     * @param results - Results to score.
     * @param query - Search query; only words longer than two characters are used for matching.
     * @returns Copies of the results with a numeric `relevanceScore` property added.
     */
    static addRelevanceScores(results: SearchResult[], query: string): SearchResult[];
}
|
package/dist/filters.js
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
export class ResultFilter {
|
|
2
|
+
/**
|
|
3
|
+
* Filter search results based on various criteria
|
|
4
|
+
*/
|
|
5
|
+
static filter(results, options) {
|
|
6
|
+
let filtered = [...results];
|
|
7
|
+
// Domain filtering
|
|
8
|
+
if (options.includeDomains && options.includeDomains.length > 0) {
|
|
9
|
+
const domains = options.includeDomains.map(d => d.toLowerCase());
|
|
10
|
+
filtered = filtered.filter(result => result.domain && domains.some(domain => result.domain.toLowerCase().includes(domain) ||
|
|
11
|
+
domain.includes(result.domain.toLowerCase())));
|
|
12
|
+
}
|
|
13
|
+
if (options.excludeDomains && options.excludeDomains.length > 0) {
|
|
14
|
+
const domains = options.excludeDomains.map(d => d.toLowerCase());
|
|
15
|
+
filtered = filtered.filter(result => !result.domain || !domains.some(domain => result.domain.toLowerCase().includes(domain) ||
|
|
16
|
+
domain.includes(result.domain.toLowerCase())));
|
|
17
|
+
}
|
|
18
|
+
// Snippet length filtering
|
|
19
|
+
if (options.minSnippetLength !== undefined) {
|
|
20
|
+
filtered = filtered.filter(result => (result.snippetLength || 0) >= options.minSnippetLength);
|
|
21
|
+
}
|
|
22
|
+
if (options.maxSnippetLength !== undefined) {
|
|
23
|
+
filtered = filtered.filter(result => (result.snippetLength || 0) <= options.maxSnippetLength);
|
|
24
|
+
}
|
|
25
|
+
// Rich snippet filtering
|
|
26
|
+
if (options.onlyRichSnippets) {
|
|
27
|
+
filtered = filtered.filter(result => result.hasRichSnippet);
|
|
28
|
+
}
|
|
29
|
+
// Content type filtering (basic URL-based detection)
|
|
30
|
+
if (options.contentType && options.contentType !== 'all') {
|
|
31
|
+
filtered = filtered.filter(result => {
|
|
32
|
+
const url = result.link.toLowerCase();
|
|
33
|
+
switch (options.contentType) {
|
|
34
|
+
case 'pdf':
|
|
35
|
+
return url.endsWith('.pdf') || url.includes('.pdf?');
|
|
36
|
+
case 'doc':
|
|
37
|
+
return url.endsWith('.doc') || url.endsWith('.docx') ||
|
|
38
|
+
url.endsWith('.txt') || url.includes('.doc');
|
|
39
|
+
case 'video':
|
|
40
|
+
return url.includes('youtube.com') || url.includes('vimeo.com') ||
|
|
41
|
+
url.endsWith('.mp4') || url.endsWith('.avi') ||
|
|
42
|
+
url.includes('/video/');
|
|
43
|
+
case 'image':
|
|
44
|
+
return url.endsWith('.jpg') || url.endsWith('.jpeg') ||
|
|
45
|
+
url.endsWith('.png') || url.endsWith('.gif') ||
|
|
46
|
+
url.endsWith('.webp') || url.includes('/image/');
|
|
47
|
+
case 'html':
|
|
48
|
+
default:
|
|
49
|
+
return !url.endsWith('.pdf') && !url.endsWith('.doc') &&
|
|
50
|
+
!url.endsWith('.docx') && !url.endsWith('.mp4') &&
|
|
51
|
+
!url.endsWith('.avi') && !url.includes('youtube.com') &&
|
|
52
|
+
!url.includes('vimeo.com');
|
|
53
|
+
}
|
|
54
|
+
});
|
|
55
|
+
}
|
|
56
|
+
// Keyword filtering
|
|
57
|
+
if (options.mustInclude && options.mustInclude.length > 0) {
|
|
58
|
+
filtered = filtered.filter(result => {
|
|
59
|
+
const text = (result.title + ' ' + result.snippet).toLowerCase();
|
|
60
|
+
return options.mustInclude.every(keyword => text.includes(keyword.toLowerCase()));
|
|
61
|
+
});
|
|
62
|
+
}
|
|
63
|
+
if (options.mustExclude && options.mustExclude.length > 0) {
|
|
64
|
+
filtered = filtered.filter(result => {
|
|
65
|
+
const text = (result.title + ' ' + result.snippet).toLowerCase();
|
|
66
|
+
return !options.mustExclude.some(keyword => text.includes(keyword.toLowerCase()));
|
|
67
|
+
});
|
|
68
|
+
}
|
|
69
|
+
// Position-based filtering
|
|
70
|
+
if (options.skipFirst && options.skipFirst > 0) {
|
|
71
|
+
filtered = filtered.slice(options.skipFirst);
|
|
72
|
+
}
|
|
73
|
+
if (options.topN && options.topN > 0) {
|
|
74
|
+
filtered = filtered.slice(0, options.topN);
|
|
75
|
+
}
|
|
76
|
+
return filtered;
|
|
77
|
+
}
|
|
78
|
+
/**
|
|
79
|
+
* Sort search results by various criteria
|
|
80
|
+
*/
|
|
81
|
+
static sort(results, sortBy, order = 'asc') {
|
|
82
|
+
return [...results].sort((a, b) => {
|
|
83
|
+
let comparison = 0;
|
|
84
|
+
switch (sortBy) {
|
|
85
|
+
case 'position':
|
|
86
|
+
comparison = (a.position || 0) - (b.position || 0);
|
|
87
|
+
break;
|
|
88
|
+
case 'snippetLength':
|
|
89
|
+
comparison = (a.snippetLength || 0) - (b.snippetLength || 0);
|
|
90
|
+
break;
|
|
91
|
+
case 'domain':
|
|
92
|
+
comparison = (a.domain || '').localeCompare(b.domain || '');
|
|
93
|
+
break;
|
|
94
|
+
}
|
|
95
|
+
return order === 'desc' ? -comparison : comparison;
|
|
96
|
+
});
|
|
97
|
+
}
|
|
98
|
+
/**
|
|
99
|
+
* Get comprehensive statistics about filtered results
|
|
100
|
+
*/
|
|
101
|
+
static getStats(results) {
|
|
102
|
+
const domainDistribution = {};
|
|
103
|
+
const contentTypeDistribution = {
|
|
104
|
+
html: 0,
|
|
105
|
+
pdf: 0,
|
|
106
|
+
doc: 0,
|
|
107
|
+
video: 0,
|
|
108
|
+
image: 0,
|
|
109
|
+
other: 0
|
|
110
|
+
};
|
|
111
|
+
let richSnippetCount = 0;
|
|
112
|
+
let totalSnippetLength = 0;
|
|
113
|
+
results.forEach(result => {
|
|
114
|
+
// Domain distribution
|
|
115
|
+
const domain = result.domain || 'unknown';
|
|
116
|
+
domainDistribution[domain] = (domainDistribution[domain] || 0) + 1;
|
|
117
|
+
// Content type distribution
|
|
118
|
+
const url = result.link.toLowerCase();
|
|
119
|
+
if (url.endsWith('.pdf') || url.includes('.pdf?')) {
|
|
120
|
+
contentTypeDistribution.pdf++;
|
|
121
|
+
}
|
|
122
|
+
else if (url.endsWith('.doc') || url.endsWith('.docx') || url.endsWith('.txt')) {
|
|
123
|
+
contentTypeDistribution.doc++;
|
|
124
|
+
}
|
|
125
|
+
else if (url.includes('youtube.com') || url.includes('vimeo.com') ||
|
|
126
|
+
url.endsWith('.mp4') || url.endsWith('.avi') || url.includes('/video/')) {
|
|
127
|
+
contentTypeDistribution.video++;
|
|
128
|
+
}
|
|
129
|
+
else if (url.endsWith('.jpg') || url.endsWith('.jpeg') || url.endsWith('.png') ||
|
|
130
|
+
url.endsWith('.gif') || url.endsWith('.webp') || url.includes('/image/')) {
|
|
131
|
+
contentTypeDistribution.image++;
|
|
132
|
+
}
|
|
133
|
+
else {
|
|
134
|
+
contentTypeDistribution.html++;
|
|
135
|
+
}
|
|
136
|
+
// Rich snippet count
|
|
137
|
+
if (result.hasRichSnippet) {
|
|
138
|
+
richSnippetCount++;
|
|
139
|
+
}
|
|
140
|
+
// Snippet length
|
|
141
|
+
totalSnippetLength += result.snippetLength || 0;
|
|
142
|
+
});
|
|
143
|
+
return {
|
|
144
|
+
totalResults: results.length,
|
|
145
|
+
filteredResults: results.length,
|
|
146
|
+
domainDistribution,
|
|
147
|
+
contentTypeDistribution,
|
|
148
|
+
richSnippetCount,
|
|
149
|
+
averageSnippetLength: results.length > 0 ? totalSnippetLength / results.length : 0,
|
|
150
|
+
};
|
|
151
|
+
}
|
|
152
|
+
/**
|
|
153
|
+
* Add relevance scoring to results (basic implementation)
|
|
154
|
+
*/
|
|
155
|
+
static addRelevanceScores(results, query) {
|
|
156
|
+
const queryWords = query.toLowerCase().split(/\s+/).filter(word => word.length > 2);
|
|
157
|
+
return results.map(result => {
|
|
158
|
+
let score = 0;
|
|
159
|
+
const title = result.title.toLowerCase();
|
|
160
|
+
const snippet = result.snippet.toLowerCase();
|
|
161
|
+
// Title matches are most important
|
|
162
|
+
queryWords.forEach(word => {
|
|
163
|
+
if (title.includes(word)) {
|
|
164
|
+
score += 10;
|
|
165
|
+
}
|
|
166
|
+
if (snippet.includes(word)) {
|
|
167
|
+
score += 5;
|
|
168
|
+
}
|
|
169
|
+
});
|
|
170
|
+
// Position bonus (earlier results are more relevant)
|
|
171
|
+
if (result.position) {
|
|
172
|
+
score += Math.max(0, 20 - result.position);
|
|
173
|
+
}
|
|
174
|
+
// Rich snippet bonus
|
|
175
|
+
if (result.hasRichSnippet) {
|
|
176
|
+
score += 5;
|
|
177
|
+
}
|
|
178
|
+
// Domain authority bonus (basic heuristic)
|
|
179
|
+
if (result.domain) {
|
|
180
|
+
const trustedDomains = ['github.com', 'stackoverflow.com', 'wikipedia.org', 'docs.microsoft.com'];
|
|
181
|
+
if (trustedDomains.some(domain => result.domain.includes(domain))) {
|
|
182
|
+
score += 3;
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
return {
|
|
186
|
+
...result,
|
|
187
|
+
relevanceScore: score
|
|
188
|
+
};
|
|
189
|
+
});
|
|
190
|
+
}
|
|
191
|
+
}
|
|
192
|
+
//# sourceMappingURL=filters.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"filters.js","sourceRoot":"","sources":["../src/filters.ts"],"names":[],"mappings":"AAiCA,MAAM,OAAO,YAAY;IACvB;;OAEG;IACH,MAAM,CAAC,MAAM,CAAC,OAAuB,EAAE,OAAsB;QAC3D,IAAI,QAAQ,GAAG,CAAC,GAAG,OAAO,CAAC,CAAC;QAE5B,mBAAmB;QACnB,IAAI,OAAO,CAAC,cAAc,IAAI,OAAO,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChE,MAAM,OAAO,GAAG,OAAO,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;YACjE,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAClC,MAAM,CAAC,MAAM,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CACrC,MAAM,CAAC,MAAO,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAC7C,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,MAAO,CAAC,WAAW,EAAE,CAAC,CAC9C,CACF,CAAC;QACJ,CAAC;QAED,IAAI,OAAO,CAAC,cAAc,IAAI,OAAO,CAAC,cAAc,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChE,MAAM,OAAO,GAAG,OAAO,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;YACjE,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAClC,CAAC,MAAM,CAAC,MAAM,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CACvC,MAAM,CAAC,MAAO,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,MAAM,CAAC;gBAC7C,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC,MAAO,CAAC,WAAW,EAAE,CAAC,CAC9C,CACF,CAAC;QACJ,CAAC;QAED,2BAA2B;QAC3B,IAAI,OAAO,CAAC,gBAAgB,KAAK,SAAS,EAAE,CAAC;YAC3C,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAClC,CAAC,MAAM,CAAC,aAAa,IAAI,CAAC,CAAC,IAAI,OAAO,CAAC,gBAAiB,CACzD,CAAC;QACJ,CAAC;QAED,IAAI,OAAO,CAAC,gBAAgB,KAAK,SAAS,EAAE,CAAC;YAC3C,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAClC,CAAC,MAAM,CAAC,aAAa,IAAI,CAAC,CAAC,IAAI,OAAO,CAAC,gBAAiB,CACzD,CAAC;QACJ,CAAC;QAED,yBAAyB;QACzB,IAAI,OAAO,CAAC,gBAAgB,EAAE,CAAC;YAC7B,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC;QAC9D,CAAC;QAED,qDAAqD;QACrD,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,KAAK,KAAK,EAAE,CAAC;YACzD,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE;gBAClC,MAAM,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;gBACtC,QAAQ,OAAO,CAAC,WAAW,EAAE,CAAC;oBAC5B,KAAK,KAAK;wBACR,OAAO,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC;oBACvD,KAAK,KAAK;wBACR,OAAO,GAAG,CAAC
,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC;4BAC7C,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;oBACtD,KAAK,OAAO;wBACV,OAAO,GAAG,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,WAAW,CAAC;4BACxD,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC;4BAC5C,GAAG,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;oBACjC,KAAK,OAAO;wBACV,OAAO,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC;4BAC7C,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC;4BAC5C,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC;oBAC1D,KAAK,MAAM,CAAC;oBACZ;wBACE,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC;4BAC9C,CAAC,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC;4BAC/C,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,aAAa,CAAC;4BACrD,CAAC,GAAG,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC;gBACtC,CAAC;YACH,CAAC,CAAC,CAAC;QACL,CAAC;QAED,oBAAoB;QACpB,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1D,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE;gBAClC,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,GAAG,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;gBACjE,OAAO,OAAO,CAAC,WAAY,CAAC,KAAK,CAAC,OAAO,CAAC,EAAE,CAC1C,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CACrC,CAAC;YACJ,CAAC,CAAC,CAAC;QACL,CAAC;QAED,IAAI,OAAO,CAAC,WAAW,IAAI,OAAO,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1D,QAAQ,GAAG,QAAQ,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE;gBAClC,MAAM,IAAI,GAAG,CAAC,MAAM,CAAC,KAAK,GAAG,GAAG,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,CAAC;gBACjE,OAAO,CAAC,OAAO,CAAC,WAAY,CAAC,IAAI,CAAC,OAAO,CAAC,EAAE,CAC1C,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CACrC,CAAC;YACJ,CAAC,CAAC,CAAC;QACL,CAAC;QAED,2BAA2B;QAC3B,IAAI,OAAO,CAAC,SAAS,IAAI,OAAO,CAAC,SAAS,GAAG,CAAC,EAAE,CAAC;YAC/C,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QAC/C,CAAC;QAED,IAAI,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,GAAG,CAAC,EAAE,CAAC;YACrC,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,IAAI,CAAC,CAAC;QAC
7C,CAAC;QAED,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,IAAI,CAAC,OAAuB,EAAE,MAA+C,EAAE,QAAwB,KAAK;QACjH,OAAO,CAAC,GAAG,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAChC,IAAI,UAAU,GAAG,CAAC,CAAC;YAEnB,QAAQ,MAAM,EAAE,CAAC;gBACf,KAAK,UAAU;oBACb,UAAU,GAAG,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,IAAI,CAAC,CAAC,CAAC;oBACnD,MAAM;gBACR,KAAK,eAAe;oBAClB,UAAU,GAAG,CAAC,CAAC,CAAC,aAAa,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,aAAa,IAAI,CAAC,CAAC,CAAC;oBAC7D,MAAM;gBACR,KAAK,QAAQ;oBACX,UAAU,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,aAAa,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC;oBAC5D,MAAM;YACV,CAAC;YAED,OAAO,KAAK,KAAK,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC;QACrD,CAAC,CAAC,CAAC;IACL,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,QAAQ,CAAC,OAAuB;QACrC,MAAM,kBAAkB,GAA2B,EAAE,CAAC;QACtD,MAAM,uBAAuB,GAA2B;YACtD,IAAI,EAAE,CAAC;YACP,GAAG,EAAE,CAAC;YACN,GAAG,EAAE,CAAC;YACN,KAAK,EAAE,CAAC;YACR,KAAK,EAAE,CAAC;YACR,KAAK,EAAE,CAAC;SACT,CAAC;QAEF,IAAI,gBAAgB,GAAG,CAAC,CAAC;QACzB,IAAI,kBAAkB,GAAG,CAAC,CAAC;QAE3B,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE;YACvB,sBAAsB;YACtB,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,IAAI,SAAS,CAAC;YAC1C,kBAAkB,CAAC,MAAM,CAAC,GAAG,CAAC,kBAAkB,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YAEnE,4BAA4B;YAC5B,MAAM,GAAG,GAAG,MAAM,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;YACtC,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;gBAClD,uBAAuB,CAAC,GAAG,EAAE,CAAC;YAChC,CAAC;iBAAM,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;gBACjF,uBAAuB,CAAC,GAAG,EAAE,CAAC;YAChC,CAAC;iBAAM,IAAI,GAAG,CAAC,QAAQ,CAAC,aAAa,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,WAAW,CAAC;gBACxD,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBACnF,uBAAuB,CAAC,KAAK,EAAE,CAAC;YAClC,CAAC;iBAAM,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC;gBACrE,GAAG,CAAC,QAAQ,CAAC,MAAM,CAAC,IAAI,GAAG,CAAC,QAAQ,CA
AC,OAAO,CAAC,IAAI,GAAG,CAAC,QAAQ,CAAC,SAAS,CAAC,EAAE,CAAC;gBACpF,uBAAuB,CAAC,KAAK,EAAE,CAAC;YAClC,CAAC;iBAAM,CAAC;gBACN,uBAAuB,CAAC,IAAI,EAAE,CAAC;YACjC,CAAC;YAED,qBAAqB;YACrB,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;gBAC1B,gBAAgB,EAAE,CAAC;YACrB,CAAC;YAED,iBAAiB;YACjB,kBAAkB,IAAI,MAAM,CAAC,aAAa,IAAI,CAAC,CAAC;QAClD,CAAC,CAAC,CAAC;QAEH,OAAO;YACL,YAAY,EAAE,OAAO,CAAC,MAAM;YAC5B,eAAe,EAAE,OAAO,CAAC,MAAM;YAC/B,kBAAkB;YAClB,uBAAuB;YACvB,gBAAgB;YAChB,oBAAoB,EAAE,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,kBAAkB,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;SACnF,CAAC;IACJ,CAAC;IAED;;OAEG;IACH,MAAM,CAAC,kBAAkB,CAAC,OAAuB,EAAE,KAAa;QAC9D,MAAM,UAAU,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QAEpF,OAAO,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE;YAC1B,IAAI,KAAK,GAAG,CAAC,CAAC;YACd,MAAM,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC;YACzC,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC;YAE7C,mCAAmC;YACnC,UAAU,CAAC,OAAO,CAAC,IAAI,CAAC,EAAE;gBACxB,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;oBACzB,KAAK,IAAI,EAAE,CAAC;gBACd,CAAC;gBACD,IAAI,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;oBAC3B,KAAK,IAAI,CAAC,CAAC;gBACb,CAAC;YACH,CAAC,CAAC,CAAC;YAEH,qDAAqD;YACrD,IAAI,MAAM,CAAC,QAAQ,EAAE,CAAC;gBACpB,KAAK,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC;YAC7C,CAAC;YAED,qBAAqB;YACrB,IAAI,MAAM,CAAC,cAAc,EAAE,CAAC;gBAC1B,KAAK,IAAI,CAAC,CAAC;YACb,CAAC;YAED,2CAA2C;YAC3C,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;gBAClB,MAAM,cAAc,GAAG,CAAC,YAAY,EAAE,mBAAmB,EAAE,eAAe,EAAE,oBAAoB,CAAC,CAAC;gBAClG,IAAI,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,MAAO,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC;oBACnE,KAAK,IAAI,CAAC,CAAC;gBACb,CAAC;YACH,CAAC;YAED,OAAO;gBACL,GAAG,MAAM;gBACT,cAAc,EAAE,KAAK;aACuB,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC;CACF"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
 * Statistics returned by `HtmlCleaner.getCleaningStats`.
 *
 * NOTE(review): the implementation (html-cleaner.js) is not visible here; the
 * field notes below are inferred from names and should be confirmed against it.
 */
export interface CleaningStats {
    /** Presumably the size of the original HTML; unit (characters vs. bytes) to be confirmed. */
    originalSize: number;
    /** Presumably the size of the cleaned HTML, in the same unit as `originalSize`. */
    cleanedSize: number;
    /** Presumably the size reduction from original to cleaned, as a percentage - confirm scale (0-100 vs. 0-1). */
    reductionPercent: number;
    /** Presumably the line count of the original HTML. */
    originalLines: number;
    /** Presumably the line count of the cleaned HTML. */
    cleanedLines: number;
    /** Counters per kind of removed element - presumably how many of each were stripped. */
    elementsRemoved: {
        scripts: number;
        styles: number;
        images: number;
        ads: number;
        navigation: number;
        comments: number;
    };
    /** Findings about the page content; TODO confirm which HTML (original or cleaned) each is computed from. */
    contentAnalysis: {
        hasSearchResults: boolean;
        resultCount: number;
        hasRichSnippets: boolean;
        hasAds: boolean;
        /** Range and meaning of the score are not visible here - confirm. */
        readabilityScore: number;
    };
}
|
|
23
|
+
/**
 * Static helpers for cleaning and analysing search-result HTML.
 *
 * The private members below are declared without signatures, as is usual for
 * private members in generated declaration files; their parameters and return
 * types are only visible in the implementation.
 */
export declare class HtmlCleaner {
    /**
     * Extract and clean search results HTML for optimal LLM consumption
     *
     * @param html - HTML source to process.
     * @returns A string; presumably the cleaned HTML - confirm against html-cleaner.js.
     */
    static extractSearchResults(html: string): string;
    /**
     * Get comprehensive cleaning statistics
     *
     * @param originalHtml - HTML before cleaning.
     * @param cleanedHtml - HTML after cleaning.
     * @returns A `CleaningStats` object.
     */
    static getCleaningStats(originalHtml: string, cleanedHtml: string): CleaningStats;
    /**
     * Detect if HTML contains search results
     */
    private static detectSearchResults;
    /**
     * Count approximate number of search results
     */
    private static countSearchResults;
    /**
     * Detect rich snippets and structured data
     */
    private static detectRichSnippets;
    /**
     * Calculate basic readability score
     */
    private static calculateReadabilityScore;
    /**
     * Extract structured data from HTML (JSON-LD, microdata, etc.)
     *
     * @param html - HTML source to scan.
     * @returns An untyped (`any[]`) array; element shape is not specified by this declaration.
     */
    static extractStructuredData(html: string): any[];
    /**
     * Extract meta information from HTML
     *
     * @param html - HTML source to scan.
     * @returns An object whose fields are all optional.
     */
    static extractMetaInfo(html: string): {
        title?: string;
        description?: string;
        keywords?: string;
        robots?: string;
        canonical?: string;
    };
}
|