cogniscrape 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +308 -0
- package/dist/graphs/AbstractGraph.d.ts +27 -0
- package/dist/graphs/AbstractGraph.d.ts.map +1 -0
- package/dist/graphs/AbstractGraph.js +44 -0
- package/dist/graphs/AbstractGraph.js.map +1 -0
- package/dist/graphs/BaseGraph.d.ts +30 -0
- package/dist/graphs/BaseGraph.d.ts.map +1 -0
- package/dist/graphs/BaseGraph.js +62 -0
- package/dist/graphs/BaseGraph.js.map +1 -0
- package/dist/graphs/CSVScraperGraph.d.ts +16 -0
- package/dist/graphs/CSVScraperGraph.d.ts.map +1 -0
- package/dist/graphs/CSVScraperGraph.js +84 -0
- package/dist/graphs/CSVScraperGraph.js.map +1 -0
- package/dist/graphs/DepthSearchGraph.d.ts +14 -0
- package/dist/graphs/DepthSearchGraph.d.ts.map +1 -0
- package/dist/graphs/DepthSearchGraph.js +45 -0
- package/dist/graphs/DepthSearchGraph.js.map +1 -0
- package/dist/graphs/JSONScraperGraph.d.ts +18 -0
- package/dist/graphs/JSONScraperGraph.d.ts.map +1 -0
- package/dist/graphs/JSONScraperGraph.js +100 -0
- package/dist/graphs/JSONScraperGraph.js.map +1 -0
- package/dist/graphs/SearchGraph.d.ts +14 -0
- package/dist/graphs/SearchGraph.d.ts.map +1 -0
- package/dist/graphs/SearchGraph.js +42 -0
- package/dist/graphs/SearchGraph.js.map +1 -0
- package/dist/graphs/SmartScraperGraph.d.ts +16 -0
- package/dist/graphs/SmartScraperGraph.d.ts.map +1 -0
- package/dist/graphs/SmartScraperGraph.js +57 -0
- package/dist/graphs/SmartScraperGraph.js.map +1 -0
- package/dist/graphs/SmartScraperMultiGraph.d.ts +17 -0
- package/dist/graphs/SmartScraperMultiGraph.d.ts.map +1 -0
- package/dist/graphs/SmartScraperMultiGraph.js +71 -0
- package/dist/graphs/SmartScraperMultiGraph.js.map +1 -0
- package/dist/graphs/index.d.ts +12 -0
- package/dist/graphs/index.d.ts.map +1 -0
- package/dist/graphs/index.js +23 -0
- package/dist/graphs/index.js.map +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +43 -0
- package/dist/index.js.map +1 -0
- package/dist/models/GeminiModel.d.ts +16 -0
- package/dist/models/GeminiModel.d.ts.map +1 -0
- package/dist/models/GeminiModel.js +127 -0
- package/dist/models/GeminiModel.js.map +1 -0
- package/dist/models/OllamaModel.d.ts +15 -0
- package/dist/models/OllamaModel.d.ts.map +1 -0
- package/dist/models/OllamaModel.js +134 -0
- package/dist/models/OllamaModel.js.map +1 -0
- package/dist/models/index.d.ts +8 -0
- package/dist/models/index.d.ts.map +1 -0
- package/dist/models/index.js +24 -0
- package/dist/models/index.js.map +1 -0
- package/dist/nodes/BaseNode.d.ts +37 -0
- package/dist/nodes/BaseNode.d.ts.map +1 -0
- package/dist/nodes/BaseNode.js +116 -0
- package/dist/nodes/BaseNode.js.map +1 -0
- package/dist/nodes/CSVExporterNode.d.ts +16 -0
- package/dist/nodes/CSVExporterNode.d.ts.map +1 -0
- package/dist/nodes/CSVExporterNode.js +85 -0
- package/dist/nodes/CSVExporterNode.js.map +1 -0
- package/dist/nodes/ConditionalNode.d.ts +16 -0
- package/dist/nodes/ConditionalNode.d.ts.map +1 -0
- package/dist/nodes/ConditionalNode.js +68 -0
- package/dist/nodes/ConditionalNode.js.map +1 -0
- package/dist/nodes/FetchNode.d.ts +15 -0
- package/dist/nodes/FetchNode.d.ts.map +1 -0
- package/dist/nodes/FetchNode.js +182 -0
- package/dist/nodes/FetchNode.js.map +1 -0
- package/dist/nodes/GenerateAnswerNode.d.ts +14 -0
- package/dist/nodes/GenerateAnswerNode.d.ts.map +1 -0
- package/dist/nodes/GenerateAnswerNode.js +86 -0
- package/dist/nodes/GenerateAnswerNode.js.map +1 -0
- package/dist/nodes/JSONExporterNode.d.ts +16 -0
- package/dist/nodes/JSONExporterNode.d.ts.map +1 -0
- package/dist/nodes/JSONExporterNode.js +42 -0
- package/dist/nodes/JSONExporterNode.js.map +1 -0
- package/dist/nodes/MergeNode.d.ts +10 -0
- package/dist/nodes/MergeNode.d.ts.map +1 -0
- package/dist/nodes/MergeNode.js +51 -0
- package/dist/nodes/MergeNode.js.map +1 -0
- package/dist/nodes/PDFScraperNode.d.ts +10 -0
- package/dist/nodes/PDFScraperNode.d.ts.map +1 -0
- package/dist/nodes/PDFScraperNode.js +80 -0
- package/dist/nodes/PDFScraperNode.js.map +1 -0
- package/dist/nodes/ParseNode.d.ts +12 -0
- package/dist/nodes/ParseNode.d.ts.map +1 -0
- package/dist/nodes/ParseNode.js +44 -0
- package/dist/nodes/ParseNode.js.map +1 -0
- package/dist/nodes/RAGNode.d.ts +13 -0
- package/dist/nodes/RAGNode.d.ts.map +1 -0
- package/dist/nodes/RAGNode.js +64 -0
- package/dist/nodes/RAGNode.js.map +1 -0
- package/dist/nodes/ReasoningNode.d.ts +10 -0
- package/dist/nodes/ReasoningNode.d.ts.map +1 -0
- package/dist/nodes/ReasoningNode.js +51 -0
- package/dist/nodes/ReasoningNode.js.map +1 -0
- package/dist/nodes/SearchNode.d.ts +13 -0
- package/dist/nodes/SearchNode.d.ts.map +1 -0
- package/dist/nodes/SearchNode.js +81 -0
- package/dist/nodes/SearchNode.js.map +1 -0
- package/dist/nodes/XMLScraperNode.d.ts +11 -0
- package/dist/nodes/XMLScraperNode.d.ts.map +1 -0
- package/dist/nodes/XMLScraperNode.js +99 -0
- package/dist/nodes/XMLScraperNode.js.map +1 -0
- package/dist/nodes/index.d.ts +17 -0
- package/dist/nodes/index.d.ts.map +1 -0
- package/dist/nodes/index.js +33 -0
- package/dist/nodes/index.js.map +1 -0
- package/dist/prompts/index.d.ts +12 -0
- package/dist/prompts/index.d.ts.map +1 -0
- package/dist/prompts/index.js +117 -0
- package/dist/prompts/index.js.map +1 -0
- package/dist/types.d.ts +106 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +13 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/cache.d.ts +28 -0
- package/dist/utils/cache.d.ts.map +1 -0
- package/dist/utils/cache.js +72 -0
- package/dist/utils/cache.js.map +1 -0
- package/dist/utils/chunking.d.ts +8 -0
- package/dist/utils/chunking.d.ts.map +1 -0
- package/dist/utils/chunking.js +51 -0
- package/dist/utils/chunking.js.map +1 -0
- package/dist/utils/cleanupHtml.d.ts +7 -0
- package/dist/utils/cleanupHtml.d.ts.map +1 -0
- package/dist/utils/cleanupHtml.js +81 -0
- package/dist/utils/cleanupHtml.js.map +1 -0
- package/dist/utils/convertToMarkdown.d.ts +6 -0
- package/dist/utils/convertToMarkdown.d.ts.map +1 -0
- package/dist/utils/convertToMarkdown.js +61 -0
- package/dist/utils/convertToMarkdown.js.map +1 -0
- package/dist/utils/index.d.ts +13 -0
- package/dist/utils/index.d.ts.map +1 -0
- package/dist/utils/index.js +40 -0
- package/dist/utils/index.js.map +1 -0
- package/dist/utils/logger.d.ts +14 -0
- package/dist/utils/logger.d.ts.map +1 -0
- package/dist/utils/logger.js +35 -0
- package/dist/utils/logger.js.map +1 -0
- package/dist/utils/proxy.d.ts +30 -0
- package/dist/utils/proxy.d.ts.map +1 -0
- package/dist/utils/proxy.js +62 -0
- package/dist/utils/proxy.js.map +1 -0
- package/dist/utils/rateLimiter.d.ts +24 -0
- package/dist/utils/rateLimiter.d.ts.map +1 -0
- package/dist/utils/rateLimiter.js +61 -0
- package/dist/utils/rateLimiter.js.map +1 -0
- package/dist/utils/retry.d.ts +17 -0
- package/dist/utils/retry.d.ts.map +1 -0
- package/dist/utils/retry.js +43 -0
- package/dist/utils/retry.js.map +1 -0
- package/dist/utils/schemaValidator.d.ts +69 -0
- package/dist/utils/schemaValidator.d.ts.map +1 -0
- package/dist/utils/schemaValidator.js +133 -0
- package/dist/utils/schemaValidator.js.map +1 -0
- package/package.json +64 -0
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Prompt templates for different scraping scenarios
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.REASONING_TEMPLATE = exports.TEMPLATE_MERGE_MD = exports.TEMPLATE_CHUNKS_MD = exports.TEMPLATE_NO_CHUNKS_MD = exports.TEMPLATE_MERGE = exports.TEMPLATE_CHUNKS = exports.TEMPLATE_NO_CHUNKS = void 0;
|
|
7
|
+
exports.formatPrompt = formatPrompt;
|
|
8
|
+
exports.TEMPLATE_NO_CHUNKS = `
|
|
9
|
+
You are a web scraping expert. Your task is to extract specific information from the provided HTML content based on the user's request.
|
|
10
|
+
|
|
11
|
+
User Request: {user_prompt}
|
|
12
|
+
|
|
13
|
+
HTML Content:
|
|
14
|
+
{content}
|
|
15
|
+
|
|
16
|
+
Instructions:
|
|
17
|
+
1. Carefully read the user's request
|
|
18
|
+
2. Extract ONLY the requested information from the HTML content
|
|
19
|
+
3. Return the data in a structured JSON format
|
|
20
|
+
4. If information is not found, return null for that field
|
|
21
|
+
5. Be precise and accurate
|
|
22
|
+
|
|
23
|
+
Respond with valid JSON only.
|
|
24
|
+
`;
|
|
25
|
+
exports.TEMPLATE_CHUNKS = `
|
|
26
|
+
You are a web scraping expert. Your task is to extract specific information from the provided content chunks based on the user's request.
|
|
27
|
+
|
|
28
|
+
User Request: {user_prompt}
|
|
29
|
+
|
|
30
|
+
Content Chunks:
|
|
31
|
+
{chunks}
|
|
32
|
+
|
|
33
|
+
Instructions:
|
|
34
|
+
1. Analyze all provided chunks
|
|
35
|
+
2. Extract ONLY the requested information
|
|
36
|
+
3. Combine information from multiple chunks if needed
|
|
37
|
+
4. Return the data in a structured JSON format
|
|
38
|
+
5. If information is not found, return null for that field
|
|
39
|
+
|
|
40
|
+
Respond with valid JSON only.
|
|
41
|
+
`;
|
|
42
|
+
exports.TEMPLATE_MERGE = `
|
|
43
|
+
You are a data merging expert. You have received multiple JSON responses from different content chunks. Your task is to merge them into a single, coherent response.
|
|
44
|
+
|
|
45
|
+
User Request: {user_prompt}
|
|
46
|
+
|
|
47
|
+
Responses to Merge:
|
|
48
|
+
{responses}
|
|
49
|
+
|
|
50
|
+
Instructions:
|
|
51
|
+
1. Merge all responses into a single JSON object
|
|
52
|
+
2. Remove duplicates
|
|
53
|
+
3. Ensure consistency across all fields
|
|
54
|
+
4. Maintain the original structure
|
|
55
|
+
5. If there are conflicts, prefer the most complete information
|
|
56
|
+
|
|
57
|
+
Respond with valid JSON only.
|
|
58
|
+
`;
|
|
59
|
+
exports.TEMPLATE_NO_CHUNKS_MD = `
|
|
60
|
+
You are a web scraping expert. Your task is to extract specific information from the provided Markdown content based on the user's request.
|
|
61
|
+
|
|
62
|
+
User Request: {user_prompt}
|
|
63
|
+
|
|
64
|
+
Markdown Content:
|
|
65
|
+
{content}
|
|
66
|
+
|
|
67
|
+
Instructions:
|
|
68
|
+
1. Carefully read the user's request
|
|
69
|
+
2. Extract ONLY the requested information from the Markdown content
|
|
70
|
+
3. Return the data in a structured JSON format
|
|
71
|
+
4. If information is not found, return null for that field
|
|
72
|
+
5. Be precise and accurate
|
|
73
|
+
|
|
74
|
+
Respond with valid JSON only.
|
|
75
|
+
`;
|
|
76
|
+
exports.TEMPLATE_CHUNKS_MD = `
|
|
77
|
+
You are a web scraping expert. Your task is to extract specific information from the provided Markdown chunks based on the user's request.
|
|
78
|
+
|
|
79
|
+
User Request: {user_prompt}
|
|
80
|
+
|
|
81
|
+
Markdown Chunks:
|
|
82
|
+
{chunks}
|
|
83
|
+
|
|
84
|
+
Instructions:
|
|
85
|
+
1. Analyze all provided chunks
|
|
86
|
+
2. Extract ONLY the requested information
|
|
87
|
+
3. Combine information from multiple chunks if needed
|
|
88
|
+
4. Return the data in a structured JSON format
|
|
89
|
+
5. If information is not found, return null for that field
|
|
90
|
+
|
|
91
|
+
Respond with valid JSON only.
|
|
92
|
+
`;
|
|
93
|
+
exports.TEMPLATE_MERGE_MD = exports.TEMPLATE_MERGE;
|
|
94
|
+
exports.REASONING_TEMPLATE = `
|
|
95
|
+
You are an AI reasoning expert. Analyze the following content and user request to determine the best approach for extracting the requested information.
|
|
96
|
+
|
|
97
|
+
User Request: {user_prompt}
|
|
98
|
+
|
|
99
|
+
Content:
|
|
100
|
+
{content}
|
|
101
|
+
|
|
102
|
+
Instructions:
|
|
103
|
+
1. Identify what information the user is looking for
|
|
104
|
+
2. Determine which parts of the content are relevant
|
|
105
|
+
3. Suggest a strategy for extracting this information
|
|
106
|
+
4. Consider edge cases and potential issues
|
|
107
|
+
|
|
108
|
+
Provide your reasoning and strategy.
|
|
109
|
+
`;
|
|
110
|
+
/**
 * Fill `{name}` placeholders in a prompt template.
 *
 * Every occurrence of `{key}` is replaced by the string form of
 * `variables[key]`, for each own enumerable key of `variables`. Placeholders
 * with no matching key are left untouched.
 *
 * Substitution happens in a single pass over the template:
 *  - values are inserted literally, so `$&`, `$1`, `$$`, `$'` etc. inside a
 *    value (common in scraped HTML/Markdown) are NOT treated as replacement
 *    patterns;
 *  - keys are matched literally, so regex metacharacters in a key are safe;
 *  - text inserted for one placeholder is never re-scanned, so a value that
 *    happens to contain `{other_key}` is not expanded.
 *
 * @param {string} template - Template text containing `{key}` placeholders.
 * @param {Record<string, string>} variables - Placeholder name → value.
 * @returns {string} The template with known placeholders filled in.
 */
function formatPrompt(template, variables) {
    const keys = Object.keys(variables);
    if (keys.length === 0) {
        // Nothing to substitute; also avoids building an empty alternation.
        return template;
    }
    // Escape regex metacharacters so each key is matched as literal text.
    const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const placeholder = new RegExp(`\\{(${keys.map(escapeRegExp).join('|')})\\}`, 'g');
    // A replacer function (rather than a replacement string) keeps `$`
    // sequences in the value from being interpreted.
    return template.replace(placeholder, (_match, key) => String(variables[key]));
}
|
|
117
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/prompts/index.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AA+GH,oCAQC;AArHY,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;CAgBjC,CAAC;AAEW,QAAA,eAAe,GAAG;;;;;;;;;;;;;;;;CAgB9B,CAAC;AAEW,QAAA,cAAc,GAAG;;;;;;;;;;;;;;;;CAgB7B,CAAC;AAEW,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;CAgBpC,CAAC;AAEW,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;CAgBjC,CAAC;AAEW,QAAA,iBAAiB,GAAG,sBAAc,CAAC;AAEnC,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;CAejC,CAAC;AAEF,SAAgB,YAAY,CAAC,QAAgB,EAAE,SAAiC;IAC9E,IAAI,SAAS,GAAG,QAAQ,CAAC;IAEzB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;QACrD,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,GAAG,KAAK,EAAE,GAAG,CAAC,EAAE,KAAK,CAAC,CAAC;IACxE,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Core type definitions for CogniScrape TypeScript
|
|
3
|
+
*/
|
|
4
|
+
export interface Document {
|
|
5
|
+
pageContent: string;
|
|
6
|
+
metadata: Record<string, any>;
|
|
7
|
+
}
|
|
8
|
+
export interface LLMConfig {
|
|
9
|
+
provider: 'ollama' | 'gemini';
|
|
10
|
+
model: string;
|
|
11
|
+
apiKey?: string;
|
|
12
|
+
baseUrl?: string;
|
|
13
|
+
temperature?: number;
|
|
14
|
+
maxTokens?: number;
|
|
15
|
+
}
|
|
16
|
+
export interface ProxyConfig {
|
|
17
|
+
enabled?: boolean;
|
|
18
|
+
proxies?: string[];
|
|
19
|
+
maxFailures?: number;
|
|
20
|
+
host?: string;
|
|
21
|
+
port?: number;
|
|
22
|
+
username?: string;
|
|
23
|
+
password?: string;
|
|
24
|
+
protocol?: 'http' | 'https' | 'socks4' | 'socks5';
|
|
25
|
+
}
|
|
26
|
+
export interface RetryConfig {
|
|
27
|
+
maxRetries?: number;
|
|
28
|
+
initialDelay?: number;
|
|
29
|
+
maxDelay?: number;
|
|
30
|
+
backoffMultiplier?: number;
|
|
31
|
+
}
|
|
32
|
+
export interface RateLimitConfig {
|
|
33
|
+
maxRequests?: number;
|
|
34
|
+
windowMs?: number;
|
|
35
|
+
minDelay?: number;
|
|
36
|
+
}
|
|
37
|
+
export interface CacheConfig {
|
|
38
|
+
enabled?: boolean;
|
|
39
|
+
ttl?: number;
|
|
40
|
+
maxSize?: number;
|
|
41
|
+
}
|
|
42
|
+
export interface ScraperConfig {
|
|
43
|
+
llm: LLMConfig;
|
|
44
|
+
verbose?: boolean;
|
|
45
|
+
headless?: boolean;
|
|
46
|
+
timeout?: number;
|
|
47
|
+
cut?: boolean;
|
|
48
|
+
force?: boolean;
|
|
49
|
+
loaderKwargs?: Record<string, any>;
|
|
50
|
+
additionalInfo?: string;
|
|
51
|
+
schema?: any;
|
|
52
|
+
reattempt?: boolean;
|
|
53
|
+
reasoning?: boolean;
|
|
54
|
+
htmlMode?: boolean;
|
|
55
|
+
proxy?: ProxyConfig | ProxyConfig[];
|
|
56
|
+
retry?: RetryConfig;
|
|
57
|
+
rateLimit?: RateLimitConfig;
|
|
58
|
+
cache?: CacheConfig;
|
|
59
|
+
maxDepth?: number;
|
|
60
|
+
searchEngine?: 'duckduckgo' | 'google' | 'bing';
|
|
61
|
+
}
|
|
62
|
+
export interface NodeConfig {
|
|
63
|
+
llmModel?: BaseLLM;
|
|
64
|
+
verbose?: boolean;
|
|
65
|
+
force?: boolean;
|
|
66
|
+
cut?: boolean;
|
|
67
|
+
timeout?: number;
|
|
68
|
+
loaderKwargs?: Record<string, any>;
|
|
69
|
+
headless?: boolean;
|
|
70
|
+
chunkSize?: number;
|
|
71
|
+
additionalInfo?: string;
|
|
72
|
+
schema?: any;
|
|
73
|
+
proxyConfig?: ProxyConfig;
|
|
74
|
+
retryConfig?: RetryConfig;
|
|
75
|
+
rateLimitConfig?: RateLimitConfig;
|
|
76
|
+
cacheConfig?: CacheConfig;
|
|
77
|
+
}
|
|
78
|
+
export interface GraphState {
|
|
79
|
+
[key: string]: any;
|
|
80
|
+
userPrompt?: string;
|
|
81
|
+
url?: string;
|
|
82
|
+
localDir?: string;
|
|
83
|
+
doc?: Document[];
|
|
84
|
+
parsedDoc?: Document[];
|
|
85
|
+
relevantChunks?: Document[];
|
|
86
|
+
answer?: any;
|
|
87
|
+
}
|
|
88
|
+
export declare abstract class BaseLLM {
|
|
89
|
+
protected config: LLMConfig;
|
|
90
|
+
constructor(config: LLMConfig);
|
|
91
|
+
abstract generate(prompt: string, systemPrompt?: string): Promise<string>;
|
|
92
|
+
abstract generateJson(prompt: string, systemPrompt?: string): Promise<any>;
|
|
93
|
+
}
|
|
94
|
+
export interface ChatMessage {
|
|
95
|
+
role: 'user' | 'assistant' | 'system';
|
|
96
|
+
content: string;
|
|
97
|
+
}
|
|
98
|
+
export interface LLMResponse {
|
|
99
|
+
content: string;
|
|
100
|
+
usage?: {
|
|
101
|
+
promptTokens: number;
|
|
102
|
+
completionTokens: number;
|
|
103
|
+
totalTokens: number;
|
|
104
|
+
};
|
|
105
|
+
}
|
|
106
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,QAAQ;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC/B;AAED,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAC;CACnD;AAED,MAAM,WAAW,WAAW;IAC1B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,MAAM,WAAW,eAAe;IAC9B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,aAAa;IAC5B,GAAG,EAAE,SAAS,CAAC;IACf,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,GAAG,CAAC,EAAE,OAAO,CAAC;IACd,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IACnC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,MAAM,CAAC,EAAE,GAAG,CAAC;IACb,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,QAAQ,CAAC,EAAE,OAAO,CAAC;IAEnB,KAAK,CAAC,EAAE,WAAW,GAAG,WAAW,EAAE,CAAC;IACpC,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,SAAS,CAAC,EAAE,eAAe,CAAC;IAC5B,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,YAAY,GAAG,QAAQ,GAAG,MAAM,CAAC;CACjD;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,GAAG,CAAC,EAAE,OAAO,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IACnC,QAAQ,
CAAC,EAAE,OAAO,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,MAAM,CAAC,EAAE,GAAG,CAAC;IAEb,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,WAAW,CAAC,EAAE,WAAW,CAAC;CAC3B;AAED,MAAM,WAAW,UAAU;IACzB,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,GAAG,CAAC,EAAE,QAAQ,EAAE,CAAC;IACjB,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IACvB,cAAc,CAAC,EAAE,QAAQ,EAAE,CAAC;IAC5B,MAAM,CAAC,EAAE,GAAG,CAAC;CACd;AAED,8BAAsB,OAAO;IAC3B,SAAS,CAAC,MAAM,EAAE,SAAS,CAAC;gBAEhB,MAAM,EAAE,SAAS;IAI7B,QAAQ,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,EAAE,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IACzE,QAAQ,CAAC,YAAY,CAAC,MAAM,EAAE,MAAM,EAAE,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC;CAC3E;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,GAAG,WAAW,GAAG,QAAQ,CAAC;IACtC,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,gBAAgB,EAAE,MAAM,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;CACH"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Core type definitions for CogniScrape TypeScript
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.BaseLLM = void 0;
|
|
7
|
+
/**
 * Common base for LLM provider wrappers.
 *
 * At runtime this class only remembers the configuration it was given.
 * The accompanying type declarations mark it abstract and declare
 * `generate` and `generateJson`; neither method is defined here.
 */
class BaseLLM {
    /**
     * @param {object} config - Provider configuration, stored unchanged on
     *   `this.config`.
     */
    constructor(config) {
        this.config = config;
    }
}
|
|
12
|
+
exports.BaseLLM = BaseLLM;
|
|
13
|
+
//# sourceMappingURL=types.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAiGH,MAAsB,OAAO;IAG3B,YAAY,MAAiB;QAC3B,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;CAIF;AATD,0BASC"}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simple in-memory cache with TTL
|
|
3
|
+
*/
|
|
4
|
+
export interface CacheOptions {
|
|
5
|
+
ttl?: number;
|
|
6
|
+
maxSize?: number;
|
|
7
|
+
}
|
|
8
|
+
export declare class Cache<T = any> {
|
|
9
|
+
private cache;
|
|
10
|
+
private ttl;
|
|
11
|
+
private maxSize;
|
|
12
|
+
constructor(options?: CacheOptions);
|
|
13
|
+
set(key: string, value: T, customTtl?: number): void;
|
|
14
|
+
get<K = T>(key: string): K | undefined;
|
|
15
|
+
has(key: string): boolean;
|
|
16
|
+
delete(key: string): boolean;
|
|
17
|
+
clear(): void;
|
|
18
|
+
size(): number;
|
|
19
|
+
private cleanExpired;
|
|
20
|
+
getStats(): {
|
|
21
|
+
size: number;
|
|
22
|
+
maxSize: number;
|
|
23
|
+
ttl: number;
|
|
24
|
+
};
|
|
25
|
+
}
|
|
26
|
+
export declare const cache: Cache<any>;
|
|
27
|
+
export declare const globalCache: Cache<any>;
|
|
28
|
+
//# sourceMappingURL=cache.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache.d.ts","sourceRoot":"","sources":["../../src/utils/cache.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,YAAY;IAC3B,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAOD,qBAAa,KAAK,CAAC,CAAC,GAAG,GAAG;IACxB,OAAO,CAAC,KAAK,CAAoC;IACjD,OAAO,CAAC,GAAG,CAAS;IACpB,OAAO,CAAC,OAAO,CAAS;gBAEZ,OAAO,GAAE,YAAiB;IAKtC,GAAG,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,IAAI;IAcpD,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,GAAG,EAAE,MAAM,GAAG,CAAC,GAAG,SAAS;IAgBtC,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAIzB,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAI5B,KAAK,IAAI,IAAI;IAIb,IAAI,IAAI,MAAM;IAMd,OAAO,CAAC,YAAY;IASpB,QAAQ,IAAI;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE;CAQ3D;AAGD,eAAO,MAAM,KAAK,YAAc,CAAC;AACjC,eAAO,MAAM,WAAW,YAAQ,CAAC"}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Simple in-memory cache with TTL
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.globalCache = exports.cache = exports.Cache = void 0;
|
|
7
|
+
/**
 * Simple in-memory key/value cache with per-entry expiry and a size cap.
 *
 * Entries live in a Map, so iteration order is first-insertion order; when
 * the cache is full and a new key is added, the first entry in that order
 * is evicted. Expired entries are removed lazily (on `get`/`has`) and in
 * bulk by `size()` / `getStats()`.
 */
class Cache {
    /**
     * @param {{ttl?: number, maxSize?: number}} [options]
     *   `ttl` is the default lifetime in milliseconds, `maxSize` the maximum
     *   number of entries. Missing or falsy values (including 0) fall back
     *   to 5 minutes and 1000 entries respectively.
     */
    constructor(options = {}) {
        this.cache = new Map();
        this.ttl = options.ttl || 5 * 60 * 1000; // Default 5 minutes
        this.maxSize = options.maxSize || 1000;
    }

    /**
     * Store `value` under `key`.
     *
     * @param {string} key
     * @param {*} value
     * @param {number} [customTtl] - Lifetime in ms for this entry; a missing
     *   or falsy value uses the cache-wide default.
     */
    set(key, value, customTtl) {
        // Overwriting an existing key does not grow the map, so only a
        // genuinely new key may trigger an eviction.
        if (!this.cache.has(key) && this.cache.size >= this.maxSize) {
            // Check the iterator's `done` flag rather than the key's
            // truthiness, so an empty-string key can be evicted too.
            const oldest = this.cache.keys().next();
            if (!oldest.done) {
                this.cache.delete(oldest.value);
            }
        }
        const expiry = Date.now() + (customTtl || this.ttl);
        this.cache.set(key, { value, expiry });
    }

    /**
     * @param {string} key
     * @returns {*} The stored value, or `undefined` when the key is absent
     *   or its entry has expired (an expired entry is deleted).
     */
    get(key) {
        const entry = this.cache.get(key);
        if (!entry) {
            return undefined;
        }
        // Check if expired
        if (Date.now() > entry.expiry) {
            this.cache.delete(key);
            return undefined;
        }
        return entry.value;
    }

    /**
     * @param {string} key
     * @returns {boolean} True when a non-expired entry exists for `key`,
     *   regardless of the stored value (including `undefined`).
     */
    has(key) {
        const entry = this.cache.get(key);
        if (!entry) {
            return false;
        }
        if (Date.now() > entry.expiry) {
            this.cache.delete(key);
            return false;
        }
        return true;
    }

    /**
     * @param {string} key
     * @returns {boolean} True when an entry was removed.
     */
    delete(key) {
        return this.cache.delete(key);
    }

    /** Remove every entry. */
    clear() {
        this.cache.clear();
    }

    /**
     * @returns {number} Number of entries left after dropping expired ones.
     */
    size() {
        // Clean expired entries
        this.cleanExpired();
        return this.cache.size;
    }

    /** Delete every entry whose expiry time has passed. */
    cleanExpired() {
        const now = Date.now();
        for (const [key, entry] of this.cache.entries()) {
            if (now > entry.expiry) {
                this.cache.delete(key);
            }
        }
    }

    /**
     * @returns {{size: number, maxSize: number, ttl: number}} Current entry
     *   count (after dropping expired entries) and the configured limits.
     */
    getStats() {
        this.cleanExpired();
        return {
            size: this.cache.size,
            maxSize: this.maxSize,
            ttl: this.ttl,
        };
    }
}
|
|
68
|
+
exports.Cache = Cache;
|
|
69
|
+
// Global cache instance
|
|
70
|
+
exports.cache = new Cache();
|
|
71
|
+
exports.globalCache = exports.cache;
|
|
72
|
+
//# sourceMappingURL=cache.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cache.js","sourceRoot":"","sources":["../../src/utils/cache.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAYH,MAAa,KAAK;IAKhB,YAAY,UAAwB,EAAE;QAJ9B,UAAK,GAAG,IAAI,GAAG,EAAyB,CAAC;QAK/C,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,oBAAoB;QAC7D,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC;IACzC,CAAC;IAED,GAAG,CAAC,GAAW,EAAE,KAAQ,EAAE,SAAkB;QAC3C,mBAAmB;QACnB,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACpC,qCAAqC;YACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC;YAChD,IAAI,QAAQ,EAAE,CAAC;gBACb,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,SAAS,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC;QACpD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,GAAG,CAAQ,GAAW;QACpB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAElC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,mBAAmB;QACnB,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC9B,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACvB,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,OAAO,KAAK,CAAC,KAAqB,CAAC;IACrC,CAAC;IAED,GAAG,CAAC,GAAW;QACb,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,SAAS,CAAC;IACrC,CAAC;IAED,MAAM,CAAC,GAAW;QAChB,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;IAChC,CAAC;IAED,KAAK;QACH,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;IAED,IAAI;QACF,wBAAwB;QACxB,IAAI,CAAC,YAAY,EAAE,CAAC;QACpB,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAEO,YAAY;QAClB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;YAChD,IAAI,GAAG,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;gBACvB,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACzB,CAAC;QACH,CAAC;IACH,CAAC;IAED,QAAQ;QACN,IAAI,CAAC,YAAY,EAAE,CAAC;QACpB,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACrB,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,GAAG,EAAE,IAAI,CAAC,GAAG;SACd,CAAC;IACJ,CAAC;CACF;AA3ED,sBA2EC;AAED,wBAAwB;
AACX,QAAA,KAAK,GAAG,IAAI,KAAK,EAAE,CAAC;AACpB,QAAA,WAAW,GAAG,aAAK,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Text chunking utilities
|
|
3
|
+
*/
|
|
4
|
+
import { Document } from '../types';
|
|
5
|
+
export declare function chunkText(text: string, chunkSize?: number, overlap?: number): string[];
|
|
6
|
+
export declare function chunkDocuments(docs: Document[], chunkSize?: number): Document[];
|
|
7
|
+
export declare function mergeDocuments(docs: Document[]): Document;
|
|
8
|
+
//# sourceMappingURL=chunking.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunking.d.ts","sourceRoot":"","sources":["../../src/utils/chunking.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAEpC,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,GAAE,MAAa,EAAE,OAAO,GAAE,MAAY,GAAG,MAAM,EAAE,CAoBjG;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,QAAQ,EAAE,EAAE,SAAS,GAAE,MAAa,GAAG,QAAQ,EAAE,CAmBrF;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,QAAQ,EAAE,GAAG,QAAQ,CAQzD"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Text chunking utilities
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.chunkText = chunkText;
|
|
7
|
+
exports.chunkDocuments = chunkDocuments;
|
|
8
|
+
exports.mergeDocuments = mergeDocuments;
|
|
9
|
+
/**
 * Split text into fixed-size windows that overlap by `overlap` characters.
 *
 * Each window starts `chunkSize - overlap` characters after the previous
 * one. Splitting stops as soon as a window reaches the end of the text, so
 * no trailing chunk is emitted that is wholly contained in the previous one.
 *
 * @param {string} text - Text to split; empty or missing text yields `[]`.
 * @param {number} [chunkSize=2000] - Maximum characters per chunk (> 0).
 * @param {number} [overlap=200] - Characters shared by consecutive chunks;
 *   must be smaller than `chunkSize`.
 * @returns {string[]} Chunks in document order.
 * @throws {RangeError} If `chunkSize` is not positive or `overlap` is not
 *   smaller than `chunkSize` (the window would never advance).
 */
function chunkText(text, chunkSize = 2000, overlap = 200) {
    if (!text || text.length === 0) {
        return [];
    }
    const step = chunkSize - overlap;
    // Negated comparisons so NaN is rejected as well.
    if (!(chunkSize > 0) || !(step > 0)) {
        throw new RangeError(
            `chunkSize must be positive and greater than overlap (got chunkSize=${chunkSize}, overlap=${overlap})`
        );
    }
    const chunks = [];
    for (let start = 0; start < text.length; start += step) {
        const end = Math.min(start + chunkSize, text.length);
        chunks.push(text.slice(start, end));
        // The whole text is covered once a chunk touches the end.
        if (end === text.length) {
            break;
        }
    }
    return chunks;
}
|
|
26
|
+
/**
 * Split each document's text with `chunkText` and return one document per
 * resulting chunk.
 *
 * Every output document copies the source document's metadata and adds
 * `chunkIndex` (position within its source document) and `totalChunks`
 * (number of chunks produced from that source document).
 *
 * @param {Array<{pageContent: string, metadata: object}>} docs
 * @param {number} [chunkSize=2000] - Passed through to `chunkText`.
 * @returns {Array<{pageContent: string, metadata: object}>}
 */
function chunkDocuments(docs, chunkSize = 2000) {
    const result = [];
    for (const doc of docs) {
        const pieces = chunkText(doc.pageContent, chunkSize);
        pieces.forEach((piece, position) => {
            const metadata = {
                ...doc.metadata,
                chunkIndex: position,
                totalChunks: pieces.length,
            };
            result.push({ pageContent: piece, metadata });
        });
    }
    return result;
}
|
|
43
|
+
/**
 * Collapse several documents into one.
 *
 * The page contents are joined with a blank line between them, and the
 * metadata objects are shallow-merged in order, so a key from a later
 * document overrides the same key from an earlier one.
 *
 * @param {Array<{pageContent: string, metadata: object}>} docs
 * @returns {{pageContent: string, metadata: object}}
 */
function mergeDocuments(docs) {
    const pageContent = docs.map(({ pageContent: text }) => text).join('\n\n');
    const metadata = docs.reduce(
        (merged, { metadata: extra }) => ({ ...merged, ...extra }),
        {}
    );
    return { pageContent, metadata };
}
|
|
51
|
+
//# sourceMappingURL=chunking.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunking.js","sourceRoot":"","sources":["../../src/utils/chunking.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAIH,8BAoBC;AAED,wCAmBC;AAED,wCAQC;AAnDD,SAAgB,SAAS,CAAC,IAAY,EAAE,YAAoB,IAAI,EAAE,UAAkB,GAAG;IACrF,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAEnB,sCAAsC;QACtC,KAAK,IAAI,SAAS,GAAG,OAAO,CAAC;QAE7B,IAAI,KAAK,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM;IAClC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAgB,cAAc,CAAC,IAAgB,EAAE,YAAoB,IAAI;IACvE,MAAM,WAAW,GAAe,EAAE,CAAC;IAEnC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC;QAErD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,WAAW,CAAC,IAAI,CAAC;gBACf,WAAW,EAAE,MAAM,CAAC,CAAC,CAAC;gBACtB,QAAQ,EAAE;oBACR,GAAG,GAAG,CAAC,QAAQ;oBACf,UAAU,EAAE,CAAC;oBACb,WAAW,EAAE,MAAM,CAAC,MAAM;iBAC3B;aACF,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC;AAED,SAAgB,cAAc,CAAC,IAAgB;IAC7C,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACpE,MAAM,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,GAAG,EAAE,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;IAEpF,OAAO;QACL,WAAW,EAAE,aAAa;QAC1B,QAAQ,EAAE,cAAc;KACzB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML cleanup and minification utilities
|
|
3
|
+
*/
|
|
4
|
+
export declare function cleanupHtml(html: string, cut?: boolean): string;
|
|
5
|
+
export declare function removeUnwantedTags(html: string): string;
|
|
6
|
+
export declare function extractMainContent(html: string): string;
|
|
7
|
+
//# sourceMappingURL=cleanupHtml.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cleanupHtml.d.ts","sourceRoot":"","sources":["../../src/utils/cleanupHtml.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,GAAE,OAAc,GAAG,MAAM,CAqCrE;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAuBvD;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiBvD"}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* HTML cleanup and minification utilities
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.cleanupHtml = cleanupHtml;
|
|
7
|
+
exports.removeUnwantedTags = removeUnwantedTags;
|
|
8
|
+
exports.extractMainContent = extractMainContent;
|
|
9
|
+
const html_minifier_1 = require("html-minifier");
|
|
10
|
+
/**
 * Strip scripts, styles and comments from an HTML string and optionally
 * minify the result.
 *
 * @param {string} html - Raw HTML markup. Falsy input yields an empty string.
 * @param {boolean} [cut=true] - When true, the stripped markup is additionally
 *   run through html-minifier, has whitespace runs collapsed to single spaces,
 *   and has empty element pairs (e.g. `<b></b>`) removed.
 * @returns {string} The cleaned markup. If any step throws, the error is logged
 *   and the most-cleaned version produced so far is returned (the input itself
 *   only when stripping could not run at all).
 */
function cleanupHtml(html, cut = true) {
    if (!html)
        return '';
    // Tracks the best result so far so a later failure (typically the minifier
    // rejecting malformed markup) does not discard the stripping already done.
    let stripped = html;
    try {
        // Closing tags tolerate trailing whitespace (`</script >`), which
        // browsers accept as a valid end tag.
        stripped = html
            .replace(/<script\b[^<]*(?:(?!<\/script\s*>)<[^<]*)*<\/script\s*>/gi, '')
            .replace(/<style\b[^<]*(?:(?!<\/style\s*>)<[^<]*)*<\/style\s*>/gi, '')
            .replace(/<!--[\s\S]*?-->/g, '');
        if (!cut) {
            return stripped;
        }
        const minified = (0, html_minifier_1.minify)(stripped, {
            collapseWhitespace: true,
            removeComments: true,
            removeEmptyAttributes: true,
            removeRedundantAttributes: true,
            removeScriptTypeAttributes: true,
            removeStyleLinkTypeAttributes: true,
            useShortDoctype: true,
            minifyCSS: true,
            minifyJS: true,
        });
        return minified
            // Remove excessive whitespace
            .replace(/\s+/g, ' ')
            .trim()
            // Remove empty tags (single pass; the back-reference pairs the names)
            .replace(/<(\w+)(\s[^>]*)?\s*>\s*<\/\1>/g, '');
    }
    catch (error) {
        console.error('Error cleaning HTML:', error);
        return stripped;
    }
}
|
|
44
|
+
/**
 * Remove whole elements (opening tag through closing tag) for a fixed list of
 * tag names that carry no page content: scripts, styles, embedded frames, SVG
 * pieces and page chrome such as header/footer/nav.
 *
 * Only paired elements are removed; a tag without a matching closing tag is
 * left untouched.
 *
 * @param {string} html - HTML markup. Falsy input yields an empty string.
 * @returns {string} The markup with the unwanted elements removed.
 */
function removeUnwantedTags(html) {
    if (!html)
        return '';
    const unwantedTags = [
        'script',
        'style',
        'noscript',
        'iframe',
        'svg',
        'path',
        'symbol',
        'use',
        'header',
        'footer',
        'nav',
    ];
    return unwantedTags.reduce((remaining, tag) => {
        // The lookahead requires the tag name to end right there, so custom
        // elements such as <nav-item> are not mistaken for <nav>. The closing
        // tag tolerates trailing whitespace (`</nav >`).
        const pattern = new RegExp(`<${tag}(?=[\\s/>])[^<]*(?:(?!<\\/${tag}\\s*>)<[^<]*)*<\\/${tag}\\s*>`, 'gi');
        return remaining.replace(pattern, '');
    }, html);
}
|
|
65
|
+
/**
 * Return the inner HTML of the first likely "main content" container.
 *
 * Candidates are tried in order: `<main>`, `<article>`, a `<div>` whose
 * double-quoted `class` contains "content", then a `<div>` whose double-quoted
 * `id` contains "content". Nested elements of the same tag are balanced, so a
 * content `<div>` holding other `<div>`s is returned whole instead of being cut
 * at the first inner `</div>`.
 *
 * @param {string} html - HTML markup. Falsy input yields an empty string.
 * @returns {string} Inner markup of the first matching container, or the
 *   input unchanged when no candidate has a closing tag.
 */
function extractMainContent(html) {
    if (!html)
        return '';
    // Try to find main content areas
    const candidates = [
        { tag: 'main', opening: /<main(?=[\s/>])[^>]*>/i },
        { tag: 'article', opening: /<article(?=[\s/>])[^>]*>/i },
        { tag: 'div', opening: /<div\b[^>]*class="[^"]*content[^"]*"[^>]*>/i },
        { tag: 'div', opening: /<div\b[^>]*id="[^"]*content[^"]*"[^>]*>/i },
    ];
    for (const { tag, opening } of candidates) {
        const openMatch = opening.exec(html);
        if (!openMatch) {
            continue;
        }
        const inner = sliceBalancedContent(html, tag, openMatch.index + openMatch[0].length);
        if (inner !== null) {
            return inner;
        }
    }
    return html;
}
/**
 * Private helper: slice `html` from `from` up to the closing `tag` that
 * balances an element already opened just before `from`.
 *
 * Falls back to the first closing tag when the markup is unbalanced, and
 * returns null when there is no closing tag at all.
 */
function sliceBalancedContent(html, tag, from) {
    const tagToken = new RegExp(`<(\\/?)${tag}(?=[\\s/>])[^>]*>`, 'gi');
    tagToken.lastIndex = from;
    let depth = 1;
    let firstClose = -1;
    let token = tagToken.exec(html);
    while (token !== null) {
        if (token[1] === '/') {
            if (firstClose === -1) {
                firstClose = token.index;
            }
            depth -= 1;
            if (depth === 0) {
                return html.slice(from, token.index);
            }
        }
        else {
            depth += 1;
        }
        token = tagToken.exec(html);
    }
    return firstClose === -1 ? null : html.slice(from, firstClose);
}
|
|
81
|
+
//# sourceMappingURL=cleanupHtml.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cleanupHtml.js","sourceRoot":"","sources":["../../src/utils/cleanupHtml.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAIH,kCAqCC;AAED,gDAuBC;AAED,gDAiBC;AAnFD,iDAAuC;AAEvC,SAAgB,WAAW,CAAC,IAAY,EAAE,MAAe,IAAI;IAC3D,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IAErB,IAAI,CAAC;QACH,+BAA+B;QAC/B,IAAI,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,qDAAqD,EAAE,EAAE,CAAC,CAAC;QACtF,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC,CAAC;QAElF,kBAAkB;QAClB,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;QAElD,IAAI,GAAG,EAAE,CAAC;YACR,cAAc;YACd,OAAO,GAAG,IAAA,sBAAM,EAAC,OAAO,EAAE;gBACxB,kBAAkB,EAAE,IAAI;gBACxB,cAAc,EAAE,IAAI;gBACpB,qBAAqB,EAAE,IAAI;gBAC3B,yBAAyB,EAAE,IAAI;gBAC/B,0BAA0B,EAAE,IAAI;gBAChC,6BAA6B,EAAE,IAAI;gBACnC,eAAe,EAAE,IAAI;gBACrB,SAAS,EAAE,IAAI;gBACf,QAAQ,EAAE,IAAI;aACf,CAAC,CAAC;YAEH,8BAA8B;YAC9B,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAE9C,oBAAoB;YACpB,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,gCAAgC,EAAE,EAAE,CAAC,CAAC;QAClE,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,sBAAsB,EAAE,KAAK,CAAC,CAAC;QAC7C,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAgB,kBAAkB,CAAC,IAAY;IAC7C,MAAM,YAAY,GAAG;QACnB,QAAQ;QACR,OAAO;QACP,UAAU;QACV,QAAQ;QACR,KAAK;QACL,MAAM;QACN,QAAQ;QACR,KAAK;QACL,QAAQ;QACR,QAAQ;QACR,KAAK;KACN,CAAC;IAEF,IAAI,OAAO,GAAG,IAAI,CAAC;IAEnB,KAAK,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QAC/B,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,GAAG,qBAAqB,GAAG,iBAAiB,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QACvF,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACvC,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAgB,kBAAkB,CAAC,IAAY;IAC7C,iCAAiC;IACjC,MAAM,YAAY,GAAG;QACnB,gCAAgC;QAChC,sCAAsC;QACtC,4DAA4D;QAC5D,yDAAyD;KAC1D,CAAC;IAEF,KAAK,MAAM,OAAO,IAAI,YAAY,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAClC,IAAI,KAAK,EAAE,CAAC;YACV,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"convertToMarkdown.d.ts","sourceRoot":"","sources":["../../src/utils/convertToMarkdown.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAqCtD;AAED,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAcxD"}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Convert HTML to Markdown
|
|
4
|
+
*/
|
|
5
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
6
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
7
|
+
};
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.convertToMarkdown = convertToMarkdown;
|
|
10
|
+
exports.extractTextFromHtml = extractTextFromHtml;
|
|
11
|
+
const turndown_1 = __importDefault(require("turndown"));
|
|
12
|
+
/**
 * Convert an HTML string to Markdown using Turndown.
 *
 * Images and inline SVG are dropped, links are emitted as `[text](href)`, and
 * runs of three or more newlines are collapsed to a single blank line.
 *
 * @param {string} html - HTML markup. Falsy input yields an empty string.
 * @returns {string} The Markdown text, or the original `html` if the
 *   conversion throws (the error is logged).
 */
function convertToMarkdown(html) {
    if (!html)
        return '';
    const turndownService = new turndown_1.default({
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        strongDelimiter: '**',
        bulletListMarker: '-',
    });
    // Add custom rules
    const dropElement = () => '';
    turndownService.addRule('removeImages', {
        filter: ['img'],
        replacement: dropElement,
    });
    turndownService.addRule('removeSvg', {
        filter: ['svg'],
        replacement: dropElement,
    });
    turndownService.addRule('preserveLinks', {
        filter: 'a',
        replacement: (content, node) => {
            const href = node.getAttribute('href');
            if (!href) {
                return content;
            }
            // Escape parentheses so URLs such as ".../Foo_(bar)" do not end
            // the Markdown link destination early.
            const safeHref = href.replace(/([()])/g, '\\$1');
            return `[${content}](${safeHref})`;
        },
    });
    try {
        const markdown = turndownService.turndown(html);
        // Clean up excessive newlines
        return markdown.replace(/\n{3,}/g, '\n\n').trim();
    }
    catch (error) {
        console.error('Error converting to markdown:', error);
        return html;
    }
}
|
|
46
|
+
/**
 * Reduce an HTML string to plain text.
 *
 * Script and style elements are removed with their contents, every remaining
 * tag is replaced by a space, a small set of common character entities is
 * decoded, and whitespace is collapsed to single spaces.
 *
 * @param {string} html - HTML markup. Falsy input yields an empty string.
 * @returns {string} The trimmed plain-text content.
 */
function extractTextFromHtml(html) {
    if (!html)
        return '';
    const entityText = {
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&#39;': "'",
    };
    return html
        // Closing tags tolerate trailing whitespace (`</script >`).
        .replace(/<script\b[^<]*(?:(?!<\/script\s*>)<[^<]*)*<\/script\s*>/gi, '')
        .replace(/<style\b[^<]*(?:(?!<\/style\s*>)<[^<]*)*<\/style\s*>/gi, '')
        // Remove all HTML tags and get plain text
        .replace(/<[^>]+>/g, ' ')
        // Decode in a single pass so an escaped entity such as "&amp;lt;"
        // becomes the literal text "&lt;" rather than being unescaped twice.
        .replace(/&(?:nbsp|amp|lt|gt|quot|#39);/g, (entity) => entityText[entity])
        .replace(/\s+/g, ' ')
        .trim();
}
|
|
61
|
+
//# sourceMappingURL=convertToMarkdown.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"convertToMarkdown.js","sourceRoot":"","sources":["../../src/utils/convertToMarkdown.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;AAIH,8CAqCC;AAED,kDAcC;AAvDD,wDAAuC;AAEvC,SAAgB,iBAAiB,CAAC,IAAY;IAC5C,MAAM,eAAe,GAAG,IAAI,kBAAe,CAAC;QAC1C,YAAY,EAAE,KAAK;QACnB,cAAc,EAAE,QAAQ;QACxB,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;QACrB,gBAAgB,EAAE,GAAG;KACtB,CAAC,CAAC;IAEH,mBAAmB;IACnB,eAAe,CAAC,OAAO,CAAC,cAAc,EAAE;QACtC,MAAM,EAAE,CAAC,KAAK,CAAC;QACf,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;KACtB,CAAC,CAAC;IAEH,eAAe,CAAC,OAAO,CAAC,WAAW,EAAE;QACnC,MAAM,EAAE,CAAC,KAAK,CAAC;QACf,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;KACtB,CAAC,CAAC;IAEH,eAAe,CAAC,OAAO,CAAC,eAAe,EAAE;QACvC,MAAM,EAAE,GAAG;QACX,WAAW,EAAE,CAAC,OAAO,EAAE,IAAS,EAAE,EAAE;YAClC,MAAM,IAAI,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YACvC,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,OAAO,KAAK,IAAI,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC;QAClD,CAAC;KACF,CAAC,CAAC;IAEH,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEhD,8BAA8B;QAC9B,OAAO,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;IACpD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,+BAA+B,EAAE,KAAK,CAAC,CAAC;QACtD,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAgB,mBAAmB,CAAC,IAAY;IAC9C,0CAA0C;IAC1C,OAAO,IAAI;SACR,OAAO,CAAC,qDAAqD,EAAE,EAAE,CAAC;SAClE,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC;SAC/D,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;SACxB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CAAC;AACZ,CAAC"}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Utility functions index
|
|
3
|
+
*/
|
|
4
|
+
export * from './cleanupHtml';
|
|
5
|
+
export * from './convertToMarkdown';
|
|
6
|
+
export * from './logger';
|
|
7
|
+
export * from './chunking';
|
|
8
|
+
export { retry } from './retry';
|
|
9
|
+
export { RateLimiter, rateLimiter } from './rateLimiter';
|
|
10
|
+
export { Cache, cache, globalCache } from './cache';
|
|
11
|
+
export { ProxyRotator, proxyRotator } from './proxy';
|
|
12
|
+
export { SchemaValidator, CommonSchemas } from './schemaValidator';
|
|
13
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/utils/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,eAAe,CAAC;AAC9B,cAAc,qBAAqB,CAAC;AACpC,cAAc,UAAU,CAAC;AACzB,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,KAAK,EAAE,MAAM,SAAS,CAAC;AAChC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AACzD,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AACpD,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAC"}
|