cogniscrape 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (159) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +308 -0
  3. package/dist/graphs/AbstractGraph.d.ts +27 -0
  4. package/dist/graphs/AbstractGraph.d.ts.map +1 -0
  5. package/dist/graphs/AbstractGraph.js +44 -0
  6. package/dist/graphs/AbstractGraph.js.map +1 -0
  7. package/dist/graphs/BaseGraph.d.ts +30 -0
  8. package/dist/graphs/BaseGraph.d.ts.map +1 -0
  9. package/dist/graphs/BaseGraph.js +62 -0
  10. package/dist/graphs/BaseGraph.js.map +1 -0
  11. package/dist/graphs/CSVScraperGraph.d.ts +16 -0
  12. package/dist/graphs/CSVScraperGraph.d.ts.map +1 -0
  13. package/dist/graphs/CSVScraperGraph.js +84 -0
  14. package/dist/graphs/CSVScraperGraph.js.map +1 -0
  15. package/dist/graphs/DepthSearchGraph.d.ts +14 -0
  16. package/dist/graphs/DepthSearchGraph.d.ts.map +1 -0
  17. package/dist/graphs/DepthSearchGraph.js +45 -0
  18. package/dist/graphs/DepthSearchGraph.js.map +1 -0
  19. package/dist/graphs/JSONScraperGraph.d.ts +18 -0
  20. package/dist/graphs/JSONScraperGraph.d.ts.map +1 -0
  21. package/dist/graphs/JSONScraperGraph.js +100 -0
  22. package/dist/graphs/JSONScraperGraph.js.map +1 -0
  23. package/dist/graphs/SearchGraph.d.ts +14 -0
  24. package/dist/graphs/SearchGraph.d.ts.map +1 -0
  25. package/dist/graphs/SearchGraph.js +42 -0
  26. package/dist/graphs/SearchGraph.js.map +1 -0
  27. package/dist/graphs/SmartScraperGraph.d.ts +16 -0
  28. package/dist/graphs/SmartScraperGraph.d.ts.map +1 -0
  29. package/dist/graphs/SmartScraperGraph.js +57 -0
  30. package/dist/graphs/SmartScraperGraph.js.map +1 -0
  31. package/dist/graphs/SmartScraperMultiGraph.d.ts +17 -0
  32. package/dist/graphs/SmartScraperMultiGraph.d.ts.map +1 -0
  33. package/dist/graphs/SmartScraperMultiGraph.js +71 -0
  34. package/dist/graphs/SmartScraperMultiGraph.js.map +1 -0
  35. package/dist/graphs/index.d.ts +12 -0
  36. package/dist/graphs/index.d.ts.map +1 -0
  37. package/dist/graphs/index.js +23 -0
  38. package/dist/graphs/index.js.map +1 -0
  39. package/dist/index.d.ts +10 -0
  40. package/dist/index.d.ts.map +1 -0
  41. package/dist/index.js +43 -0
  42. package/dist/index.js.map +1 -0
  43. package/dist/models/GeminiModel.d.ts +16 -0
  44. package/dist/models/GeminiModel.d.ts.map +1 -0
  45. package/dist/models/GeminiModel.js +127 -0
  46. package/dist/models/GeminiModel.js.map +1 -0
  47. package/dist/models/OllamaModel.d.ts +15 -0
  48. package/dist/models/OllamaModel.d.ts.map +1 -0
  49. package/dist/models/OllamaModel.js +134 -0
  50. package/dist/models/OllamaModel.js.map +1 -0
  51. package/dist/models/index.d.ts +8 -0
  52. package/dist/models/index.d.ts.map +1 -0
  53. package/dist/models/index.js +24 -0
  54. package/dist/models/index.js.map +1 -0
  55. package/dist/nodes/BaseNode.d.ts +37 -0
  56. package/dist/nodes/BaseNode.d.ts.map +1 -0
  57. package/dist/nodes/BaseNode.js +116 -0
  58. package/dist/nodes/BaseNode.js.map +1 -0
  59. package/dist/nodes/CSVExporterNode.d.ts +16 -0
  60. package/dist/nodes/CSVExporterNode.d.ts.map +1 -0
  61. package/dist/nodes/CSVExporterNode.js +85 -0
  62. package/dist/nodes/CSVExporterNode.js.map +1 -0
  63. package/dist/nodes/ConditionalNode.d.ts +16 -0
  64. package/dist/nodes/ConditionalNode.d.ts.map +1 -0
  65. package/dist/nodes/ConditionalNode.js +68 -0
  66. package/dist/nodes/ConditionalNode.js.map +1 -0
  67. package/dist/nodes/FetchNode.d.ts +15 -0
  68. package/dist/nodes/FetchNode.d.ts.map +1 -0
  69. package/dist/nodes/FetchNode.js +182 -0
  70. package/dist/nodes/FetchNode.js.map +1 -0
  71. package/dist/nodes/GenerateAnswerNode.d.ts +14 -0
  72. package/dist/nodes/GenerateAnswerNode.d.ts.map +1 -0
  73. package/dist/nodes/GenerateAnswerNode.js +86 -0
  74. package/dist/nodes/GenerateAnswerNode.js.map +1 -0
  75. package/dist/nodes/JSONExporterNode.d.ts +16 -0
  76. package/dist/nodes/JSONExporterNode.d.ts.map +1 -0
  77. package/dist/nodes/JSONExporterNode.js +42 -0
  78. package/dist/nodes/JSONExporterNode.js.map +1 -0
  79. package/dist/nodes/MergeNode.d.ts +10 -0
  80. package/dist/nodes/MergeNode.d.ts.map +1 -0
  81. package/dist/nodes/MergeNode.js +51 -0
  82. package/dist/nodes/MergeNode.js.map +1 -0
  83. package/dist/nodes/PDFScraperNode.d.ts +10 -0
  84. package/dist/nodes/PDFScraperNode.d.ts.map +1 -0
  85. package/dist/nodes/PDFScraperNode.js +80 -0
  86. package/dist/nodes/PDFScraperNode.js.map +1 -0
  87. package/dist/nodes/ParseNode.d.ts +12 -0
  88. package/dist/nodes/ParseNode.d.ts.map +1 -0
  89. package/dist/nodes/ParseNode.js +44 -0
  90. package/dist/nodes/ParseNode.js.map +1 -0
  91. package/dist/nodes/RAGNode.d.ts +13 -0
  92. package/dist/nodes/RAGNode.d.ts.map +1 -0
  93. package/dist/nodes/RAGNode.js +64 -0
  94. package/dist/nodes/RAGNode.js.map +1 -0
  95. package/dist/nodes/ReasoningNode.d.ts +10 -0
  96. package/dist/nodes/ReasoningNode.d.ts.map +1 -0
  97. package/dist/nodes/ReasoningNode.js +51 -0
  98. package/dist/nodes/ReasoningNode.js.map +1 -0
  99. package/dist/nodes/SearchNode.d.ts +13 -0
  100. package/dist/nodes/SearchNode.d.ts.map +1 -0
  101. package/dist/nodes/SearchNode.js +81 -0
  102. package/dist/nodes/SearchNode.js.map +1 -0
  103. package/dist/nodes/XMLScraperNode.d.ts +11 -0
  104. package/dist/nodes/XMLScraperNode.d.ts.map +1 -0
  105. package/dist/nodes/XMLScraperNode.js +99 -0
  106. package/dist/nodes/XMLScraperNode.js.map +1 -0
  107. package/dist/nodes/index.d.ts +17 -0
  108. package/dist/nodes/index.d.ts.map +1 -0
  109. package/dist/nodes/index.js +33 -0
  110. package/dist/nodes/index.js.map +1 -0
  111. package/dist/prompts/index.d.ts +12 -0
  112. package/dist/prompts/index.d.ts.map +1 -0
  113. package/dist/prompts/index.js +117 -0
  114. package/dist/prompts/index.js.map +1 -0
  115. package/dist/types.d.ts +106 -0
  116. package/dist/types.d.ts.map +1 -0
  117. package/dist/types.js +13 -0
  118. package/dist/types.js.map +1 -0
  119. package/dist/utils/cache.d.ts +28 -0
  120. package/dist/utils/cache.d.ts.map +1 -0
  121. package/dist/utils/cache.js +72 -0
  122. package/dist/utils/cache.js.map +1 -0
  123. package/dist/utils/chunking.d.ts +8 -0
  124. package/dist/utils/chunking.d.ts.map +1 -0
  125. package/dist/utils/chunking.js +51 -0
  126. package/dist/utils/chunking.js.map +1 -0
  127. package/dist/utils/cleanupHtml.d.ts +7 -0
  128. package/dist/utils/cleanupHtml.d.ts.map +1 -0
  129. package/dist/utils/cleanupHtml.js +81 -0
  130. package/dist/utils/cleanupHtml.js.map +1 -0
  131. package/dist/utils/convertToMarkdown.d.ts +6 -0
  132. package/dist/utils/convertToMarkdown.d.ts.map +1 -0
  133. package/dist/utils/convertToMarkdown.js +61 -0
  134. package/dist/utils/convertToMarkdown.js.map +1 -0
  135. package/dist/utils/index.d.ts +13 -0
  136. package/dist/utils/index.d.ts.map +1 -0
  137. package/dist/utils/index.js +40 -0
  138. package/dist/utils/index.js.map +1 -0
  139. package/dist/utils/logger.d.ts +14 -0
  140. package/dist/utils/logger.d.ts.map +1 -0
  141. package/dist/utils/logger.js +35 -0
  142. package/dist/utils/logger.js.map +1 -0
  143. package/dist/utils/proxy.d.ts +30 -0
  144. package/dist/utils/proxy.d.ts.map +1 -0
  145. package/dist/utils/proxy.js +62 -0
  146. package/dist/utils/proxy.js.map +1 -0
  147. package/dist/utils/rateLimiter.d.ts +24 -0
  148. package/dist/utils/rateLimiter.d.ts.map +1 -0
  149. package/dist/utils/rateLimiter.js +61 -0
  150. package/dist/utils/rateLimiter.js.map +1 -0
  151. package/dist/utils/retry.d.ts +17 -0
  152. package/dist/utils/retry.d.ts.map +1 -0
  153. package/dist/utils/retry.js +43 -0
  154. package/dist/utils/retry.js.map +1 -0
  155. package/dist/utils/schemaValidator.d.ts +69 -0
  156. package/dist/utils/schemaValidator.d.ts.map +1 -0
  157. package/dist/utils/schemaValidator.js +133 -0
  158. package/dist/utils/schemaValidator.js.map +1 -0
  159. package/package.json +64 -0
@@ -0,0 +1,117 @@
1
+ "use strict";
2
+ /**
3
+ * Prompt templates for different scraping scenarios
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.REASONING_TEMPLATE = exports.TEMPLATE_MERGE_MD = exports.TEMPLATE_CHUNKS_MD = exports.TEMPLATE_NO_CHUNKS_MD = exports.TEMPLATE_MERGE = exports.TEMPLATE_CHUNKS = exports.TEMPLATE_NO_CHUNKS = void 0;
7
+ exports.formatPrompt = formatPrompt;
8
+ exports.TEMPLATE_NO_CHUNKS = `
9
+ You are a web scraping expert. Your task is to extract specific information from the provided HTML content based on the user's request.
10
+
11
+ User Request: {user_prompt}
12
+
13
+ HTML Content:
14
+ {content}
15
+
16
+ Instructions:
17
+ 1. Carefully read the user's request
18
+ 2. Extract ONLY the requested information from the HTML content
19
+ 3. Return the data in a structured JSON format
20
+ 4. If information is not found, return null for that field
21
+ 5. Be precise and accurate
22
+
23
+ Respond with valid JSON only.
24
+ `;
25
+ exports.TEMPLATE_CHUNKS = `
26
+ You are a web scraping expert. Your task is to extract specific information from the provided content chunks based on the user's request.
27
+
28
+ User Request: {user_prompt}
29
+
30
+ Content Chunks:
31
+ {chunks}
32
+
33
+ Instructions:
34
+ 1. Analyze all provided chunks
35
+ 2. Extract ONLY the requested information
36
+ 3. Combine information from multiple chunks if needed
37
+ 4. Return the data in a structured JSON format
38
+ 5. If information is not found, return null for that field
39
+
40
+ Respond with valid JSON only.
41
+ `;
42
+ exports.TEMPLATE_MERGE = `
43
+ You are a data merging expert. You have received multiple JSON responses from different content chunks. Your task is to merge them into a single, coherent response.
44
+
45
+ User Request: {user_prompt}
46
+
47
+ Responses to Merge:
48
+ {responses}
49
+
50
+ Instructions:
51
+ 1. Merge all responses into a single JSON object
52
+ 2. Remove duplicates
53
+ 3. Ensure consistency across all fields
54
+ 4. Maintain the original structure
55
+ 5. If there are conflicts, prefer the most complete information
56
+
57
+ Respond with valid JSON only.
58
+ `;
59
+ exports.TEMPLATE_NO_CHUNKS_MD = `
60
+ You are a web scraping expert. Your task is to extract specific information from the provided Markdown content based on the user's request.
61
+
62
+ User Request: {user_prompt}
63
+
64
+ Markdown Content:
65
+ {content}
66
+
67
+ Instructions:
68
+ 1. Carefully read the user's request
69
+ 2. Extract ONLY the requested information from the Markdown content
70
+ 3. Return the data in a structured JSON format
71
+ 4. If information is not found, return null for that field
72
+ 5. Be precise and accurate
73
+
74
+ Respond with valid JSON only.
75
+ `;
76
+ exports.TEMPLATE_CHUNKS_MD = `
77
+ You are a web scraping expert. Your task is to extract specific information from the provided Markdown chunks based on the user's request.
78
+
79
+ User Request: {user_prompt}
80
+
81
+ Markdown Chunks:
82
+ {chunks}
83
+
84
+ Instructions:
85
+ 1. Analyze all provided chunks
86
+ 2. Extract ONLY the requested information
87
+ 3. Combine information from multiple chunks if needed
88
+ 4. Return the data in a structured JSON format
89
+ 5. If information is not found, return null for that field
90
+
91
+ Respond with valid JSON only.
92
+ `;
93
+ exports.TEMPLATE_MERGE_MD = exports.TEMPLATE_MERGE;
94
+ exports.REASONING_TEMPLATE = `
95
+ You are an AI reasoning expert. Analyze the following content and user request to determine the best approach for extracting the requested information.
96
+
97
+ User Request: {user_prompt}
98
+
99
+ Content:
100
+ {content}
101
+
102
+ Instructions:
103
+ 1. Identify what information the user is looking for
104
+ 2. Determine which parts of the content are relevant
105
+ 3. Suggest a strategy for extracting this information
106
+ 4. Consider edge cases and potential issues
107
+
108
+ Provide your reasoning and strategy.
109
+ `;
110
/**
 * Fill `{name}` placeholders in a prompt template.
 *
 * Every placeholder whose name is an own enumerable key of `variables` is
 * replaced by that key's value. All substitutions happen in a single pass
 * over the template, which means:
 *  - text inserted for one variable is never re-scanned, so a value that
 *    itself contains `{content}` stays exactly as written;
 *  - `$&`, `$1`, `$$` and similar sequences inside a value are inserted
 *    literally instead of being treated as replacement patterns;
 *  - keys are matched literally even when they contain regex metacharacters.
 * Placeholders that have no matching key are left untouched.
 *
 * @param {string} template - Template text containing `{name}` placeholders.
 * @param {Object<string, string>} variables - Placeholder names mapped to their replacement text.
 * @returns {string} The template with every known placeholder substituted.
 */
function formatPrompt(template, variables) {
    const keys = Object.keys(variables);
    if (keys.length === 0) {
        return template;
    }
    // Escape each key so it is matched as plain text inside the pattern.
    const escapedKeys = keys.map((key) => key.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));
    const placeholder = new RegExp(`\\{(${escapedKeys.join('|')})\\}`, 'g');
    // A replacer function bypasses the special `$` replacement patterns.
    return template.replace(placeholder, (_match, key) => String(variables[key]));
}
117
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/prompts/index.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AA+GH,oCAQC;AArHY,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;CAgBjC,CAAC;AAEW,QAAA,eAAe,GAAG;;;;;;;;;;;;;;;;CAgB9B,CAAC;AAEW,QAAA,cAAc,GAAG;;;;;;;;;;;;;;;;CAgB7B,CAAC;AAEW,QAAA,qBAAqB,GAAG;;;;;;;;;;;;;;;;CAgBpC,CAAC;AAEW,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;;CAgBjC,CAAC;AAEW,QAAA,iBAAiB,GAAG,sBAAc,CAAC;AAEnC,QAAA,kBAAkB,GAAG;;;;;;;;;;;;;;;CAejC,CAAC;AAEF,SAAgB,YAAY,CAAC,QAAgB,EAAE,SAAiC;IAC9E,IAAI,SAAS,GAAG,QAAQ,CAAC;IAEzB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;QACrD,SAAS,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,MAAM,CAAC,MAAM,GAAG,KAAK,EAAE,GAAG,CAAC,EAAE,KAAK,CAAC,CAAC;IACxE,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC"}
@@ -0,0 +1,106 @@
1
+ /**
2
+ * Core type definitions for CogniScrape TypeScript
3
+ */
4
+ export interface Document {
5
+ pageContent: string;
6
+ metadata: Record<string, any>;
7
+ }
8
+ export interface LLMConfig {
9
+ provider: 'ollama' | 'gemini';
10
+ model: string;
11
+ apiKey?: string;
12
+ baseUrl?: string;
13
+ temperature?: number;
14
+ maxTokens?: number;
15
+ }
16
+ export interface ProxyConfig {
17
+ enabled?: boolean;
18
+ proxies?: string[];
19
+ maxFailures?: number;
20
+ host?: string;
21
+ port?: number;
22
+ username?: string;
23
+ password?: string;
24
+ protocol?: 'http' | 'https' | 'socks4' | 'socks5';
25
+ }
26
+ export interface RetryConfig {
27
+ maxRetries?: number;
28
+ initialDelay?: number;
29
+ maxDelay?: number;
30
+ backoffMultiplier?: number;
31
+ }
32
+ export interface RateLimitConfig {
33
+ maxRequests?: number;
34
+ windowMs?: number;
35
+ minDelay?: number;
36
+ }
37
+ export interface CacheConfig {
38
+ enabled?: boolean;
39
+ ttl?: number;
40
+ maxSize?: number;
41
+ }
42
+ export interface ScraperConfig {
43
+ llm: LLMConfig;
44
+ verbose?: boolean;
45
+ headless?: boolean;
46
+ timeout?: number;
47
+ cut?: boolean;
48
+ force?: boolean;
49
+ loaderKwargs?: Record<string, any>;
50
+ additionalInfo?: string;
51
+ schema?: any;
52
+ reattempt?: boolean;
53
+ reasoning?: boolean;
54
+ htmlMode?: boolean;
55
+ proxy?: ProxyConfig | ProxyConfig[];
56
+ retry?: RetryConfig;
57
+ rateLimit?: RateLimitConfig;
58
+ cache?: CacheConfig;
59
+ maxDepth?: number;
60
+ searchEngine?: 'duckduckgo' | 'google' | 'bing';
61
+ }
62
+ export interface NodeConfig {
63
+ llmModel?: BaseLLM;
64
+ verbose?: boolean;
65
+ force?: boolean;
66
+ cut?: boolean;
67
+ timeout?: number;
68
+ loaderKwargs?: Record<string, any>;
69
+ headless?: boolean;
70
+ chunkSize?: number;
71
+ additionalInfo?: string;
72
+ schema?: any;
73
+ proxyConfig?: ProxyConfig;
74
+ retryConfig?: RetryConfig;
75
+ rateLimitConfig?: RateLimitConfig;
76
+ cacheConfig?: CacheConfig;
77
+ }
78
+ export interface GraphState {
79
+ [key: string]: any;
80
+ userPrompt?: string;
81
+ url?: string;
82
+ localDir?: string;
83
+ doc?: Document[];
84
+ parsedDoc?: Document[];
85
+ relevantChunks?: Document[];
86
+ answer?: any;
87
+ }
88
+ export declare abstract class BaseLLM {
89
+ protected config: LLMConfig;
90
+ constructor(config: LLMConfig);
91
+ abstract generate(prompt: string, systemPrompt?: string): Promise<string>;
92
+ abstract generateJson(prompt: string, systemPrompt?: string): Promise<any>;
93
+ }
94
+ export interface ChatMessage {
95
+ role: 'user' | 'assistant' | 'system';
96
+ content: string;
97
+ }
98
+ export interface LLMResponse {
99
+ content: string;
100
+ usage?: {
101
+ promptTokens: number;
102
+ completionTokens: number;
103
+ totalTokens: number;
104
+ };
105
+ }
106
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,QAAQ;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC/B;AAED,MAAM,WAAW,SAAS;IACxB,QAAQ,EAAE,QAAQ,GAAG,QAAQ,CAAC;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,GAAG,OAAO,GAAG,QAAQ,GAAG,QAAQ,CAAC;CACnD;AAED,MAAM,WAAW,WAAW;IAC1B,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,iBAAiB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,MAAM,WAAW,eAAe;IAC9B,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,aAAa;IAC5B,GAAG,EAAE,SAAS,CAAC;IACf,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,GAAG,CAAC,EAAE,OAAO,CAAC;IACd,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IACnC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,MAAM,CAAC,EAAE,GAAG,CAAC;IACb,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,QAAQ,CAAC,EAAE,OAAO,CAAC;IAEnB,KAAK,CAAC,EAAE,WAAW,GAAG,WAAW,EAAE,CAAC;IACpC,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,SAAS,CAAC,EAAE,eAAe,CAAC;IAC5B,KAAK,CAAC,EAAE,WAAW,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,YAAY,GAAG,QAAQ,GAAG,MAAM,CAAC;CACjD;AAED,MAAM,WAAW,UAAU;IACzB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,GAAG,CAAC,EAAE,OAAO,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,YAAY,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IACnC,QAA
Q,CAAC,EAAE,OAAO,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,MAAM,CAAC,EAAE,GAAG,CAAC;IAEb,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,eAAe,CAAC,EAAE,eAAe,CAAC;IAClC,WAAW,CAAC,EAAE,WAAW,CAAC;CAC3B;AAED,MAAM,WAAW,UAAU;IACzB,CAAC,GAAG,EAAE,MAAM,GAAG,GAAG,CAAC;IACnB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,GAAG,CAAC,EAAE,QAAQ,EAAE,CAAC;IACjB,SAAS,CAAC,EAAE,QAAQ,EAAE,CAAC;IACvB,cAAc,CAAC,EAAE,QAAQ,EAAE,CAAC;IAC5B,MAAM,CAAC,EAAE,GAAG,CAAC;CACd;AAED,8BAAsB,OAAO;IAC3B,SAAS,CAAC,MAAM,EAAE,SAAS,CAAC;gBAEhB,MAAM,EAAE,SAAS;IAI7B,QAAQ,CAAC,QAAQ,CAAC,MAAM,EAAE,MAAM,EAAE,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;IACzE,QAAQ,CAAC,YAAY,CAAC,MAAM,EAAE,MAAM,EAAE,YAAY,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC;CAC3E;AAED,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,GAAG,WAAW,GAAG,QAAQ,CAAC;IACtC,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE;QACN,YAAY,EAAE,MAAM,CAAC;QACrB,gBAAgB,EAAE,MAAM,CAAC;QACzB,WAAW,EAAE,MAAM,CAAC;KACrB,CAAC;CACH"}
package/dist/types.js ADDED
@@ -0,0 +1,13 @@
1
+ "use strict";
2
+ /**
3
+ * Core type definitions for CogniScrape TypeScript
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.BaseLLM = void 0;
7
/**
 * Base class for LLM model wrappers.
 *
 * At runtime it only remembers the configuration it was constructed with.
 * The accompanying declaration file marks the class abstract with
 * `generate` and `generateJson` methods; no implementation of either exists
 * here, so subclasses have to provide them.
 */
class BaseLLM {
    /**
     * @param {object} config - LLM configuration, stored unchanged on `this.config`.
     */
    constructor(config) {
        this.config = config;
    }
}
12
+ exports.BaseLLM = BaseLLM;
13
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAiGH,MAAsB,OAAO;IAG3B,YAAY,MAAiB;QAC3B,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC;IACvB,CAAC;CAIF;AATD,0BASC"}
@@ -0,0 +1,28 @@
1
+ /**
2
+ * Simple in-memory cache with TTL
3
+ */
4
+ export interface CacheOptions {
5
+ ttl?: number;
6
+ maxSize?: number;
7
+ }
8
+ export declare class Cache<T = any> {
9
+ private cache;
10
+ private ttl;
11
+ private maxSize;
12
+ constructor(options?: CacheOptions);
13
+ set(key: string, value: T, customTtl?: number): void;
14
+ get<K = T>(key: string): K | undefined;
15
+ has(key: string): boolean;
16
+ delete(key: string): boolean;
17
+ clear(): void;
18
+ size(): number;
19
+ private cleanExpired;
20
+ getStats(): {
21
+ size: number;
22
+ maxSize: number;
23
+ ttl: number;
24
+ };
25
+ }
26
+ export declare const cache: Cache<any>;
27
+ export declare const globalCache: Cache<any>;
28
+ //# sourceMappingURL=cache.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cache.d.ts","sourceRoot":"","sources":["../../src/utils/cache.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,YAAY;IAC3B,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAOD,qBAAa,KAAK,CAAC,CAAC,GAAG,GAAG;IACxB,OAAO,CAAC,KAAK,CAAoC;IACjD,OAAO,CAAC,GAAG,CAAS;IACpB,OAAO,CAAC,OAAO,CAAS;gBAEZ,OAAO,GAAE,YAAiB;IAKtC,GAAG,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,CAAC,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,IAAI;IAcpD,GAAG,CAAC,CAAC,GAAG,CAAC,EAAE,GAAG,EAAE,MAAM,GAAG,CAAC,GAAG,SAAS;IAgBtC,GAAG,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAIzB,MAAM,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO;IAI5B,KAAK,IAAI,IAAI;IAIb,IAAI,IAAI,MAAM;IAMd,OAAO,CAAC,YAAY;IASpB,QAAQ,IAAI;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE;CAQ3D;AAGD,eAAO,MAAM,KAAK,YAAc,CAAC;AACjC,eAAO,MAAM,WAAW,YAAQ,CAAC"}
@@ -0,0 +1,72 @@
1
+ "use strict";
2
+ /**
3
+ * Simple in-memory cache with TTL
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.globalCache = exports.cache = exports.Cache = void 0;
7
/**
 * Simple in-memory key/value cache with per-entry expiry and a size bound.
 *
 * Entries are kept in a Map; its insertion order doubles as the eviction
 * order, so the first key in the Map is always the oldest write.
 */
class Cache {
    /**
     * @param {{ttl?: number, maxSize?: number}} [options]
     *   `ttl` is the default lifetime of an entry in milliseconds and
     *   `maxSize` the maximum number of entries. A missing or zero value
     *   selects the default (5 minutes / 1000 entries).
     */
    constructor(options = {}) {
        this.cache = new Map();
        this.ttl = options.ttl || 5 * 60 * 1000; // Default 5 minutes
        this.maxSize = options.maxSize || 1000;
    }
    /**
     * Store `value` under `key`.
     *
     * Overwriting an existing key never evicts another entry and moves the
     * key to the newest position. Inserting a new key into a full cache
     * evicts the oldest entry first.
     *
     * @param {string} key
     * @param {*} value
     * @param {number} [customTtl] - Lifetime in milliseconds for this entry;
     *   a missing or zero value falls back to the cache-wide ttl.
     */
    set(key, value, customTtl) {
        if (this.cache.has(key)) {
            // Delete before re-inserting so the refreshed entry is no longer
            // the first candidate for eviction.
            this.cache.delete(key);
        }
        else if (this.cache.size >= this.maxSize) {
            // Use the iterator's `done` flag rather than the key's truthiness:
            // an empty-string key is a valid oldest entry.
            const oldest = this.cache.keys().next();
            if (!oldest.done) {
                this.cache.delete(oldest.value);
            }
        }
        const expiry = Date.now() + (customTtl || this.ttl);
        this.cache.set(key, { value, expiry });
    }
    /**
     * @param {string} key
     * @returns {*} The stored value, or `undefined` when the key is absent or expired.
     */
    get(key) {
        const entry = this.findLiveEntry(key);
        return entry === undefined ? undefined : entry.value;
    }
    /**
     * @param {string} key
     * @returns {boolean} True when an unexpired entry exists, even if its value is `undefined`.
     */
    has(key) {
        return this.findLiveEntry(key) !== undefined;
    }
    /**
     * @param {string} key
     * @returns {boolean} True when an entry was removed.
     */
    delete(key) {
        return this.cache.delete(key);
    }
    /** Remove every entry. */
    clear() {
        this.cache.clear();
    }
    /**
     * @returns {number} Number of unexpired entries (expired ones are purged first).
     */
    size() {
        this.cleanExpired();
        return this.cache.size;
    }
    /**
     * Look up the raw entry for `key`, dropping it when it has expired.
     *
     * @param {string} key
     * @returns {{value: *, expiry: number}|undefined}
     */
    findLiveEntry(key) {
        const entry = this.cache.get(key);
        if (!entry) {
            return undefined;
        }
        if (Date.now() > entry.expiry) {
            this.cache.delete(key);
            return undefined;
        }
        return entry;
    }
    /** Delete every entry whose expiry time has passed. */
    cleanExpired() {
        const now = Date.now();
        for (const [key, entry] of this.cache.entries()) {
            if (now > entry.expiry) {
                this.cache.delete(key);
            }
        }
    }
    /**
     * @returns {{size: number, maxSize: number, ttl: number}} Current entry
     *   count (after purging expired entries) and the configured limits.
     */
    getStats() {
        this.cleanExpired();
        return {
            size: this.cache.size,
            maxSize: this.maxSize,
            ttl: this.ttl,
        };
    }
}
68
+ exports.Cache = Cache;
69
+ // Global cache instance
70
+ exports.cache = new Cache();
71
+ exports.globalCache = exports.cache;
72
+ //# sourceMappingURL=cache.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cache.js","sourceRoot":"","sources":["../../src/utils/cache.ts"],"names":[],"mappings":";AAAA;;GAEG;;;AAYH,MAAa,KAAK;IAKhB,YAAY,UAAwB,EAAE;QAJ9B,UAAK,GAAG,IAAI,GAAG,EAAyB,CAAC;QAK/C,IAAI,CAAC,GAAG,GAAG,OAAO,CAAC,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,IAAI,CAAC,CAAC,oBAAoB;QAC7D,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC,OAAO,IAAI,IAAI,CAAC;IACzC,CAAC;IAED,GAAG,CAAC,GAAW,EAAE,KAAQ,EAAE,SAAkB;QAC3C,mBAAmB;QACnB,IAAI,IAAI,CAAC,KAAK,CAAC,IAAI,IAAI,IAAI,CAAC,OAAO,EAAE,CAAC;YACpC,qCAAqC;YACrC,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,IAAI,EAAE,CAAC,KAAK,CAAC;YAChD,IAAI,QAAQ,EAAE,CAAC;gBACb,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;YAC9B,CAAC;QACH,CAAC;QAED,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,CAAC,SAAS,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC;QACpD,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,KAAK,EAAE,MAAM,EAAE,CAAC,CAAC;IACzC,CAAC;IAED,GAAG,CAAQ,GAAW;QACpB,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;QAElC,IAAI,CAAC,KAAK,EAAE,CAAC;YACX,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,mBAAmB;QACnB,IAAI,IAAI,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;YAC9B,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACvB,OAAO,SAAS,CAAC;QACnB,CAAC;QAED,OAAO,KAAK,CAAC,KAAqB,CAAC;IACrC,CAAC;IAED,GAAG,CAAC,GAAW;QACb,OAAO,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,SAAS,CAAC;IACrC,CAAC;IAED,MAAM,CAAC,GAAW;QAChB,OAAO,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;IAChC,CAAC;IAED,KAAK;QACH,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;IACrB,CAAC;IAED,IAAI;QACF,wBAAwB;QACxB,IAAI,CAAC,YAAY,EAAE,CAAC;QACpB,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC;IACzB,CAAC;IAEO,YAAY;QAClB,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QACvB,KAAK,MAAM,CAAC,GAAG,EAAE,KAAK,CAAC,IAAI,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,EAAE,CAAC;YAChD,IAAI,GAAG,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC;gBACvB,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;YACzB,CAAC;QACH,CAAC;IACH,CAAC;IAED,QAAQ;QACN,IAAI,CAAC,YAAY,EAAE,CAAC;QACpB,OAAO;YACL,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI;YACrB,OAAO,EAAE,IAAI,CAAC,OAAO;YACrB,GAAG,EAAE,IAAI,CAAC,GAAG;SACd,CAAC;IACJ,CAAC;CACF;AA3ED,sBA2EC;AAED,wBAAw
B;AACX,QAAA,KAAK,GAAG,IAAI,KAAK,EAAE,CAAC;AACpB,QAAA,WAAW,GAAG,aAAK,CAAC"}
@@ -0,0 +1,8 @@
1
+ /**
2
+ * Text chunking utilities
3
+ */
4
+ import { Document } from '../types';
5
+ export declare function chunkText(text: string, chunkSize?: number, overlap?: number): string[];
6
+ export declare function chunkDocuments(docs: Document[], chunkSize?: number): Document[];
7
+ export declare function mergeDocuments(docs: Document[]): Document;
8
+ //# sourceMappingURL=chunking.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunking.d.ts","sourceRoot":"","sources":["../../src/utils/chunking.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAEpC,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,GAAE,MAAa,EAAE,OAAO,GAAE,MAAY,GAAG,MAAM,EAAE,CAoBjG;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,QAAQ,EAAE,EAAE,SAAS,GAAE,MAAa,GAAG,QAAQ,EAAE,CAmBrF;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,QAAQ,EAAE,GAAG,QAAQ,CAQzD"}
@@ -0,0 +1,51 @@
1
+ "use strict";
2
+ /**
3
+ * Text chunking utilities
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.chunkText = chunkText;
7
+ exports.chunkDocuments = chunkDocuments;
8
+ exports.mergeDocuments = mergeDocuments;
9
/**
 * Split `text` into chunks of at most `chunkSize` characters, where each
 * chunk starts `chunkSize - overlap` characters after the previous one.
 *
 * Chunking stops as soon as a chunk reaches the end of the text, so no
 * trailing chunk is produced that lies entirely inside the previous one.
 *
 * @param {string} text - Text to split; empty or missing text yields `[]`.
 * @param {number} [chunkSize=2000] - Maximum chunk length in characters.
 * @param {number} [overlap=200] - Characters shared between consecutive chunks.
 * @returns {string[]} The chunks in document order.
 * @throws {RangeError} If `chunkSize` is not positive or `overlap` is not
 *   smaller than `chunkSize` (the start index could never advance).
 */
function chunkText(text, chunkSize = 2000, overlap = 200) {
    if (!text || text.length === 0) {
        return [];
    }
    if (!(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
    }
    const stride = chunkSize - overlap;
    if (!(stride > 0)) {
        throw new RangeError(`overlap (${overlap}) must be smaller than chunkSize (${chunkSize})`);
    }
    const chunks = [];
    for (let start = 0; start < text.length; start += stride) {
        const end = Math.min(start + chunkSize, text.length);
        chunks.push(text.slice(start, end));
        // The tail is already covered; another step would only repeat it.
        if (end === text.length) {
            break;
        }
    }
    return chunks;
}
26
/**
 * Split every document into chunk documents of at most `chunkSize` characters.
 *
 * Each chunk document copies the source document's metadata and adds
 * `chunkIndex` (position within its source document) and `totalChunks`.
 *
 * The overlap handed to `chunkText` is 200 characters whenever `chunkSize`
 * is larger than that, which matches relying on `chunkText`'s default. For
 * smaller chunk sizes the overlap shrinks to a tenth of `chunkSize`, because
 * an overlap that is not smaller than the chunk size leaves `chunkText` with
 * no forward progress.
 *
 * @param {Array<{pageContent: string, metadata: Object}>} docs - Documents to split.
 * @param {number} [chunkSize=2000] - Maximum chunk length in characters.
 * @returns {Array<{pageContent: string, metadata: Object}>} Chunk documents in input order.
 * @throws {RangeError} If `chunkSize` is not a positive number.
 */
function chunkDocuments(docs, chunkSize = 2000) {
    if (!(chunkSize > 0)) {
        throw new RangeError(`chunkSize must be a positive number, got ${chunkSize}`);
    }
    const defaultOverlap = 200;
    const overlap = chunkSize > defaultOverlap ? defaultOverlap : Math.floor(chunkSize / 10);
    const chunkedDocs = [];
    for (const doc of docs) {
        const chunks = chunkText(doc.pageContent, chunkSize, overlap);
        chunks.forEach((pageContent, chunkIndex) => {
            chunkedDocs.push({
                pageContent,
                metadata: {
                    ...doc.metadata,
                    chunkIndex,
                    totalChunks: chunks.length,
                },
            });
        });
    }
    return chunkedDocs;
}
43
/**
 * Collapse several documents into one.
 *
 * The page contents are joined with a blank line between them, and the
 * metadata objects are shallow-merged in input order, so for a key present
 * in several documents the last document's value is kept.
 *
 * @param {Array<{pageContent: string, metadata: Object}>} docs - Documents to merge.
 * @returns {{pageContent: string, metadata: Object}} The combined document.
 */
function mergeDocuments(docs) {
    const pageContent = docs
        .map(({ pageContent: text }) => text)
        .join('\n\n');
    const metadata = docs.reduce(
        (merged, { metadata: extra }) => ({ ...merged, ...extra }),
        {}
    );
    return { pageContent, metadata };
}
51
+ //# sourceMappingURL=chunking.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunking.js","sourceRoot":"","sources":["../../src/utils/chunking.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAIH,8BAoBC;AAED,wCAmBC;AAED,wCAQC;AAnDD,SAAgB,SAAS,CAAC,IAAY,EAAE,YAAoB,IAAI,EAAE,UAAkB,GAAG;IACrF,IAAI,CAAC,IAAI,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,OAAO,EAAE,CAAC;IACZ,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,CAAC;QACrC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QAEnB,sCAAsC;QACtC,KAAK,IAAI,SAAS,GAAG,OAAO,CAAC;QAE7B,IAAI,KAAK,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM;IAClC,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAgB,cAAc,CAAC,IAAgB,EAAE,YAAoB,IAAI;IACvE,MAAM,WAAW,GAAe,EAAE,CAAC;IAEnC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,MAAM,GAAG,SAAS,CAAC,GAAG,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC;QAErD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,WAAW,CAAC,IAAI,CAAC;gBACf,WAAW,EAAE,MAAM,CAAC,CAAC,CAAC;gBACtB,QAAQ,EAAE;oBACR,GAAG,GAAG,CAAC,QAAQ;oBACf,UAAU,EAAE,CAAC;oBACb,WAAW,EAAE,MAAM,CAAC,MAAM;iBAC3B;aACF,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO,WAAW,CAAC;AACrB,CAAC;AAED,SAAgB,cAAc,CAAC,IAAgB;IAC7C,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IACpE,MAAM,cAAc,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,GAAG,EAAE,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;IAEpF,OAAO;QACL,WAAW,EAAE,aAAa;QAC1B,QAAQ,EAAE,cAAc;KACzB,CAAC;AACJ,CAAC"}
@@ -0,0 +1,7 @@
1
+ /**
2
+ * HTML cleanup and minification utilities
3
+ */
4
+ export declare function cleanupHtml(html: string, cut?: boolean): string;
5
+ export declare function removeUnwantedTags(html: string): string;
6
+ export declare function extractMainContent(html: string): string;
7
+ //# sourceMappingURL=cleanupHtml.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cleanupHtml.d.ts","sourceRoot":"","sources":["../../src/utils/cleanupHtml.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,GAAG,GAAE,OAAc,GAAG,MAAM,CAqCrE;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAuBvD;AAED,wBAAgB,kBAAkB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAiBvD"}
@@ -0,0 +1,81 @@
1
+ "use strict";
2
+ /**
3
+ * HTML cleanup and minification utilities
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.cleanupHtml = cleanupHtml;
7
+ exports.removeUnwantedTags = removeUnwantedTags;
8
+ exports.extractMainContent = extractMainContent;
9
+ const html_minifier_1 = require("html-minifier");
10
/**
 * Strip scripts, styles and comments from an HTML string and, when `cut` is
 * true, additionally minify it, collapse whitespace and drop empty elements.
 *
 * If the minifier throws (for instance on markup it cannot parse), the
 * already-stripped markup is kept and the remaining regex-based cleanup still
 * runs, so a minifier failure no longer brings back the original scripts and
 * styles. Any other failure is logged and the input is returned unchanged.
 *
 * @param {string} html - Raw HTML; a falsy value yields `''`.
 * @param {boolean} [cut=true] - Whether to minify and compact the result.
 * @returns {string} The cleaned HTML.
 */
function cleanupHtml(html, cut = true) {
    if (!html)
        return '';
    try {
        // Remove script tags, style tags and comments
        let cleaned = html
            .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
            .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '')
            .replace(/<!--[\s\S]*?-->/g, '');
        if (!cut) {
            return cleaned;
        }
        try {
            // Minify HTML
            cleaned = (0, html_minifier_1.minify)(cleaned, {
                collapseWhitespace: true,
                removeComments: true,
                removeEmptyAttributes: true,
                removeRedundantAttributes: true,
                removeScriptTypeAttributes: true,
                removeStyleLinkTypeAttributes: true,
                useShortDoctype: true,
                minifyCSS: true,
                minifyJS: true,
            });
        }
        catch (error) {
            // Best effort: continue with the stripped markup instead of
            // discarding the work done so far.
            console.error('Error minifying HTML, continuing without minification:', error);
        }
        // Remove excessive whitespace
        cleaned = cleaned.replace(/\s+/g, ' ').trim();
        // Remove empty tags
        cleaned = cleaned.replace(/<(\w+)(\s[^>]*)?\s*>\s*<\/\1>/g, '');
        return cleaned;
    }
    catch (error) {
        console.error('Error cleaning HTML:', error);
        return html;
    }
}
44
/**
 * Remove a fixed set of elements, together with everything between their
 * opening and closing tags, from an HTML string.
 *
 * The tags are handled one after another in the listed order; matching is
 * case-insensitive and only covers elements written with an explicit
 * closing tag.
 *
 * @param {string} html - HTML to filter.
 * @returns {string} The HTML without the listed elements.
 */
function removeUnwantedTags(html) {
    const tagNames = [
        'script', 'style', 'noscript', 'iframe',
        'svg', 'path', 'symbol', 'use',
        'header', 'footer', 'nav',
    ];
    return tagNames.reduce((markup, name) => {
        const pattern = new RegExp(`<${name}\\b[^<]*(?:(?!<\\/${name}>)<[^<]*)*<\\/${name}>`, 'gi');
        return markup.replace(pattern, '');
    }, html);
}
65
/**
 * Find the index of the closing tag that balances an element whose opening
 * tag ends at `fromIndex`, counting nested elements of the same name.
 *
 * @param {string} html - Markup to scan.
 * @param {string} tag - Element name to balance (a fixed literal, not user input).
 * @param {number} fromIndex - Index just past the opening tag.
 * @returns {number} Index of the balancing closing tag. When the markup is
 *   unbalanced this is the last closing tag seen, or -1 if there is none.
 */
function findMatchingCloseIndex(html, tag, fromIndex) {
    const tagPattern = new RegExp(`<(\\/?)${tag}\\b[^>]*>`, 'gi');
    tagPattern.lastIndex = fromIndex;
    let depth = 1;
    let lastClose = -1;
    let found = tagPattern.exec(html);
    while (found !== null) {
        if (found[1] === '/') {
            lastClose = found.index;
            depth -= 1;
            if (depth === 0) {
                return found.index;
            }
        }
        else {
            depth += 1;
        }
        found = tagPattern.exec(html);
    }
    return lastClose;
}
/**
 * Return the inner HTML of the first main-content container found.
 *
 * Containers are tried in this order: `<main>`, `<article>`, a `<div>` whose
 * double-quoted `class` contains "content", a `<div>` whose double-quoted
 * `id` contains "content". Nested elements with the same tag name are
 * balanced, so the result extends to the container's own closing tag rather
 * than stopping at the first nested one. If no container with a closing tag
 * is found, the input is returned unchanged.
 *
 * @param {string} html - Full HTML document or fragment.
 * @returns {string} Inner HTML of the chosen container, or `html` itself.
 */
function extractMainContent(html) {
    // Try to find main content areas
    const candidates = [
        { tag: 'main', opener: /<main[^>]*>/i },
        { tag: 'article', opener: /<article[^>]*>/i },
        { tag: 'div', opener: /<div[^>]*class="[^"]*content[^"]*"[^>]*>/i },
        { tag: 'div', opener: /<div[^>]*id="[^"]*content[^"]*"[^>]*>/i },
    ];
    for (const { tag, opener } of candidates) {
        const open = opener.exec(html);
        if (!open) {
            continue;
        }
        const bodyStart = open.index + open[0].length;
        const bodyEnd = findMatchingCloseIndex(html, tag, bodyStart);
        if (bodyEnd !== -1) {
            return html.slice(bodyStart, bodyEnd);
        }
    }
    return html;
}
81
+ //# sourceMappingURL=cleanupHtml.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cleanupHtml.js","sourceRoot":"","sources":["../../src/utils/cleanupHtml.ts"],"names":[],"mappings":";AAAA;;GAEG;;AAIH,kCAqCC;AAED,gDAuBC;AAED,gDAiBC;AAnFD,iDAAuC;AAEvC,SAAgB,WAAW,CAAC,IAAY,EAAE,MAAe,IAAI;IAC3D,IAAI,CAAC,IAAI;QAAE,OAAO,EAAE,CAAC;IAErB,IAAI,CAAC;QACH,+BAA+B;QAC/B,IAAI,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC,qDAAqD,EAAE,EAAE,CAAC,CAAC;QACtF,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC,CAAC;QAElF,kBAAkB;QAClB,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,kBAAkB,EAAE,EAAE,CAAC,CAAC;QAElD,IAAI,GAAG,EAAE,CAAC;YACR,cAAc;YACd,OAAO,GAAG,IAAA,sBAAM,EAAC,OAAO,EAAE;gBACxB,kBAAkB,EAAE,IAAI;gBACxB,cAAc,EAAE,IAAI;gBACpB,qBAAqB,EAAE,IAAI;gBAC3B,yBAAyB,EAAE,IAAI;gBAC/B,0BAA0B,EAAE,IAAI;gBAChC,6BAA6B,EAAE,IAAI;gBACnC,eAAe,EAAE,IAAI;gBACrB,SAAS,EAAE,IAAI;gBACf,QAAQ,EAAE,IAAI;aACf,CAAC,CAAC;YAEH,8BAA8B;YAC9B,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC;YAE9C,oBAAoB;YACpB,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,gCAAgC,EAAE,EAAE,CAAC,CAAC;QAClE,CAAC;QAED,OAAO,OAAO,CAAC;IACjB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,sBAAsB,EAAE,KAAK,CAAC,CAAC;QAC7C,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAgB,kBAAkB,CAAC,IAAY;IAC7C,MAAM,YAAY,GAAG;QACnB,QAAQ;QACR,OAAO;QACP,UAAU;QACV,QAAQ;QACR,KAAK;QACL,MAAM;QACN,QAAQ;QACR,KAAK;QACL,QAAQ;QACR,QAAQ;QACR,KAAK;KACN,CAAC;IAEF,IAAI,OAAO,GAAG,IAAI,CAAC;IAEnB,KAAK,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QAC/B,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,IAAI,GAAG,qBAAqB,GAAG,iBAAiB,GAAG,GAAG,EAAE,IAAI,CAAC,CAAC;QACvF,OAAO,GAAG,OAAO,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACvC,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,SAAgB,kBAAkB,CAAC,IAAY;IAC7C,iCAAiC;IACjC,MAAM,YAAY,GAAG;QACnB,gCAAgC;QAChC,sCAAsC;QACtC,4DAA4D;QAC5D,yDAAyD;KAC1D,CAAC;IAEF,KAAK,MAAM,OAAO,IAAI,YAAY,EAAE,CAAC;QACnC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;QAClC,IAAI,KAAK,EAAE,CAAC;YACV,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC;QAClB,CAAC;IACH,CAAC;IAED,OAAO,IAAI,CAAC;AACd,CAAC"}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * HTML conversion helpers: convertToMarkdown() renders an HTML string as Markdown; extractTextFromHtml() strips tags and decodes a few common entities to produce plain text.
3
+ */
4
+ export declare function convertToMarkdown(html: string): string;
5
+ export declare function extractTextFromHtml(html: string): string;
6
+ //# sourceMappingURL=convertToMarkdown.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"convertToMarkdown.d.ts","sourceRoot":"","sources":["../../src/utils/convertToMarkdown.ts"],"names":[],"mappings":"AAAA;;GAEG;AAIH,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAqCtD;AAED,wBAAgB,mBAAmB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAcxD"}
@@ -0,0 +1,61 @@
1
+ "use strict";
2
+ /**
3
+ * Convert HTML to Markdown
4
+ */
5
+ var __importDefault = (this && this.__importDefault) || function (mod) {
6
+ return (mod && mod.__esModule) ? mod : { "default": mod };
7
+ };
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.convertToMarkdown = convertToMarkdown;
10
+ exports.extractTextFromHtml = extractTextFromHtml;
11
+ const turndown_1 = __importDefault(require("turndown"));
12
/**
 * Render an HTML string as Markdown using Turndown.
 *
 * The converter emits ATX headings, fenced code blocks, `*` emphasis, `**`
 * strong text and `-` bullets. Images and inline SVG are replaced with an
 * empty string. Anchors become `[content](href)`, or just their content when
 * the anchor has no `href`. Runs of three or more newlines in the result are
 * collapsed to a single blank line and the result is trimmed.
 *
 * @param {string} html - HTML markup to convert.
 * @returns {string} The Markdown text, or the original `html` unchanged when
 *   the conversion throws (the error is logged with console.error).
 */
function convertToMarkdown(html) {
    const converterOptions = {
        headingStyle: 'atx',
        codeBlockStyle: 'fenced',
        emDelimiter: '*',
        strongDelimiter: '**',
        bulletListMarker: '-',
    };
    const converter = new turndown_1.default(converterOptions);
    // Custom rules: drop media that carries no text, keep links inline.
    const dropNode = () => '';
    converter.addRule('removeImages', { filter: ['img'], replacement: dropNode });
    converter.addRule('removeSvg', { filter: ['svg'], replacement: dropNode });
    converter.addRule('preserveLinks', {
        filter: 'a',
        replacement(content, node) {
            const target = node.getAttribute('href');
            if (!target) {
                return content;
            }
            return `[${content}](${target})`;
        },
    });
    try {
        const rendered = converter.turndown(html);
        // Collapse excessive blank lines left behind by removed elements.
        const compacted = rendered.replace(/\n{3,}/g, '\n\n');
        return compacted.trim();
    }
    catch (error) {
        console.error('Error converting to markdown:', error);
        return html;
    }
}
46
/**
 * Replacement text for the handful of character references this module
 * decodes. Lookup keys are the full, case-sensitive entity strings.
 */
const HTML_ENTITY_TEXT = {
    '&nbsp;': ' ',
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
};
/**
 * Reduce an HTML string to plain text.
 *
 * Steps, in order: drop `<script>` and `<style>` elements together with their
 * content, replace every remaining tag with a space, decode a small set of
 * entities (`&nbsp; &amp; &lt; &gt; &quot; &#39;`), then collapse whitespace
 * runs to single spaces and trim.
 *
 * Entities are decoded in a single pass so that an escaped entity such as
 * `&amp;lt;` becomes the literal text `&lt;` rather than being decoded twice
 * into `<`. Closing tags with trailing whitespace or attributes
 * (`</script >`) are recognised, so script/style source never leaks into the
 * extracted text.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} The visible text; an empty string when `html` is empty,
 *   null or undefined.
 */
function extractTextFromHtml(html) {
    if (!html) {
        return '';
    }
    return html
        .replace(/<script(?=[\s/>])[^<]*(?:(?!<\/script(?:\s[^>]*)?>)<[^<]*)*<\/script(?:\s[^>]*)?>/gi, '')
        .replace(/<style(?=[\s/>])[^<]*(?:(?!<\/style(?:\s[^>]*)?>)<[^<]*)*<\/style(?:\s[^>]*)?>/gi, '')
        .replace(/<[^>]+>/g, ' ')
        // Single pass: text produced by one replacement is never rescanned.
        .replace(/&(?:nbsp|amp|lt|gt|quot|#39);/g, (entity) => HTML_ENTITY_TEXT[entity])
        .replace(/\s+/g, ' ')
        .trim();
}
61
+ //# sourceMappingURL=convertToMarkdown.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"convertToMarkdown.js","sourceRoot":"","sources":["../../src/utils/convertToMarkdown.ts"],"names":[],"mappings":";AAAA;;GAEG;;;;;AAIH,8CAqCC;AAED,kDAcC;AAvDD,wDAAuC;AAEvC,SAAgB,iBAAiB,CAAC,IAAY;IAC5C,MAAM,eAAe,GAAG,IAAI,kBAAe,CAAC;QAC1C,YAAY,EAAE,KAAK;QACnB,cAAc,EAAE,QAAQ;QACxB,WAAW,EAAE,GAAG;QAChB,eAAe,EAAE,IAAI;QACrB,gBAAgB,EAAE,GAAG;KACtB,CAAC,CAAC;IAEH,mBAAmB;IACnB,eAAe,CAAC,OAAO,CAAC,cAAc,EAAE;QACtC,MAAM,EAAE,CAAC,KAAK,CAAC;QACf,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;KACtB,CAAC,CAAC;IAEH,eAAe,CAAC,OAAO,CAAC,WAAW,EAAE;QACnC,MAAM,EAAE,CAAC,KAAK,CAAC;QACf,WAAW,EAAE,GAAG,EAAE,CAAC,EAAE;KACtB,CAAC,CAAC;IAEH,eAAe,CAAC,OAAO,CAAC,eAAe,EAAE;QACvC,MAAM,EAAE,GAAG;QACX,WAAW,EAAE,CAAC,OAAO,EAAE,IAAS,EAAE,EAAE;YAClC,MAAM,IAAI,GAAG,IAAI,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC;YACvC,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,OAAO,KAAK,IAAI,GAAG,CAAC,CAAC,CAAC,OAAO,CAAC;QAClD,CAAC;KACF,CAAC,CAAC;IAEH,IAAI,CAAC;QACH,MAAM,QAAQ,GAAG,eAAe,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAEhD,8BAA8B;QAC9B,OAAO,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC;IACpD,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,+BAA+B,EAAE,KAAK,CAAC,CAAC;QACtD,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAgB,mBAAmB,CAAC,IAAY;IAC9C,0CAA0C;IAC1C,OAAO,IAAI;SACR,OAAO,CAAC,qDAAqD,EAAE,EAAE,CAAC;SAClE,OAAO,CAAC,kDAAkD,EAAE,EAAE,CAAC;SAC/D,OAAO,CAAC,UAAU,EAAE,GAAG,CAAC;SACxB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,OAAO,EAAE,GAAG,CAAC;SACrB,OAAO,CAAC,SAAS,EAAE,GAAG,CAAC;SACvB,OAAO,CAAC,QAAQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC;SACpB,IAAI,EAAE,CAAC;AACZ,CAAC"}
@@ -0,0 +1,13 @@
1
+ /**
2
+ * Utility functions index
3
+ */
4
+ export * from './cleanupHtml';
5
+ export * from './convertToMarkdown';
6
+ export * from './logger';
7
+ export * from './chunking';
8
+ export { retry } from './retry';
9
+ export { RateLimiter, rateLimiter } from './rateLimiter';
10
+ export { Cache, cache, globalCache } from './cache';
11
+ export { ProxyRotator, proxyRotator } from './proxy';
12
+ export { SchemaValidator, CommonSchemas } from './schemaValidator';
13
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/utils/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,cAAc,eAAe,CAAC;AAC9B,cAAc,qBAAqB,CAAC;AACpC,cAAc,UAAU,CAAC;AACzB,cAAc,YAAY,CAAC;AAC3B,OAAO,EAAE,KAAK,EAAE,MAAM,SAAS,CAAC;AAChC,OAAO,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,eAAe,CAAC;AACzD,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AACpD,OAAO,EAAE,YAAY,EAAE,YAAY,EAAE,MAAM,SAAS,CAAC;AACrD,OAAO,EAAE,eAAe,EAAE,aAAa,EAAE,MAAM,mBAAmB,CAAC"}