webpeel 0.17.0 → 0.17.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/integrations/index.d.ts +3 -0
- package/dist/integrations/index.d.ts.map +1 -0
- package/dist/integrations/index.js +3 -0
- package/dist/integrations/index.js.map +1 -0
- package/dist/integrations/langchain.d.ts +65 -0
- package/dist/integrations/langchain.d.ts.map +1 -0
- package/dist/integrations/langchain.js +116 -0
- package/dist/integrations/langchain.js.map +1 -0
- package/dist/integrations/llamaindex.d.ts +51 -0
- package/dist/integrations/llamaindex.d.ts.map +1 -0
- package/dist/integrations/llamaindex.js +92 -0
- package/dist/integrations/llamaindex.js.map +1 -0
- package/package.json +2 -1
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/integrations/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,KAAK,oBAAoB,EAAE,KAAK,QAAQ,EAAE,MAAM,gBAAgB,CAAC;AACzF,OAAO,EAAE,aAAa,EAAE,KAAK,oBAAoB,EAAE,KAAK,aAAa,EAAE,MAAM,iBAAiB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/integrations/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAA4C,MAAM,gBAAgB,CAAC;AACzF,OAAO,EAAE,aAAa,EAAiD,MAAM,iBAAiB,CAAC"}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel LangChain.js Document Loader
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { WebPeelLoader } from 'webpeel/integrations/langchain';
|
|
6
|
+
* const loader = new WebPeelLoader({ url: 'https://example.com' });
|
|
7
|
+
* const docs = await loader.load();
|
|
8
|
+
*/
|
|
9
|
+
import type { PeelOptions } from '../types.js';
|
|
10
|
+
/**
 * Structural equivalent of a LangChain document.
 * Declared locally so that this package does not need LangChain as a dependency.
 */
export interface Document {
    /** Text carried by the document. */
    pageContent: string;
    /** Arbitrary key/value data attached to the document. */
    metadata: {
        [key: string]: any;
    };
}
|
|
15
|
+
/** Configuration accepted by the WebPeelLoader constructor. */
export interface WebPeelLoaderOptions {
    /** URL to fetch; used when `urls` is not supplied. */
    url: string;
    /** List of URLs to fetch; when supplied it is used instead of `url`. */
    urls?: Array<string>;
    /**
     * Scraping mode: 'scrape' for single page, 'crawl' for following links.
     * NOTE(review): the loader implementation shipped alongside this
     * declaration does not appear to read this field - confirm before relying on it.
     */
    mode?: 'scrape' | 'crawl';
    /** Output format; the loader requests 'markdown' when this is not set. */
    format?: 'markdown' | 'text' | 'html' | 'clean';
    /** Use headless browser. */
    render?: boolean;
    /** Stealth mode for anti-bot. */
    stealth?: boolean;
    /** Token budget per page. */
    budget?: number;
    /** Proxy URL. */
    proxy?: string;
    /** Multiple proxies for rotation. */
    proxies?: Array<string>;
    /** CSS selector to extract. */
    selector?: string;
    /** Enable chunking for RAG; each chunk then becomes its own Document. */
    chunk?: boolean;
    /** Max tokens per chunk (default: 512). */
    chunkSize?: number;
    /** Chunk overlap tokens (default: 50). */
    chunkOverlap?: number;
    /** Additional PeelOptions; merged last, so they win over the fields above. */
    peelOptions?: Partial<PeelOptions>;
}
|
|
45
|
+
/**
 * LangChain.js document loader backed by WebPeel.
 *
 * Compatible with LangChain's BaseDocumentLoader interface: it offers
 * `load()` and `lazyLoad()` and produces Document objects made of
 * `pageContent` and `metadata`.
 */
export declare class WebPeelLoader {
    private options;
    constructor(options: WebPeelLoaderOptions);
    /**
     * Fetch the configured URL(s) and resolve with the resulting documents.
     * With chunking enabled, one Document is produced per chunk.
     */
    load(): Promise<Array<Document>>;
    /**
     * Async-generator counterpart of `load()`: the same documents,
     * handed out one at a time.
     */
    lazyLoad(): AsyncGenerator<Document>;
}
|
|
65
|
+
//# sourceMappingURL=langchain.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"langchain.d.ts","sourceRoot":"","sources":["../../src/integrations/langchain.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE/C,+EAA+E;AAC/E,MAAM,WAAW,QAAQ;IACvB,WAAW,EAAE,MAAM,CAAC;IACpB,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAC/B;AAED,MAAM,WAAW,oBAAoB;IACnC,mBAAmB;IACnB,GAAG,EAAE,MAAM,CAAC;IACZ,6BAA6B;IAC7B,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,2EAA2E;IAC3E,IAAI,CAAC,EAAE,QAAQ,GAAG,OAAO,CAAC;IAC1B,oBAAoB;IACpB,MAAM,CAAC,EAAE,UAAU,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;IAChD,2BAA2B;IAC3B,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,gCAAgC;IAChC,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,4BAA4B;IAC5B,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,gBAAgB;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,oCAAoC;IACpC,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,8BAA8B;IAC9B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,8BAA8B;IAC9B,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,0CAA0C;IAC1C,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,yCAAyC;IACzC,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,6BAA6B;IAC7B,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;CACpC;AAED;;;;;GAKG;AACH,qBAAa,aAAa;IACxB,OAAO,CAAC,OAAO,CAAuB;gBAE1B,OAAO,EAAE,oBAAoB;IAIzC;;;OAGG;IACG,IAAI,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC;IAmFjC;;;OAGG;IACI,QAAQ,IAAI,cAAc,CAAC,QAAQ,CAAC;CAM5C"}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel LangChain.js Document Loader
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { WebPeelLoader } from 'webpeel/integrations/langchain';
|
|
6
|
+
* const loader = new WebPeelLoader({ url: 'https://example.com' });
|
|
7
|
+
* const docs = await loader.load();
|
|
8
|
+
*/
|
|
9
|
+
import { peel } from '../index.js';
|
|
10
|
+
import { chunkContent } from '../core/chunker.js';
|
|
11
|
+
/**
|
|
12
|
+
* WebPeel Document Loader for LangChain.js
|
|
13
|
+
*
|
|
14
|
+
* Compatible with LangChain's BaseDocumentLoader interface.
|
|
15
|
+
* Returns Document[] with pageContent and metadata.
|
|
16
|
+
*/
|
|
17
|
+
export class WebPeelLoader {
|
|
18
|
+
options;
|
|
19
|
+
constructor(options) {
|
|
20
|
+
this.options = options;
|
|
21
|
+
}
|
|
22
|
+
/**
|
|
23
|
+
* Load documents from the configured URL(s).
|
|
24
|
+
* If chunking is enabled, each chunk becomes a separate Document.
|
|
25
|
+
*/
|
|
26
|
+
async load() {
|
|
27
|
+
const urls = this.options.urls || [this.options.url];
|
|
28
|
+
const documents = [];
|
|
29
|
+
for (const url of urls) {
|
|
30
|
+
try {
|
|
31
|
+
const peelOpts = {
|
|
32
|
+
format: this.options.format || 'markdown',
|
|
33
|
+
render: this.options.render,
|
|
34
|
+
stealth: this.options.stealth,
|
|
35
|
+
budget: this.options.budget,
|
|
36
|
+
proxy: this.options.proxy,
|
|
37
|
+
proxies: this.options.proxies,
|
|
38
|
+
selector: this.options.selector,
|
|
39
|
+
...this.options.peelOptions,
|
|
40
|
+
};
|
|
41
|
+
// Remove undefined values
|
|
42
|
+
Object.keys(peelOpts).forEach(key => {
|
|
43
|
+
if (peelOpts[key] === undefined)
|
|
44
|
+
delete peelOpts[key];
|
|
45
|
+
});
|
|
46
|
+
const result = await peel(url, peelOpts);
|
|
47
|
+
if (this.options.chunk) {
|
|
48
|
+
// Split into chunks, each becomes a Document
|
|
49
|
+
const chunkResult = chunkContent(result.content, {
|
|
50
|
+
maxTokens: this.options.chunkSize || 512,
|
|
51
|
+
overlap: this.options.chunkOverlap || 50,
|
|
52
|
+
strategy: 'section',
|
|
53
|
+
});
|
|
54
|
+
for (const chunk of chunkResult.chunks) {
|
|
55
|
+
documents.push({
|
|
56
|
+
pageContent: chunk.text,
|
|
57
|
+
metadata: {
|
|
58
|
+
source: url,
|
|
59
|
+
title: result.metadata?.title || '',
|
|
60
|
+
description: result.metadata?.description || '',
|
|
61
|
+
chunkIndex: chunk.index,
|
|
62
|
+
totalChunks: chunkResult.totalChunks,
|
|
63
|
+
section: chunk.section,
|
|
64
|
+
sectionDepth: chunk.sectionDepth,
|
|
65
|
+
tokenCount: chunk.tokenCount,
|
|
66
|
+
wordCount: chunk.wordCount,
|
|
67
|
+
fetchedAt: result.metadata?.fetchedAt || new Date().toISOString(),
|
|
68
|
+
method: result.metadata?.method || 'unknown',
|
|
69
|
+
},
|
|
70
|
+
});
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
else {
|
|
74
|
+
// Single document per URL
|
|
75
|
+
documents.push({
|
|
76
|
+
pageContent: result.content,
|
|
77
|
+
metadata: {
|
|
78
|
+
source: url,
|
|
79
|
+
title: result.metadata?.title || '',
|
|
80
|
+
description: result.metadata?.description || '',
|
|
81
|
+
wordCount: result.metadata?.wordCount || 0,
|
|
82
|
+
language: result.metadata?.language || '',
|
|
83
|
+
fetchedAt: result.metadata?.fetchedAt || new Date().toISOString(),
|
|
84
|
+
method: result.metadata?.method || 'unknown',
|
|
85
|
+
contentType: result.metadata?.contentType || '',
|
|
86
|
+
statusCode: result.metadata?.statusCode || 0,
|
|
87
|
+
},
|
|
88
|
+
});
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
catch (error) {
|
|
92
|
+
// Include failed URLs as empty documents with error metadata
|
|
93
|
+
documents.push({
|
|
94
|
+
pageContent: '',
|
|
95
|
+
metadata: {
|
|
96
|
+
source: url,
|
|
97
|
+
error: error instanceof Error ? error.message : String(error),
|
|
98
|
+
fetchedAt: new Date().toISOString(),
|
|
99
|
+
},
|
|
100
|
+
});
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
return documents;
|
|
104
|
+
}
|
|
105
|
+
/**
|
|
106
|
+
* Lazy load documents one at a time (async generator).
|
|
107
|
+
* Useful for large URL lists to avoid memory pressure.
|
|
108
|
+
*/
|
|
109
|
+
async *lazyLoad() {
|
|
110
|
+
const docs = await this.load();
|
|
111
|
+
for (const doc of docs) {
|
|
112
|
+
yield doc;
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
//# sourceMappingURL=langchain.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"langchain.js","sourceRoot":"","sources":["../../src/integrations/langchain.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AACnC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAwClD;;;;;GAKG;AACH,MAAM,OAAO,aAAa;IAChB,OAAO,CAAuB;IAEtC,YAAY,OAA6B;QACvC,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,IAAI;QACR,MAAM,IAAI,GAAG,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACrD,MAAM,SAAS,GAAe,EAAE,CAAC;QAEjC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAyB;oBACrC,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM,IAAI,UAAU;oBACzC,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM;oBAC3B,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO;oBAC7B,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM;oBAC3B,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;oBACzB,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO;oBAC7B,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ;oBAC/B,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW;iBAC5B,CAAC;gBAEF,0BAA0B;gBAC1B,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;oBAClC,IAAK,QAAgB,CAAC,GAAG,CAAC,KAAK,SAAS;wBAAE,OAAQ,QAAgB,CAAC,GAAG,CAAC,CAAC;gBAC1E,CAAC,CAAC,CAAC;gBAEH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE,QAAuB,CAAC,CAAC;gBAExD,IAAI,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;oBACvB,6CAA6C;oBAC7C,MAAM,WAAW,GAAG,YAAY,CAAC,MAAM,CAAC,OAAO,EAAE;wBAC/C,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG;wBACxC,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,YAAY,IAAI,EAAE;wBACxC,QAAQ,EAAE,SAAS;qBACpB,CAAC,CAAC;oBAEH,KAAK,MAAM,KAAK,IAAI,WAAW,CAAC,MAAM,EAAE,CAAC;wBACvC,SAAS,CAAC,IAAI,CAAC;4BACb,WAAW,EAAE,KAAK,CAAC,IAAI;4BACvB,QAAQ,EAAE;gCACR,MAAM,EAAE,GAAG;gCACX,KAAK,EAAE,MAAM,CAAC,QAAQ,EAAE,KAAK,IAAI,EAAE;gCACnC,WAAW,EAAE,MAAM,CAAC,QAAQ,EAAE,WAAW,IAAI,EAAE;gCAC/C,UAAU,EAAE,KAAK,CAAC,KAAK;gCACvB,WAAW,EAAE,WAAW,CAAC,WAAW;gCACpC,OAAO,EAAE,KAAK,CAAC,OAAO;gCACtB,YAAY,EAAE,KAAK,CAAC,YAAY;gCAChC,UAAU,EAAE,KAAK,CAAC,UAAU;gCAC5B,SAAS,EAAE,KAAK,CAAC,SAAS;gCAC1B,SAAS,EAAE,MAAM,CAAC,QAAQ,EAAE,SAAS,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;gCACjE,MAAM,EAAE,MAAM,CAAC,QAAQ,EAAE,MAAM,IAAI,SAAS;6BAC
7C;yBACF,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC;qBAAM,CAAC;oBACN,0BAA0B;oBAC1B,SAAS,CAAC,IAAI,CAAC;wBACb,WAAW,EAAE,MAAM,CAAC,OAAO;wBAC3B,QAAQ,EAAE;4BACR,MAAM,EAAE,GAAG;4BACX,KAAK,EAAE,MAAM,CAAC,QAAQ,EAAE,KAAK,IAAI,EAAE;4BACnC,WAAW,EAAE,MAAM,CAAC,QAAQ,EAAE,WAAW,IAAI,EAAE;4BAC/C,SAAS,EAAE,MAAM,CAAC,QAAQ,EAAE,SAAS,IAAI,CAAC;4BAC1C,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,QAAQ,IAAI,EAAE;4BACzC,SAAS,EAAE,MAAM,CAAC,QAAQ,EAAE,SAAS,IAAI,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;4BACjE,MAAM,EAAE,MAAM,CAAC,QAAQ,EAAE,MAAM,IAAI,SAAS;4BAC5C,WAAW,EAAE,MAAM,CAAC,QAAQ,EAAE,WAAW,IAAI,EAAE;4BAC/C,UAAU,EAAE,MAAM,CAAC,QAAQ,EAAE,UAAU,IAAI,CAAC;yBAC7C;qBACF,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,6DAA6D;gBAC7D,SAAS,CAAC,IAAI,CAAC;oBACb,WAAW,EAAE,EAAE;oBACf,QAAQ,EAAE;wBACR,MAAM,EAAE,GAAG;wBACX,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;wBAC7D,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;qBACpC;iBACF,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,SAAS,CAAC;IACnB,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,CAAC,QAAQ;QACb,MAAM,IAAI,GAAG,MAAM,IAAI,CAAC,IAAI,EAAE,CAAC;QAC/B,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,MAAM,GAAG,CAAC;QACZ,CAAC;IACH,CAAC;CACF"}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel LlamaIndex Reader
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { WebPeelReader } from 'webpeel/integrations/llamaindex';
|
|
6
|
+
* const reader = new WebPeelReader();
|
|
7
|
+
* const docs = await reader.loadData('https://example.com');
|
|
8
|
+
*/
|
|
9
|
+
import type { PeelOptions } from '../types.js';
|
|
10
|
+
/**
 * Structural equivalent of a LlamaIndex document, declared locally.
 */
export interface LlamaDocument {
    /** Text carried by the document. */
    text: string;
    /** Arbitrary key/value data attached to the document. */
    metadata: {
        [key: string]: any;
    };
    /** Optional document identifier. */
    id_?: string;
}
|
|
16
|
+
/** Configuration accepted by the WebPeelReader constructor. */
export interface WebPeelReaderOptions {
    /** Output format; the reader requests 'markdown' when this is not set. */
    format?: 'markdown' | 'text' | 'html' | 'clean';
    /** Use headless browser. */
    render?: boolean;
    /** Stealth mode. */
    stealth?: boolean;
    /** Token budget. */
    budget?: number;
    /** Enable chunking; each chunk then becomes its own document. */
    chunk?: boolean;
    /** Max tokens per chunk; the reader uses 512 when this is not set. */
    chunkSize?: number;
    /** Chunk overlap; the reader uses 50 when this is not set. */
    chunkOverlap?: number;
    /** Proxy URL. */
    proxy?: string;
    /** Multiple proxies. */
    proxies?: Array<string>;
    /** Additional PeelOptions; merged last, so they win over the fields above. */
    peelOptions?: Partial<PeelOptions>;
}
|
|
38
|
+
/**
 * LlamaIndex reader backed by WebPeel.
 *
 * Compatible with LlamaIndex's BaseReader interface.
 */
export declare class WebPeelReader {
    private options;
    constructor(options?: WebPeelReaderOptions);
    /**
     * Fetch a single URL or a list of URLs and resolve with the resulting documents.
     */
    loadData(urlOrUrls: string | Array<string>): Promise<Array<LlamaDocument>>;
}
|
|
51
|
+
//# sourceMappingURL=llamaindex.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llamaindex.d.ts","sourceRoot":"","sources":["../../src/integrations/llamaindex.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,aAAa,CAAC;AAE/C,oCAAoC;AACpC,MAAM,WAAW,aAAa;IAC5B,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC9B,GAAG,CAAC,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,oBAAoB;IACnC,oBAAoB;IACpB,MAAM,CAAC,EAAE,UAAU,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC;IAChD,2BAA2B;IAC3B,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,mBAAmB;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,mBAAmB;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,sBAAsB;IACtB,KAAK,CAAC,EAAE,OAAO,CAAC;IAChB,2BAA2B;IAC3B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,oBAAoB;IACpB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,gBAAgB;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,uBAAuB;IACvB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;IACnB,6BAA6B;IAC7B,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;CACpC;AAED;;;;GAIG;AACH,qBAAa,aAAa;IACxB,OAAO,CAAC,OAAO,CAAuB;gBAE1B,OAAO,GAAE,oBAAyB;IAI9C;;OAEG;IACG,QAAQ,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,OAAO,CAAC,aAAa,EAAE,CAAC;CAsEvE"}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebPeel LlamaIndex Reader
|
|
3
|
+
*
|
|
4
|
+
* Usage:
|
|
5
|
+
* import { WebPeelReader } from 'webpeel/integrations/llamaindex';
|
|
6
|
+
* const reader = new WebPeelReader();
|
|
7
|
+
* const docs = await reader.loadData('https://example.com');
|
|
8
|
+
*/
|
|
9
|
+
import { peel } from '../index.js';
|
|
10
|
+
import { chunkContent } from '../core/chunker.js';
|
|
11
|
+
/**
|
|
12
|
+
* WebPeel Reader for LlamaIndex
|
|
13
|
+
*
|
|
14
|
+
* Compatible with LlamaIndex's BaseReader interface.
|
|
15
|
+
*/
|
|
16
|
+
export class WebPeelReader {
|
|
17
|
+
options;
|
|
18
|
+
constructor(options = {}) {
|
|
19
|
+
this.options = options;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Load data from one or more URLs.
|
|
23
|
+
*/
|
|
24
|
+
async loadData(urlOrUrls) {
|
|
25
|
+
const urls = Array.isArray(urlOrUrls) ? urlOrUrls : [urlOrUrls];
|
|
26
|
+
const documents = [];
|
|
27
|
+
for (const url of urls) {
|
|
28
|
+
try {
|
|
29
|
+
const peelOpts = {
|
|
30
|
+
format: this.options.format || 'markdown',
|
|
31
|
+
render: this.options.render,
|
|
32
|
+
stealth: this.options.stealth,
|
|
33
|
+
budget: this.options.budget,
|
|
34
|
+
proxy: this.options.proxy,
|
|
35
|
+
proxies: this.options.proxies,
|
|
36
|
+
...this.options.peelOptions,
|
|
37
|
+
};
|
|
38
|
+
Object.keys(peelOpts).forEach(key => {
|
|
39
|
+
if (peelOpts[key] === undefined)
|
|
40
|
+
delete peelOpts[key];
|
|
41
|
+
});
|
|
42
|
+
const result = await peel(url, peelOpts);
|
|
43
|
+
if (this.options.chunk) {
|
|
44
|
+
const chunkResult = chunkContent(result.content, {
|
|
45
|
+
maxTokens: this.options.chunkSize || 512,
|
|
46
|
+
overlap: this.options.chunkOverlap || 50,
|
|
47
|
+
strategy: 'section',
|
|
48
|
+
});
|
|
49
|
+
for (const chunk of chunkResult.chunks) {
|
|
50
|
+
documents.push({
|
|
51
|
+
text: chunk.text,
|
|
52
|
+
id_: `${url}#chunk-${chunk.index}`,
|
|
53
|
+
metadata: {
|
|
54
|
+
url,
|
|
55
|
+
title: result.metadata?.title || '',
|
|
56
|
+
chunkIndex: chunk.index,
|
|
57
|
+
totalChunks: chunkResult.totalChunks,
|
|
58
|
+
section: chunk.section,
|
|
59
|
+
tokenCount: chunk.tokenCount,
|
|
60
|
+
},
|
|
61
|
+
});
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
else {
|
|
65
|
+
documents.push({
|
|
66
|
+
text: result.content,
|
|
67
|
+
id_: url,
|
|
68
|
+
metadata: {
|
|
69
|
+
url,
|
|
70
|
+
title: result.metadata?.title || '',
|
|
71
|
+
description: result.metadata?.description || '',
|
|
72
|
+
wordCount: result.metadata?.wordCount || 0,
|
|
73
|
+
language: result.metadata?.language || '',
|
|
74
|
+
},
|
|
75
|
+
});
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
catch (error) {
|
|
79
|
+
documents.push({
|
|
80
|
+
text: '',
|
|
81
|
+
id_: url,
|
|
82
|
+
metadata: {
|
|
83
|
+
url,
|
|
84
|
+
error: error instanceof Error ? error.message : String(error),
|
|
85
|
+
},
|
|
86
|
+
});
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
return documents;
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
//# sourceMappingURL=llamaindex.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"llamaindex.js","sourceRoot":"","sources":["../../src/integrations/llamaindex.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,IAAI,EAAE,MAAM,aAAa,CAAC;AACnC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAiClD;;;;GAIG;AACH,MAAM,OAAO,aAAa;IAChB,OAAO,CAAuB;IAEtC,YAAY,UAAgC,EAAE;QAC5C,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED;;OAEG;IACH,KAAK,CAAC,QAAQ,CAAC,SAA4B;QACzC,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC;QAChE,MAAM,SAAS,GAAoB,EAAE,CAAC;QAEtC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,IAAI,CAAC;gBACH,MAAM,QAAQ,GAAyB;oBACrC,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM,IAAI,UAAU;oBACzC,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM;oBAC3B,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO;oBAC7B,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM;oBAC3B,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;oBACzB,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,OAAO;oBAC7B,GAAG,IAAI,CAAC,OAAO,CAAC,WAAW;iBAC5B,CAAC;gBAEF,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,GAAG,CAAC,EAAE;oBAClC,IAAK,QAAgB,CAAC,GAAG,CAAC,KAAK,SAAS;wBAAE,OAAQ,QAAgB,CAAC,GAAG,CAAC,CAAC;gBAC1E,CAAC,CAAC,CAAC;gBAEH,MAAM,MAAM,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE,QAAuB,CAAC,CAAC;gBAExD,IAAI,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,CAAC;oBACvB,MAAM,WAAW,GAAG,YAAY,CAAC,MAAM,CAAC,OAAO,EAAE;wBAC/C,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG;wBACxC,OAAO,EAAE,IAAI,CAAC,OAAO,CAAC,YAAY,IAAI,EAAE;wBACxC,QAAQ,EAAE,SAAS;qBACpB,CAAC,CAAC;oBAEH,KAAK,MAAM,KAAK,IAAI,WAAW,CAAC,MAAM,EAAE,CAAC;wBACvC,SAAS,CAAC,IAAI,CAAC;4BACb,IAAI,EAAE,KAAK,CAAC,IAAI;4BAChB,GAAG,EAAE,GAAG,GAAG,UAAU,KAAK,CAAC,KAAK,EAAE;4BAClC,QAAQ,EAAE;gCACR,GAAG;gCACH,KAAK,EAAE,MAAM,CAAC,QAAQ,EAAE,KAAK,IAAI,EAAE;gCACnC,UAAU,EAAE,KAAK,CAAC,KAAK;gCACvB,WAAW,EAAE,WAAW,CAAC,WAAW;gCACpC,OAAO,EAAE,KAAK,CAAC,OAAO;gCACtB,UAAU,EAAE,KAAK,CAAC,UAAU;6BAC7B;yBACF,CAAC,CAAC;oBACL,CAAC;gBACH,CAAC;qBAAM,CAAC;oBACN,SAAS,CAAC,IAAI,CAAC;wBACb,IAAI,EAAE,MAAM,CAAC,OAAO;wBACpB,GAAG,EAAE,GAAG;wBACR,QAAQ,EAAE;4BACR,GAAG;4BACH,KAAK,EAAE,MAAM,CAAC,QAAQ,EAAE,KAAK,IAAI,EAAE;4BACnC,WAAW,EAAE,MAAM,CAAC,QAAQ
,EAAE,WAAW,IAAI,EAAE;4BAC/C,SAAS,EAAE,MAAM,CAAC,QAAQ,EAAE,SAAS,IAAI,CAAC;4BAC1C,QAAQ,EAAE,MAAM,CAAC,QAAQ,EAAE,QAAQ,IAAI,EAAE;yBAC1C;qBACF,CAAC,CAAC;gBACL,CAAC;YACH,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,SAAS,CAAC,IAAI,CAAC;oBACb,IAAI,EAAE,EAAE;oBACR,GAAG,EAAE,GAAG;oBACR,QAAQ,EAAE;wBACR,GAAG;wBACH,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC;qBAC9D;iBACF,CAAC,CAAC;YACL,CAAC;QACH,CAAC;QAED,OAAO,SAAS,CAAC;IACnB,CAAC;CACF"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.17.0",
|
|
3
|
+
"version": "0.17.1",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|
|
@@ -43,6 +43,7 @@
|
|
|
43
43
|
"dist/types.d.ts",
|
|
44
44
|
"dist/types.d.ts.map",
|
|
45
45
|
"dist/core",
|
|
46
|
+
"dist/integrations",
|
|
46
47
|
"dist/mcp",
|
|
47
48
|
"README.md",
|
|
48
49
|
"LICENSE",
|