scrapex 1.0.0-alpha.1 → 1.0.0-beta.2
This diff shows the published contents of two package versions as they appear in a supported public registry. It is provided for informational purposes only.
- package/README.md +164 -5
- package/dist/embeddings/index.cjs +52 -0
- package/dist/embeddings/index.d.cts +3 -0
- package/dist/embeddings/index.d.mts +3 -0
- package/dist/embeddings/index.mjs +4 -0
- package/dist/embeddings-BjNTQSG9.cjs +1455 -0
- package/dist/embeddings-BjNTQSG9.cjs.map +1 -0
- package/dist/embeddings-Bsymy_jA.mjs +1215 -0
- package/dist/embeddings-Bsymy_jA.mjs.map +1 -0
- package/dist/{enhancer-oM4BhYYS.cjs → enhancer-Cs_WyWtJ.cjs} +2 -51
- package/dist/enhancer-Cs_WyWtJ.cjs.map +1 -0
- package/dist/{enhancer-Q6CSc1gA.mjs → enhancer-INx5NlgO.mjs} +2 -45
- package/dist/enhancer-INx5NlgO.mjs.map +1 -0
- package/dist/http-base-CHLf-Tco.cjs +684 -0
- package/dist/http-base-CHLf-Tco.cjs.map +1 -0
- package/dist/http-base-DM7YNo6X.mjs +618 -0
- package/dist/http-base-DM7YNo6X.mjs.map +1 -0
- package/dist/index-Bvseqli-.d.cts +268 -0
- package/dist/index-Bvseqli-.d.cts.map +1 -0
- package/dist/index-CIFjNySr.d.mts +268 -0
- package/dist/index-CIFjNySr.d.mts.map +1 -0
- package/dist/index-D6qfjmZQ.d.mts +401 -0
- package/dist/index-D6qfjmZQ.d.mts.map +1 -0
- package/dist/index-RFSpP5g8.d.cts +401 -0
- package/dist/index-RFSpP5g8.d.cts.map +1 -0
- package/dist/index.cjs +171 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +61 -2
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +61 -2
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +129 -6
- package/dist/index.mjs.map +1 -1
- package/dist/llm/index.cjs +252 -233
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +132 -85
- package/dist/llm/index.d.cts.map +1 -1
- package/dist/llm/index.d.mts +132 -85
- package/dist/llm/index.d.mts.map +1 -1
- package/dist/llm/index.mjs +244 -236
- package/dist/llm/index.mjs.map +1 -1
- package/dist/parsers/index.cjs +10 -199
- package/dist/parsers/index.d.cts +2 -133
- package/dist/parsers/index.d.mts +2 -133
- package/dist/parsers/index.mjs +2 -191
- package/dist/parsers-Bneuws8x.cjs +569 -0
- package/dist/parsers-Bneuws8x.cjs.map +1 -0
- package/dist/parsers-DsawHeo0.mjs +482 -0
- package/dist/parsers-DsawHeo0.mjs.map +1 -0
- package/dist/types-BOcHQU9s.d.mts +831 -0
- package/dist/types-BOcHQU9s.d.mts.map +1 -0
- package/dist/types-DutdBpqd.d.cts +831 -0
- package/dist/types-DutdBpqd.d.cts.map +1 -0
- package/package.json +15 -16
- package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
- package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
- package/dist/parsers/index.cjs.map +0 -1
- package/dist/parsers/index.d.cts.map +0 -1
- package/dist/parsers/index.d.mts.map +0 -1
- package/dist/parsers/index.mjs.map +0 -1
- package/dist/types-CNQZVW36.d.mts +0 -150
- package/dist/types-CNQZVW36.d.mts.map +0 -1
- package/dist/types-D0HYR95H.d.cts +0 -150
- package/dist/types-D0HYR95H.d.cts.map +0 -1
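The new `http-base-*` chunks carry the shared `ScrapeError` class and fetcher used across entry points, and the fetcher gains an `allowedContentTypes` option in this release (see the `NativeFetcher` hunks in the diff below). A minimal usage sketch, based only on what the diff shows and assuming the package's main entry exposes `defaultFetcher` and `ScrapeError` as the updated exports list indicates:

```js
// Sketch only: combines the new allowedContentTypes fetch option with
// ScrapeError handling, both visible in the NativeFetcher hunks below.
const { defaultFetcher, ScrapeError } = require("scrapex");

async function fetchXml(url) {
  try {
    const result = await defaultFetcher.fetch(url, {
      timeout: 10000,
      // Without this option, anything that is not text/html or application/xhtml
      // is rejected; listed types are matched case-insensitively.
      allowedContentTypes: ["application/rss+xml", "application/xml", "text/xml"]
    });
    return result.html;
  } catch (error) {
    // The fetcher now throws ScrapeError with codes such as "NOT_FOUND",
    // "BLOCKED", "TIMEOUT", "FETCH_FAILED", and "PARSE_ERROR" (per the diff).
    if (error instanceof ScrapeError) return null;
    throw error;
  }
}
```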
package/dist/index.cjs
CHANGED
@@ -1,36 +1,12 @@
-
-
-
-
-var __getOwnPropNames = Object.getOwnPropertyNames;
-var __getProtoOf = Object.getPrototypeOf;
-var __hasOwnProp = Object.prototype.hasOwnProperty;
-var __copyProps = (to, from, except, desc) => {
-if (from && typeof from === "object" || typeof from === "function") {
-for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
-key = keys[i];
-if (!__hasOwnProp.call(to, key) && key !== except) {
-__defProp(to, key, {
-get: ((k) => from[k]).bind(null, key),
-enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
-});
-}
-}
-}
-return to;
-};
-var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", {
-value: mod,
-enumerable: true
-}) : target, mod));
-
-//#endregion
-const require_enhancer = require('./enhancer-oM4BhYYS.cjs');
+const require_parsers = require('./parsers-Bneuws8x.cjs');
+const require_http_base = require('./http-base-CHLf-Tco.cjs');
+const require_embeddings = require('./embeddings-BjNTQSG9.cjs');
+const require_enhancer = require('./enhancer-Cs_WyWtJ.cjs');
 let cheerio = require("cheerio");
-cheerio = __toESM(cheerio);
-let
+cheerio = require_parsers.__toESM(cheerio);
+let _mozilla_readability = require("@mozilla/readability");
 let turndown = require("turndown");
-turndown = __toESM(turndown);
+turndown = require_parsers.__toESM(turndown);
 
 //#region src/core/context.ts
 let jsdomModule = null;
@@ -109,7 +85,7 @@ var ContentExtractor = class {
 async extract(context) {
 const { options } = context;
 if (options.extractContent === false) return {};
-const article = new
+const article = new _mozilla_readability.Readability(context.getDocument().cloneNode(true)).parse();
 if (!article || !article.content) return this.extractFallback(context);
 let content = turndown$1.turndown(article.content);
 const maxLength = options.maxContentLength ?? 5e4;
@@ -533,9 +509,9 @@ var NativeFetcher = class {
 try {
 parsedUrl = new URL(url);
 } catch {
-throw new
+throw new require_http_base.ScrapeError(`Invalid URL: ${url}`, "INVALID_URL");
 }
-if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new
+if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new require_http_base.ScrapeError(`Invalid protocol: ${parsedUrl.protocol}`, "INVALID_URL");
 const controller = new AbortController();
 const timeoutId = setTimeout(() => controller.abort(), timeout);
 try {
@@ -551,13 +527,15 @@ var NativeFetcher = class {
 });
 clearTimeout(timeoutId);
 if (!response.ok) {
-if (response.status === 404) throw new
-if (response.status === 403 || response.status === 401) throw new
-if (response.status === 429) throw new
-throw new
+if (response.status === 404) throw new require_http_base.ScrapeError(`Page not found: ${url}`, "NOT_FOUND", 404);
+if (response.status === 403 || response.status === 401) throw new require_http_base.ScrapeError(`Access blocked: ${url}`, "BLOCKED", response.status);
+if (response.status === 429) throw new require_http_base.ScrapeError(`Rate limited: ${url}`, "BLOCKED", 429);
+throw new require_http_base.ScrapeError(`HTTP error ${response.status}: ${url}`, "FETCH_FAILED", response.status);
 }
 const contentType = response.headers.get("content-type") || "";
-if (
+if (options.allowedContentTypes) {
+if (!options.allowedContentTypes.some((type) => contentType.toLowerCase().includes(type.toLowerCase()))) throw new require_http_base.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
+} else if (!contentType.includes("text/html") && !contentType.includes("application/xhtml")) throw new require_http_base.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
 const html = await response.text();
 const responseHeaders = {};
 response.headers.forEach((value, key) => {
@@ -572,10 +550,10 @@ var NativeFetcher = class {
 };
 } catch (error) {
 clearTimeout(timeoutId);
-if (error instanceof
-if (error instanceof Error && error.name === "AbortError") throw new
-if (error instanceof Error) throw new
-throw new
+if (error instanceof require_http_base.ScrapeError) throw error;
+if (error instanceof Error && error.name === "AbortError") throw new require_http_base.ScrapeError(`Request timed out after ${timeout}ms`, "TIMEOUT");
+if (error instanceof Error) throw new require_http_base.ScrapeError(`Fetch failed: ${error.message}`, "FETCH_FAILED", void 0, error);
+throw new require_http_base.ScrapeError("Unknown fetch error", "FETCH_FAILED");
 }
 }
 };
@@ -683,11 +661,11 @@ function matchesPattern(path, pattern) {
 */
 async function scrape(url, options = {}) {
 const startTime = Date.now();
-if (!isValidUrl(url)) throw new
+if (!isValidUrl(url)) throw new require_http_base.ScrapeError("Invalid URL provided", "INVALID_URL");
 const normalizedUrl = normalizeUrl(url);
 if (options.respectRobots) {
 const robotsResult = await checkRobotsTxt(normalizedUrl, options.userAgent);
-if (!robotsResult.allowed) throw new
+if (!robotsResult.allowed) throw new require_http_base.ScrapeError(`URL blocked by robots.txt: ${robotsResult.reason || "disallowed"}`, "ROBOTS_BLOCKED");
 }
 const fetchResult = await (options.fetcher ?? defaultFetcher).fetch(normalizedUrl, {
 timeout: options.timeout,
@@ -748,6 +726,7 @@ async function scrape(url, options = {}) {
 console.error("LLM extraction failed:", error);
 intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM extraction: ${error instanceof Error ? error.message : String(error)}` : `LLM extraction: ${error instanceof Error ? error.message : String(error)}`;
 }
+if (options.embeddings) intermediateResult.embeddings = await require_embeddings.generateEmbeddings(intermediateResult, options.embeddings);
 const scrapeTimeMs = Date.now() - startTime;
 return {
 ...intermediateResult,
@@ -770,7 +749,7 @@ async function scrape(url, options = {}) {
 */
 async function scrapeHtml(html, url, options = {}) {
 const startTime = Date.now();
-if (!isValidUrl(url)) throw new
+if (!isValidUrl(url)) throw new require_http_base.ScrapeError("Invalid URL provided", "INVALID_URL");
 const normalizedUrl = normalizeUrl(url);
 await preloadJsdom();
 let context = createExtractionContext(normalizedUrl, normalizedUrl, html, options);
@@ -788,9 +767,8 @@ async function scrapeHtml(html, url, options = {}) {
 console.error(`Extractor "${extractor.name}" failed:`, error);
 context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
 }
-const scrapeTimeMs = Date.now() - startTime;
 const domain = extractDomain(normalizedUrl);
-
+const intermediateResult = {
 url: normalizedUrl,
 canonicalUrl: context.results.canonicalUrl || normalizedUrl,
 domain,
@@ -817,9 +795,127 @@ async function scrapeHtml(html, url, options = {}) {
 extracted: context.results.extracted,
 custom: context.results.custom,
 scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
-scrapeTimeMs,
+scrapeTimeMs: 0,
 error: context.results.error
 };
+if (options.embeddings) intermediateResult.embeddings = await require_embeddings.generateEmbeddings(intermediateResult, options.embeddings);
+const scrapeTimeMs = Date.now() - startTime;
+return {
+...intermediateResult,
+scrapeTimeMs
+};
+}
+
+//#endregion
+//#region src/utils/feed.ts
+/**
+* Fetch and parse an RSS/Atom feed from a URL.
+* Uses scrapex's fetcher infrastructure for consistent behavior.
+*/
+async function fetchFeed(url, options) {
+const result = await (options?.fetcher || defaultFetcher).fetch(url, {
+timeout: options?.timeout,
+userAgent: options?.userAgent,
+allowedContentTypes: [
+"application/rss+xml",
+"application/atom+xml",
+"application/rdf+xml",
+"application/xml",
+"text/xml",
+"text/html"
+]
+});
+return new require_parsers.RSSParser(options?.parserOptions).parse(result.html, url);
+}
+/**
+* Detect RSS/Atom feed URLs from HTML.
+* Supports RSS, Atom, and RDF feed types.
+*/
+function discoverFeeds(html, baseUrl) {
+const $ = cheerio.load(html);
+const feeds = [];
+const seen = /* @__PURE__ */ new Set();
+$([
+"link[type=\"application/rss+xml\"]",
+"link[type=\"application/atom+xml\"]",
+"link[type=\"application/rdf+xml\"]",
+"link[rel=\"alternate\"][type*=\"xml\"]"
+].join(", ")).each((_, el) => {
+const href = $(el).attr("href");
+if (href) try {
+const resolved = new URL(href, baseUrl).href;
+if (!seen.has(resolved)) {
+seen.add(resolved);
+feeds.push(resolved);
+}
+} catch {}
+});
+return feeds;
+}
+/**
+* Filter feed items by date range.
+* Items without publishedAt are included by default.
+*/
+function filterByDate(items, options) {
+const { after, before, includeUndated = true } = options;
+return items.filter((item) => {
+if (!item.publishedAt) return includeUndated;
+const date = new Date(item.publishedAt);
+if (after && date < after) return false;
+if (before && date > before) return false;
+return true;
+});
+}
+/**
+* Convert feed items to markdown for LLM consumption.
+* Uses ISO 8601 date format for consistency across environments.
+*/
+function feedToMarkdown(feed, options) {
+const { includeContent = false, maxItems } = options || {};
+const lines = [`# ${feed.title}`, ""];
+if (feed.description) lines.push(feed.description, "");
+const items = maxItems ? feed.items.slice(0, maxItems) : feed.items;
+for (const item of items) {
+lines.push(`## ${item.title}`);
+if (item.publishedAt) {
+const date = item.publishedAt.split("T")[0];
+lines.push(`*${date}*`);
+}
+lines.push("");
+if (includeContent && item.content) lines.push(item.content);
+else if (item.description) lines.push(item.description);
+if (item.link) lines.push(`[Read more](${item.link})`, "");
+else lines.push("");
+}
+return lines.join("\n");
+}
+/**
+* Extract plain text from feed items for LLM processing.
+* Concatenates title, description, and content.
+*/
+function feedToText(feed, options) {
+const { maxItems, separator = "\n\n---\n\n" } = options || {};
+return (maxItems ? feed.items.slice(0, maxItems) : feed.items).map((item) => {
+const parts = [item.title];
+if (item.description) parts.push(item.description);
+if (item.content) parts.push(item.content);
+return parts.join("\n\n");
+}).join(separator);
+}
+/**
+* Paginate through a feed using rel="next" links (RFC 5005).
+* Returns an async generator that yields each page.
+*/
+async function* paginateFeed(url, options) {
+const { maxPages = 10, ...fetchOptions } = options || {};
+let currentUrl = url;
+let pageCount = 0;
+while (currentUrl && pageCount < maxPages) {
+const result = await fetchFeed(currentUrl, fetchOptions);
+yield result.data;
+currentUrl = result.data.next;
+pageCount++;
+}
 }
 
 //#endregion
@@ -827,17 +923,39 @@ exports.ContentExtractor = ContentExtractor;
 exports.DEFAULT_TIMEOUT = DEFAULT_TIMEOUT;
 exports.DEFAULT_USER_AGENT = DEFAULT_USER_AGENT;
 exports.FaviconExtractor = FaviconExtractor;
+exports.InMemoryEmbeddingCache = require_embeddings.InMemoryEmbeddingCache;
 exports.JsonLdExtractor = JsonLdExtractor;
 exports.LinksExtractor = LinksExtractor;
 exports.MetaExtractor = MetaExtractor;
 exports.NativeFetcher = NativeFetcher;
-exports.
-exports.
+exports.RSSParser = require_parsers.RSSParser;
+exports.ScrapeError = require_http_base.ScrapeError;
+exports.TRANSFORMERS_MODELS = require_embeddings.TRANSFORMERS_MODELS;
+exports.aggregateVectors = require_embeddings.aggregateVectors;
 exports.checkRobotsTxt = checkRobotsTxt;
+exports.chunkText = require_embeddings.chunkText;
+exports.cosineSimilarity = require_embeddings.cosineSimilarity;
+exports.createAzureEmbedding = require_embeddings.createAzureEmbedding;
 exports.createDefaultExtractors = createDefaultExtractors;
+exports.createEmbeddingProvider = require_embeddings.createEmbeddingProvider;
 exports.createExtractionContext = createExtractionContext;
+exports.createHttpEmbedding = require_embeddings.createHttpEmbedding;
+exports.createHuggingFaceEmbedding = require_embeddings.createHuggingFaceEmbedding;
+exports.createOllamaEmbedding = require_embeddings.createOllamaEmbedding;
+exports.createOpenAIEmbedding = require_embeddings.createOpenAIEmbedding;
+exports.createPiiRedactor = require_embeddings.createPiiRedactor;
+exports.createTransformersEmbedding = require_embeddings.createTransformersEmbedding;
 exports.defaultFetcher = defaultFetcher;
+exports.discoverFeeds = discoverFeeds;
+exports.embed = require_embeddings.embed;
+exports.embedScrapedData = require_embeddings.embedScrapedData;
+exports.estimateTokens = require_embeddings.estimateTokens;
 exports.extractDomain = extractDomain;
+exports.feedToMarkdown = feedToMarkdown;
+exports.feedToText = feedToText;
+exports.fetchFeed = fetchFeed;
+exports.filterByDate = filterByDate;
+exports.generateEmbeddings = require_embeddings.generateEmbeddings;
 exports.getPath = getPath;
 exports.getProtocol = getProtocol;
 exports.isExternalUrl = isExternalUrl;
@@ -845,6 +963,8 @@ exports.isValidUrl = isValidUrl;
 exports.matchesUrlPattern = matchesUrlPattern;
 exports.mergeResults = mergeResults;
 exports.normalizeUrl = normalizeUrl;
+exports.paginateFeed = paginateFeed;
+exports.redactPii = require_embeddings.redactPii;
 exports.resolveUrl = resolveUrl;
 exports.scrape = scrape;
 exports.scrapeHtml = scrapeHtml;