scrapex 1.0.0-alpha.1 → 1.0.0-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +164 -5
  2. package/dist/embeddings/index.cjs +52 -0
  3. package/dist/embeddings/index.d.cts +3 -0
  4. package/dist/embeddings/index.d.mts +3 -0
  5. package/dist/embeddings/index.mjs +4 -0
  6. package/dist/embeddings-BjNTQSG9.cjs +1455 -0
  7. package/dist/embeddings-BjNTQSG9.cjs.map +1 -0
  8. package/dist/embeddings-Bsymy_jA.mjs +1215 -0
  9. package/dist/embeddings-Bsymy_jA.mjs.map +1 -0
  10. package/dist/{enhancer-oM4BhYYS.cjs → enhancer-Cs_WyWtJ.cjs} +2 -51
  11. package/dist/enhancer-Cs_WyWtJ.cjs.map +1 -0
  12. package/dist/{enhancer-Q6CSc1gA.mjs → enhancer-INx5NlgO.mjs} +2 -45
  13. package/dist/enhancer-INx5NlgO.mjs.map +1 -0
  14. package/dist/http-base-CHLf-Tco.cjs +684 -0
  15. package/dist/http-base-CHLf-Tco.cjs.map +1 -0
  16. package/dist/http-base-DM7YNo6X.mjs +618 -0
  17. package/dist/http-base-DM7YNo6X.mjs.map +1 -0
  18. package/dist/index-Bvseqli-.d.cts +268 -0
  19. package/dist/index-Bvseqli-.d.cts.map +1 -0
  20. package/dist/index-CIFjNySr.d.mts +268 -0
  21. package/dist/index-CIFjNySr.d.mts.map +1 -0
  22. package/dist/index-D6qfjmZQ.d.mts +401 -0
  23. package/dist/index-D6qfjmZQ.d.mts.map +1 -0
  24. package/dist/index-RFSpP5g8.d.cts +401 -0
  25. package/dist/index-RFSpP5g8.d.cts.map +1 -0
  26. package/dist/index.cjs +171 -51
  27. package/dist/index.cjs.map +1 -1
  28. package/dist/index.d.cts +61 -2
  29. package/dist/index.d.cts.map +1 -1
  30. package/dist/index.d.mts +61 -2
  31. package/dist/index.d.mts.map +1 -1
  32. package/dist/index.mjs +129 -6
  33. package/dist/index.mjs.map +1 -1
  34. package/dist/llm/index.cjs +252 -233
  35. package/dist/llm/index.cjs.map +1 -1
  36. package/dist/llm/index.d.cts +132 -85
  37. package/dist/llm/index.d.cts.map +1 -1
  38. package/dist/llm/index.d.mts +132 -85
  39. package/dist/llm/index.d.mts.map +1 -1
  40. package/dist/llm/index.mjs +244 -236
  41. package/dist/llm/index.mjs.map +1 -1
  42. package/dist/parsers/index.cjs +10 -199
  43. package/dist/parsers/index.d.cts +2 -133
  44. package/dist/parsers/index.d.mts +2 -133
  45. package/dist/parsers/index.mjs +2 -191
  46. package/dist/parsers-Bneuws8x.cjs +569 -0
  47. package/dist/parsers-Bneuws8x.cjs.map +1 -0
  48. package/dist/parsers-DsawHeo0.mjs +482 -0
  49. package/dist/parsers-DsawHeo0.mjs.map +1 -0
  50. package/dist/types-BOcHQU9s.d.mts +831 -0
  51. package/dist/types-BOcHQU9s.d.mts.map +1 -0
  52. package/dist/types-DutdBpqd.d.cts +831 -0
  53. package/dist/types-DutdBpqd.d.cts.map +1 -0
  54. package/package.json +15 -16
  55. package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
  56. package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
  57. package/dist/parsers/index.cjs.map +0 -1
  58. package/dist/parsers/index.d.cts.map +0 -1
  59. package/dist/parsers/index.d.mts.map +0 -1
  60. package/dist/parsers/index.mjs.map +0 -1
  61. package/dist/types-CNQZVW36.d.mts +0 -150
  62. package/dist/types-CNQZVW36.d.mts.map +0 -1
  63. package/dist/types-D0HYR95H.d.cts +0 -150
  64. package/dist/types-D0HYR95H.d.cts.map +0 -1
package/dist/index.cjs CHANGED
@@ -1,36 +1,12 @@
- //#region rolldown:runtime
- var __create = Object.create;
- var __defProp = Object.defineProperty;
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
- var __getOwnPropNames = Object.getOwnPropertyNames;
- var __getProtoOf = Object.getPrototypeOf;
- var __hasOwnProp = Object.prototype.hasOwnProperty;
- var __copyProps = (to, from, except, desc) => {
- if (from && typeof from === "object" || typeof from === "function") {
- for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
- key = keys[i];
- if (!__hasOwnProp.call(to, key) && key !== except) {
- __defProp(to, key, {
- get: ((k) => from[k]).bind(null, key),
- enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
- });
- }
- }
- }
- return to;
- };
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", {
- value: mod,
- enumerable: true
- }) : target, mod));
-
- //#endregion
- const require_enhancer = require('./enhancer-oM4BhYYS.cjs');
+ const require_parsers = require('./parsers-Bneuws8x.cjs');
+ const require_http_base = require('./http-base-CHLf-Tco.cjs');
+ const require_embeddings = require('./embeddings-BjNTQSG9.cjs');
+ const require_enhancer = require('./enhancer-Cs_WyWtJ.cjs');
  let cheerio = require("cheerio");
- cheerio = __toESM(cheerio);
- let __mozilla_readability = require("@mozilla/readability");
+ cheerio = require_parsers.__toESM(cheerio);
+ let _mozilla_readability = require("@mozilla/readability");
  let turndown = require("turndown");
- turndown = __toESM(turndown);
+ turndown = require_parsers.__toESM(turndown);
 
  //#region src/core/context.ts
  let jsdomModule = null;
@@ -109,7 +85,7 @@ var ContentExtractor = class {
  async extract(context) {
  const { options } = context;
  if (options.extractContent === false) return {};
- const article = new __mozilla_readability.Readability(context.getDocument().cloneNode(true)).parse();
+ const article = new _mozilla_readability.Readability(context.getDocument().cloneNode(true)).parse();
  if (!article || !article.content) return this.extractFallback(context);
  let content = turndown$1.turndown(article.content);
  const maxLength = options.maxContentLength ?? 5e4;
@@ -533,9 +509,9 @@ var NativeFetcher = class {
  try {
  parsedUrl = new URL(url);
  } catch {
- throw new require_enhancer.ScrapeError(`Invalid URL: ${url}`, "INVALID_URL");
+ throw new require_http_base.ScrapeError(`Invalid URL: ${url}`, "INVALID_URL");
  }
- if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new require_enhancer.ScrapeError(`Invalid protocol: ${parsedUrl.protocol}`, "INVALID_URL");
+ if (!["http:", "https:"].includes(parsedUrl.protocol)) throw new require_http_base.ScrapeError(`Invalid protocol: ${parsedUrl.protocol}`, "INVALID_URL");
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);
  try {
@@ -551,13 +527,15 @@ var NativeFetcher = class {
  });
  clearTimeout(timeoutId);
  if (!response.ok) {
- if (response.status === 404) throw new require_enhancer.ScrapeError(`Page not found: ${url}`, "NOT_FOUND", 404);
- if (response.status === 403 || response.status === 401) throw new require_enhancer.ScrapeError(`Access blocked: ${url}`, "BLOCKED", response.status);
- if (response.status === 429) throw new require_enhancer.ScrapeError(`Rate limited: ${url}`, "BLOCKED", 429);
- throw new require_enhancer.ScrapeError(`HTTP error ${response.status}: ${url}`, "FETCH_FAILED", response.status);
+ if (response.status === 404) throw new require_http_base.ScrapeError(`Page not found: ${url}`, "NOT_FOUND", 404);
+ if (response.status === 403 || response.status === 401) throw new require_http_base.ScrapeError(`Access blocked: ${url}`, "BLOCKED", response.status);
+ if (response.status === 429) throw new require_http_base.ScrapeError(`Rate limited: ${url}`, "BLOCKED", 429);
+ throw new require_http_base.ScrapeError(`HTTP error ${response.status}: ${url}`, "FETCH_FAILED", response.status);
  }
  const contentType = response.headers.get("content-type") || "";
- if (!contentType.includes("text/html") && !contentType.includes("application/xhtml")) throw new require_enhancer.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
+ if (options.allowedContentTypes) {
+ if (!options.allowedContentTypes.some((type) => contentType.toLowerCase().includes(type.toLowerCase()))) throw new require_http_base.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
+ } else if (!contentType.includes("text/html") && !contentType.includes("application/xhtml")) throw new require_http_base.ScrapeError(`Unexpected content type: ${contentType}`, "PARSE_ERROR");
  const html = await response.text();
  const responseHeaders = {};
  response.headers.forEach((value, key) => {
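
Editor note: the hunk above makes the content-type check opt-in per request. When allowedContentTypes is supplied, the response is accepted if its content-type contains any listed value (case-insensitive substring match); otherwise the previous HTML/XHTML-only rule still applies. A minimal TypeScript sketch of that rule, written as a standalone helper for illustration (the helper name is hypothetical, not part of the package):

// Hypothetical helper mirroring the content-type gate added to NativeFetcher above.
function isContentTypeAllowed(contentType: string, allowedContentTypes?: string[]): boolean {
  if (allowedContentTypes) {
    // Opt-in list: case-insensitive substring match against the response header.
    return allowedContentTypes.some((type) => contentType.toLowerCase().includes(type.toLowerCase()));
  }
  // Default behavior is unchanged: only HTML/XHTML responses pass.
  return contentType.includes("text/html") || contentType.includes("application/xhtml");
}

// e.g. isContentTypeAllowed("application/rss+xml; charset=utf-8", ["application/rss+xml"]) === true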
@@ -572,10 +550,10 @@ var NativeFetcher = class {
  };
  } catch (error) {
  clearTimeout(timeoutId);
- if (error instanceof require_enhancer.ScrapeError) throw error;
- if (error instanceof Error && error.name === "AbortError") throw new require_enhancer.ScrapeError(`Request timed out after ${timeout}ms`, "TIMEOUT");
- if (error instanceof Error) throw new require_enhancer.ScrapeError(`Fetch failed: ${error.message}`, "FETCH_FAILED", void 0, error);
- throw new require_enhancer.ScrapeError("Unknown fetch error", "FETCH_FAILED");
+ if (error instanceof require_http_base.ScrapeError) throw error;
+ if (error instanceof Error && error.name === "AbortError") throw new require_http_base.ScrapeError(`Request timed out after ${timeout}ms`, "TIMEOUT");
+ if (error instanceof Error) throw new require_http_base.ScrapeError(`Fetch failed: ${error.message}`, "FETCH_FAILED", void 0, error);
+ throw new require_http_base.ScrapeError("Unknown fetch error", "FETCH_FAILED");
  }
  }
  };
@@ -683,11 +661,11 @@ function matchesPattern(path, pattern) {
  */
  async function scrape(url, options = {}) {
  const startTime = Date.now();
- if (!isValidUrl(url)) throw new require_enhancer.ScrapeError("Invalid URL provided", "INVALID_URL");
+ if (!isValidUrl(url)) throw new require_http_base.ScrapeError("Invalid URL provided", "INVALID_URL");
  const normalizedUrl = normalizeUrl(url);
  if (options.respectRobots) {
  const robotsResult = await checkRobotsTxt(normalizedUrl, options.userAgent);
- if (!robotsResult.allowed) throw new require_enhancer.ScrapeError(`URL blocked by robots.txt: ${robotsResult.reason || "disallowed"}`, "ROBOTS_BLOCKED");
+ if (!robotsResult.allowed) throw new require_http_base.ScrapeError(`URL blocked by robots.txt: ${robotsResult.reason || "disallowed"}`, "ROBOTS_BLOCKED");
  }
  const fetchResult = await (options.fetcher ?? defaultFetcher).fetch(normalizedUrl, {
  timeout: options.timeout,
@@ -748,6 +726,7 @@ async function scrape(url, options = {}) {
  console.error("LLM extraction failed:", error);
  intermediateResult.error = intermediateResult.error ? `${intermediateResult.error}; LLM extraction: ${error instanceof Error ? error.message : String(error)}` : `LLM extraction: ${error instanceof Error ? error.message : String(error)}`;
  }
+ if (options.embeddings) intermediateResult.embeddings = await require_embeddings.generateEmbeddings(intermediateResult, options.embeddings);
  const scrapeTimeMs = Date.now() - startTime;
  return {
  ...intermediateResult,
@@ -770,7 +749,7 @@ async function scrape(url, options = {}) {
  */
  async function scrapeHtml(html, url, options = {}) {
  const startTime = Date.now();
- if (!isValidUrl(url)) throw new require_enhancer.ScrapeError("Invalid URL provided", "INVALID_URL");
+ if (!isValidUrl(url)) throw new require_http_base.ScrapeError("Invalid URL provided", "INVALID_URL");
  const normalizedUrl = normalizeUrl(url);
  await preloadJsdom();
  let context = createExtractionContext(normalizedUrl, normalizedUrl, html, options);
@@ -788,9 +767,8 @@ async function scrapeHtml(html, url, options = {}) {
  console.error(`Extractor "${extractor.name}" failed:`, error);
  context = mergeResults(context, { error: context.results.error ? `${context.results.error}; ${extractor.name}: ${error instanceof Error ? error.message : String(error)}` : `${extractor.name}: ${error instanceof Error ? error.message : String(error)}` });
  }
- const scrapeTimeMs = Date.now() - startTime;
  const domain = extractDomain(normalizedUrl);
- return {
+ const intermediateResult = {
  url: normalizedUrl,
  canonicalUrl: context.results.canonicalUrl || normalizedUrl,
  domain,
@@ -817,9 +795,127 @@ async function scrapeHtml(html, url, options = {}) {
  extracted: context.results.extracted,
  custom: context.results.custom,
  scrapedAt: (/* @__PURE__ */ new Date()).toISOString(),
- scrapeTimeMs,
+ scrapeTimeMs: 0,
  error: context.results.error
  };
+ if (options.embeddings) intermediateResult.embeddings = await require_embeddings.generateEmbeddings(intermediateResult, options.embeddings);
+ const scrapeTimeMs = Date.now() - startTime;
+ return {
+ ...intermediateResult,
+ scrapeTimeMs
+ };
+ }
+
+ //#endregion
+ //#region src/utils/feed.ts
+ /**
+ * Fetch and parse an RSS/Atom feed from a URL.
+ * Uses scrapex's fetcher infrastructure for consistent behavior.
+ */
+ async function fetchFeed(url, options) {
+ const result = await (options?.fetcher || defaultFetcher).fetch(url, {
+ timeout: options?.timeout,
+ userAgent: options?.userAgent,
+ allowedContentTypes: [
+ "application/rss+xml",
+ "application/atom+xml",
+ "application/rdf+xml",
+ "application/xml",
+ "text/xml",
+ "text/html"
+ ]
+ });
+ return new require_parsers.RSSParser(options?.parserOptions).parse(result.html, url);
+ }
+ /**
+ * Detect RSS/Atom feed URLs from HTML.
+ * Supports RSS, Atom, and RDF feed types.
+ */
+ function discoverFeeds(html, baseUrl) {
+ const $ = cheerio.load(html);
+ const feeds = [];
+ const seen = /* @__PURE__ */ new Set();
+ $([
+ "link[type=\"application/rss+xml\"]",
+ "link[type=\"application/atom+xml\"]",
+ "link[type=\"application/rdf+xml\"]",
+ "link[rel=\"alternate\"][type*=\"xml\"]"
+ ].join(", ")).each((_, el) => {
+ const href = $(el).attr("href");
+ if (href) try {
+ const resolved = new URL(href, baseUrl).href;
+ if (!seen.has(resolved)) {
+ seen.add(resolved);
+ feeds.push(resolved);
+ }
+ } catch {}
+ });
+ return feeds;
+ }
+ /**
+ * Filter feed items by date range.
+ * Items without publishedAt are included by default.
+ */
+ function filterByDate(items, options) {
+ const { after, before, includeUndated = true } = options;
+ return items.filter((item) => {
+ if (!item.publishedAt) return includeUndated;
+ const date = new Date(item.publishedAt);
+ if (after && date < after) return false;
+ if (before && date > before) return false;
+ return true;
+ });
+ }
+ /**
+ * Convert feed items to markdown for LLM consumption.
+ * Uses ISO 8601 date format for consistency across environments.
+ */
+ function feedToMarkdown(feed, options) {
+ const { includeContent = false, maxItems } = options || {};
+ const lines = [`# ${feed.title}`, ""];
+ if (feed.description) lines.push(feed.description, "");
+ const items = maxItems ? feed.items.slice(0, maxItems) : feed.items;
+ for (const item of items) {
+ lines.push(`## ${item.title}`);
+ if (item.publishedAt) {
+ const date = item.publishedAt.split("T")[0];
+ lines.push(`*${date}*`);
+ }
+ lines.push("");
+ if (includeContent && item.content) lines.push(item.content);
+ else if (item.description) lines.push(item.description);
+ if (item.link) lines.push(`[Read more](${item.link})`, "");
+ else lines.push("");
+ }
+ return lines.join("\n");
+ }
+ /**
+ * Extract plain text from feed items for LLM processing.
+ * Concatenates title, description, and content.
+ */
+ function feedToText(feed, options) {
+ const { maxItems, separator = "\n\n---\n\n" } = options || {};
+ return (maxItems ? feed.items.slice(0, maxItems) : feed.items).map((item) => {
+ const parts = [item.title];
+ if (item.description) parts.push(item.description);
+ if (item.content) parts.push(item.content);
+ return parts.join("\n\n");
+ }).join(separator);
+ }
+ /**
+ * Paginate through a feed using rel="next" links (RFC 5005).
+ * Returns an async generator that yields each page.
+ */
+ async function* paginateFeed(url, options) {
+ const { maxPages = 10, ...fetchOptions } = options || {};
+ let currentUrl = url;
+ let pageCount = 0;
+ while (currentUrl && pageCount < maxPages) {
+ const result = await fetchFeed(currentUrl, fetchOptions);
+ yield result.data;
+ currentUrl = result.data.next;
+ pageCount++;
+ }
+ }
  }
 
  //#endregion
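
Editor note: the new src/utils/feed.ts region above adds a small feed toolkit. fetchFeed downloads and parses an RSS/Atom feed through the package's own fetcher (passing feed media types via the new allowedContentTypes option), discoverFeeds finds feed URLs advertised in an HTML page, filterByDate/feedToMarkdown/feedToText post-process items, and paginateFeed walks rel="next" pages (RFC 5005) as an async generator. A hedged TypeScript usage sketch based only on the call shapes visible in this hunk; the root "scrapex" import path and the exact option/result types are assumptions:

import { discoverFeeds, fetchFeed, feedToMarkdown, paginateFeed } from "scrapex";

// Find a feed advertised by a page and render its newest items as markdown.
async function summarizeFirstFeed(pageHtml: string, pageUrl: string): Promise<string> {
  const feedUrls = discoverFeeds(pageHtml, pageUrl); // scans <link> alternates for RSS/Atom/RDF
  if (feedUrls.length === 0) return "";
  // paginateFeed above reads the parsed feed from result.data, so we do the same here.
  const result = await fetchFeed(feedUrls[0], { timeout: 10_000 });
  return feedToMarkdown(result.data, { maxItems: 10 });
}

// Walk an archived feed page by page via rel="next" links.
async function collectArchive(feedUrl: string) {
  const items: unknown[] = [];
  for await (const page of paginateFeed(feedUrl, { maxPages: 5 })) {
    items.push(...page.items);
  }
  return items;
}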
@@ -827,17 +923,39 @@ exports.ContentExtractor = ContentExtractor;
  exports.DEFAULT_TIMEOUT = DEFAULT_TIMEOUT;
  exports.DEFAULT_USER_AGENT = DEFAULT_USER_AGENT;
  exports.FaviconExtractor = FaviconExtractor;
+ exports.InMemoryEmbeddingCache = require_embeddings.InMemoryEmbeddingCache;
  exports.JsonLdExtractor = JsonLdExtractor;
  exports.LinksExtractor = LinksExtractor;
  exports.MetaExtractor = MetaExtractor;
  exports.NativeFetcher = NativeFetcher;
- exports.ScrapeError = require_enhancer.ScrapeError;
- exports.__toESM = __toESM;
+ exports.RSSParser = require_parsers.RSSParser;
+ exports.ScrapeError = require_http_base.ScrapeError;
+ exports.TRANSFORMERS_MODELS = require_embeddings.TRANSFORMERS_MODELS;
+ exports.aggregateVectors = require_embeddings.aggregateVectors;
  exports.checkRobotsTxt = checkRobotsTxt;
+ exports.chunkText = require_embeddings.chunkText;
+ exports.cosineSimilarity = require_embeddings.cosineSimilarity;
+ exports.createAzureEmbedding = require_embeddings.createAzureEmbedding;
  exports.createDefaultExtractors = createDefaultExtractors;
+ exports.createEmbeddingProvider = require_embeddings.createEmbeddingProvider;
  exports.createExtractionContext = createExtractionContext;
+ exports.createHttpEmbedding = require_embeddings.createHttpEmbedding;
+ exports.createHuggingFaceEmbedding = require_embeddings.createHuggingFaceEmbedding;
+ exports.createOllamaEmbedding = require_embeddings.createOllamaEmbedding;
+ exports.createOpenAIEmbedding = require_embeddings.createOpenAIEmbedding;
+ exports.createPiiRedactor = require_embeddings.createPiiRedactor;
+ exports.createTransformersEmbedding = require_embeddings.createTransformersEmbedding;
  exports.defaultFetcher = defaultFetcher;
+ exports.discoverFeeds = discoverFeeds;
+ exports.embed = require_embeddings.embed;
+ exports.embedScrapedData = require_embeddings.embedScrapedData;
+ exports.estimateTokens = require_embeddings.estimateTokens;
  exports.extractDomain = extractDomain;
+ exports.feedToMarkdown = feedToMarkdown;
+ exports.feedToText = feedToText;
+ exports.fetchFeed = fetchFeed;
+ exports.filterByDate = filterByDate;
+ exports.generateEmbeddings = require_embeddings.generateEmbeddings;
  exports.getPath = getPath;
  exports.getProtocol = getProtocol;
  exports.isExternalUrl = isExternalUrl;
@@ -845,6 +963,8 @@ exports.isValidUrl = isValidUrl;
  exports.matchesUrlPattern = matchesUrlPattern;
  exports.mergeResults = mergeResults;
  exports.normalizeUrl = normalizeUrl;
+ exports.paginateFeed = paginateFeed;
+ exports.redactPii = require_embeddings.redactPii;
  exports.resolveUrl = resolveUrl;
  exports.scrape = scrape;
  exports.scrapeHtml = scrapeHtml;
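
Editor note: end to end, the new surface in this entry point is twofold. First, scrape() and scrapeHtml() now populate result.embeddings when options.embeddings is provided (the value is forwarded as-is to generateEmbeddings, and scrapeTimeMs is computed after that step, so it includes embedding time). Second, the embeddings and RSS helpers are re-exported from the main module alongside the existing extractors. A hedged TypeScript sketch of the embeddings hook; the configuration object is deliberately left as a placeholder because its shape lives in the new embeddings typings rather than in this diff, and the root "scrapex" import path is likewise an assumption:

import { scrape } from "scrapex";

// Placeholder: this diff only shows that options.embeddings is forwarded to
// generateEmbeddings(result, options.embeddings); its concrete fields are not shown here.
declare const embeddingsConfig: any;

async function main() {
  const result = await scrape("https://example.com/article", {
    respectRobots: true,
    embeddings: embeddingsConfig,
  });
  // With options.embeddings set, the result carries an embeddings field,
  // and scrapeTimeMs includes the time spent generating it.
  console.log(result.url, result.scrapeTimeMs, result.embeddings);
}

main().catch(console.error);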