scrapex 0.5.3 → 1.0.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +551 -145
- package/dist/enhancer-ByjRD-t5.mjs +769 -0
- package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
- package/dist/enhancer-j0xqKDJm.cjs +847 -0
- package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
- package/dist/index-CDgcRnig.d.cts +268 -0
- package/dist/index-CDgcRnig.d.cts.map +1 -0
- package/dist/index-piS5wtki.d.mts +268 -0
- package/dist/index-piS5wtki.d.mts.map +1 -0
- package/dist/index.cjs +2007 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +580 -0
- package/dist/index.d.cts.map +1 -0
- package/dist/index.d.mts +580 -0
- package/dist/index.d.mts.map +1 -0
- package/dist/index.mjs +1956 -0
- package/dist/index.mjs.map +1 -0
- package/dist/llm/index.cjs +334 -0
- package/dist/llm/index.cjs.map +1 -0
- package/dist/llm/index.d.cts +258 -0
- package/dist/llm/index.d.cts.map +1 -0
- package/dist/llm/index.d.mts +258 -0
- package/dist/llm/index.d.mts.map +1 -0
- package/dist/llm/index.mjs +317 -0
- package/dist/llm/index.mjs.map +1 -0
- package/dist/parsers/index.cjs +11 -0
- package/dist/parsers/index.d.cts +2 -0
- package/dist/parsers/index.d.mts +2 -0
- package/dist/parsers/index.mjs +3 -0
- package/dist/parsers-Bneuws8x.cjs +569 -0
- package/dist/parsers-Bneuws8x.cjs.map +1 -0
- package/dist/parsers-CwkYnyWY.mjs +482 -0
- package/dist/parsers-CwkYnyWY.mjs.map +1 -0
- package/dist/types-CadAXrme.d.mts +674 -0
- package/dist/types-CadAXrme.d.mts.map +1 -0
- package/dist/types-DPEtPihB.d.cts +674 -0
- package/dist/types-DPEtPihB.d.cts.map +1 -0
- package/package.json +79 -100
- package/dist/index.d.ts +0 -45
- package/dist/index.js +0 -8
- package/dist/scrapex.cjs.development.js +0 -1130
- package/dist/scrapex.cjs.development.js.map +0 -1
- package/dist/scrapex.cjs.production.min.js +0 -2
- package/dist/scrapex.cjs.production.min.js.map +0 -1
- package/dist/scrapex.esm.js +0 -1122
- package/dist/scrapex.esm.js.map +0 -1
package/package.json
CHANGED
|
@@ -1,119 +1,98 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "scrapex",
|
|
3
|
-
"version": "0.5.3",
|
|
4
|
-
"
|
|
5
|
-
"
|
|
6
|
-
"
|
|
3
|
+
"version": "1.0.0-beta.1",
|
|
4
|
+
"description": "Modern web scraper with LLM-enhanced extraction, extensible pipeline, and pluggable parsers",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"exports": {
|
|
7
|
+
".": {
|
|
8
|
+
"types": "./dist/index.d.mts",
|
|
9
|
+
"import": "./dist/index.mjs",
|
|
10
|
+
"require": "./dist/index.cjs"
|
|
11
|
+
},
|
|
12
|
+
"./parsers": {
|
|
13
|
+
"types": "./dist/parsers/index.d.mts",
|
|
14
|
+
"import": "./dist/parsers/index.mjs",
|
|
15
|
+
"require": "./dist/parsers/index.cjs"
|
|
16
|
+
},
|
|
17
|
+
"./llm": {
|
|
18
|
+
"types": "./dist/llm/index.d.mts",
|
|
19
|
+
"import": "./dist/llm/index.mjs",
|
|
20
|
+
"require": "./dist/llm/index.cjs"
|
|
21
|
+
},
|
|
22
|
+
"./embeddings": {
|
|
23
|
+
"types": "./dist/embeddings/index.d.mts",
|
|
24
|
+
"import": "./dist/embeddings/index.mjs",
|
|
25
|
+
"require": "./dist/embeddings/index.cjs"
|
|
26
|
+
}
|
|
27
|
+
},
|
|
28
|
+
"main": "./dist/index.cjs",
|
|
29
|
+
"module": "./dist/index.mjs",
|
|
30
|
+
"types": "./dist/index.d.mts",
|
|
7
31
|
"files": [
|
|
8
32
|
"dist"
|
|
9
33
|
],
|
|
10
34
|
"scripts": {
|
|
11
|
-
"
|
|
12
|
-
"build": "
|
|
13
|
-
"test": "
|
|
14
|
-
"test:watch": "
|
|
15
|
-
"
|
|
16
|
-
"
|
|
35
|
+
"dev": "tsdown --watch",
|
|
36
|
+
"build": "tsdown",
|
|
37
|
+
"test": "vitest run",
|
|
38
|
+
"test:watch": "vitest",
|
|
39
|
+
"test:coverage": "vitest run --coverage",
|
|
40
|
+
"type-check": "tsc --noEmit",
|
|
41
|
+
"lint": "biome lint ./src",
|
|
42
|
+
"lint:fix": "biome lint --write ./src",
|
|
43
|
+
"format": "biome format ./src",
|
|
44
|
+
"format:fix": "biome format --write ./src",
|
|
45
|
+
"check": "biome check ./src",
|
|
46
|
+
"check:fix": "biome check --write ./src",
|
|
47
|
+
"prepublishOnly": "npm run build"
|
|
17
48
|
},
|
|
18
49
|
"repository": {
|
|
19
50
|
"type": "git",
|
|
20
51
|
"url": "https://github.com/developer-rakeshpaul/scrapex"
|
|
21
52
|
},
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
|
|
53
|
+
"keywords": [
|
|
54
|
+
"scraper",
|
|
55
|
+
"web-scraping",
|
|
56
|
+
"metadata",
|
|
57
|
+
"llm",
|
|
58
|
+
"extraction",
|
|
59
|
+
"readability",
|
|
60
|
+
"markdown",
|
|
61
|
+
"parser",
|
|
62
|
+
"embeddings",
|
|
63
|
+
"vector-search"
|
|
64
|
+
],
|
|
65
|
+
"author": "Rakesh Paul <https://binaryroute.com/authors/rk-paul/>",
|
|
31
66
|
"license": "MIT",
|
|
32
|
-
"
|
|
33
|
-
"
|
|
34
|
-
"semi": true,
|
|
35
|
-
"singleQuote": true,
|
|
36
|
-
"trailingComma": "es5"
|
|
67
|
+
"engines": {
|
|
68
|
+
"node": ">=20"
|
|
37
69
|
},
|
|
38
70
|
"dependencies": {
|
|
39
|
-
"@
|
|
40
|
-
"
|
|
41
|
-
"
|
|
42
|
-
"
|
|
43
|
-
"
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"lodash.get": "^4.4.2",
|
|
48
|
-
"lodash.uniq": "^4.5.0",
|
|
49
|
-
"metascraper": "^5.24.6",
|
|
50
|
-
"metascraper-amazon": "^5.24.6",
|
|
51
|
-
"metascraper-audio": "^5.24.6",
|
|
52
|
-
"metascraper-author": "^5.24.6",
|
|
53
|
-
"metascraper-clearbit": "^5.24.6",
|
|
54
|
-
"metascraper-date": "^5.24.6",
|
|
55
|
-
"metascraper-description": "^5.24.6",
|
|
56
|
-
"metascraper-iframe": "^5.24.6",
|
|
57
|
-
"metascraper-image": "^5.24.6",
|
|
58
|
-
"metascraper-lang": "^5.24.6",
|
|
59
|
-
"metascraper-logo": "^5.24.6",
|
|
60
|
-
"metascraper-logo-favicon": "^5.24.6",
|
|
61
|
-
"metascraper-media-provider": "^5.24.6",
|
|
62
|
-
"metascraper-publisher": "^5.24.6",
|
|
63
|
-
"metascraper-readability": "^5.24.6",
|
|
64
|
-
"metascraper-soundcloud": "^5.24.6",
|
|
65
|
-
"metascraper-spotify": "^5.24.6",
|
|
66
|
-
"metascraper-title": "^5.24.6",
|
|
67
|
-
"metascraper-url": "^5.24.6",
|
|
68
|
-
"metascraper-video": "^5.24.6",
|
|
69
|
-
"metascraper-youtube": "^5.24.6",
|
|
70
|
-
"node-fetch": "^3.0.0",
|
|
71
|
-
"page-metadata-parser": "^1.1.4",
|
|
72
|
-
"robots-parser": "^2.3.0",
|
|
73
|
-
"sanitize-html": "^2.5.0",
|
|
74
|
-
"valid-url": "^1.0.9"
|
|
71
|
+
"@mozilla/readability": "^0.6.0",
|
|
72
|
+
"cheerio": "^1.1.2",
|
|
73
|
+
"jsdom": "^27.4.0",
|
|
74
|
+
"mdast-util-from-markdown": "^2.0.2",
|
|
75
|
+
"mdast-util-to-string": "^4.0.0",
|
|
76
|
+
"turndown": "^7.2.2",
|
|
77
|
+
"unist-util-visit": "^5.0.0",
|
|
78
|
+
"zod": "^4.3.4"
|
|
75
79
|
},
|
|
76
80
|
"devDependencies": {
|
|
77
|
-
"@
|
|
78
|
-
"@types/
|
|
79
|
-
"@types/
|
|
80
|
-
"@types/
|
|
81
|
-
"@types/
|
|
82
|
-
"
|
|
83
|
-
"
|
|
84
|
-
"
|
|
85
|
-
"@types/metascraper-author": "^5.14.0",
|
|
86
|
-
"@types/metascraper-clearbit": "^5.14.1",
|
|
87
|
-
"@types/metascraper-date": "^5.14.0",
|
|
88
|
-
"@types/metascraper-description": "^5.14.1",
|
|
89
|
-
"@types/metascraper-image": "^5.14.0",
|
|
90
|
-
"@types/metascraper-lang": "^5.14.0",
|
|
91
|
-
"@types/metascraper-logo": "^5.14.0",
|
|
92
|
-
"@types/metascraper-logo-favicon": "^5.14.1",
|
|
93
|
-
"@types/metascraper-media-provider": "^5.14.1",
|
|
94
|
-
"@types/metascraper-publisher": "^5.14.0",
|
|
95
|
-
"@types/metascraper-readability": "^5.14.0",
|
|
96
|
-
"@types/metascraper-soundcloud": "^5.14.0",
|
|
97
|
-
"@types/metascraper-spotify": "^5.14.0",
|
|
98
|
-
"@types/metascraper-title": "^5.14.0",
|
|
99
|
-
"@types/metascraper-url": "^5.14.0",
|
|
100
|
-
"@types/metascraper-video": "^5.14.0",
|
|
101
|
-
"@types/metascraper-youtube": "^5.14.0",
|
|
102
|
-
"@types/mozilla-readability": "^0.2.1",
|
|
103
|
-
"@types/node": "^16.9.1",
|
|
104
|
-
"@types/sanitize-html": "^2.3.2",
|
|
105
|
-
"@types/valid-url": "^1.0.3",
|
|
106
|
-
"husky": "^7.0.2",
|
|
107
|
-
"jest-extended": "^0.11.5",
|
|
108
|
-
"prettier": "^2.4.0",
|
|
109
|
-
"pretty-quick": "^3.1.1",
|
|
110
|
-
"tsdx": "^0.14.1",
|
|
111
|
-
"tslib": "^2.3.1",
|
|
112
|
-
"typescript": "^4.4.3"
|
|
81
|
+
"@biomejs/biome": "^2.3.10",
|
|
82
|
+
"@types/jsdom": "^27.0.0",
|
|
83
|
+
"@types/mdast": "^4.0.4",
|
|
84
|
+
"@types/node": "^22.10.0",
|
|
85
|
+
"@types/turndown": "^5.0.6",
|
|
86
|
+
"tsdown": "^0.18.4",
|
|
87
|
+
"typescript": "^5.9.3",
|
|
88
|
+
"vitest": "^4.0.16"
|
|
113
89
|
},
|
|
114
|
-
"
|
|
115
|
-
"
|
|
116
|
-
|
|
117
|
-
|
|
90
|
+
"peerDependencies": {
|
|
91
|
+
"puppeteer": "^24.34.0"
|
|
92
|
+
},
|
|
93
|
+
"peerDependenciesMeta": {
|
|
94
|
+
"puppeteer": {
|
|
95
|
+
"optional": true
|
|
96
|
+
}
|
|
118
97
|
}
|
|
119
98
|
}
|
package/dist/index.d.ts
DELETED
|
@@ -1,45 +0,0 @@
|
|
|
1
|
-
/// <reference types="cheerio" />
|
|
2
|
-
import { IOptions as SanitizeHtmlOptions } from 'sanitize-html';
|
|
3
|
-
export interface ILink {
|
|
4
|
-
text?: string;
|
|
5
|
-
href?: string;
|
|
6
|
-
}
|
|
7
|
-
export interface IMetadata {
|
|
8
|
-
url: string;
|
|
9
|
-
date?: string;
|
|
10
|
-
image?: string;
|
|
11
|
-
publisher?: string;
|
|
12
|
-
title?: string;
|
|
13
|
-
author?: string;
|
|
14
|
-
description?: string;
|
|
15
|
-
audio?: string;
|
|
16
|
-
logo?: string;
|
|
17
|
-
lang?: string;
|
|
18
|
-
text?: string;
|
|
19
|
-
favicon?: string;
|
|
20
|
-
tags: Array<string>;
|
|
21
|
-
keywords: Array<string>;
|
|
22
|
-
links?: ILink[];
|
|
23
|
-
content?: string;
|
|
24
|
-
html?: string;
|
|
25
|
-
source: string;
|
|
26
|
-
video?: string;
|
|
27
|
-
code?: string[];
|
|
28
|
-
embeds?: Array<Record<string, string | undefined>>;
|
|
29
|
-
twitter: Record<string, string | undefined>;
|
|
30
|
-
}
|
|
31
|
-
export declare function getEmbedAttrs(el: cheerio.TagElement): {
|
|
32
|
-
src: string;
|
|
33
|
-
height: string;
|
|
34
|
-
width: string;
|
|
35
|
-
title: string;
|
|
36
|
-
};
|
|
37
|
-
declare type MetaScraperRules = 'audio' | 'amazon' | 'iframe' | 'media-provider' | 'soundcloud' | 'uol' | 'spotify' | 'video' | 'youtube';
|
|
38
|
-
export declare type ScrapeOptions = {
|
|
39
|
-
timeout?: number;
|
|
40
|
-
metascraperRules?: Array<MetaScraperRules>;
|
|
41
|
-
sanitizeOptions?: SanitizeHtmlOptions;
|
|
42
|
-
};
|
|
43
|
-
export declare const scrape: (url: string, options?: ScrapeOptions | undefined) => Promise<IMetadata | null>;
|
|
44
|
-
export declare const scrapeHtml: (url: string, html: string, options?: ScrapeOptions | undefined) => Promise<IMetadata | null>;
|
|
45
|
-
export {};
|