scrapex 0.5.3 → 1.0.0-beta.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/LICENSE +1 -1
  2. package/README.md +551 -145
  3. package/dist/enhancer-ByjRD-t5.mjs +769 -0
  4. package/dist/enhancer-ByjRD-t5.mjs.map +1 -0
  5. package/dist/enhancer-j0xqKDJm.cjs +847 -0
  6. package/dist/enhancer-j0xqKDJm.cjs.map +1 -0
  7. package/dist/index-CDgcRnig.d.cts +268 -0
  8. package/dist/index-CDgcRnig.d.cts.map +1 -0
  9. package/dist/index-piS5wtki.d.mts +268 -0
  10. package/dist/index-piS5wtki.d.mts.map +1 -0
  11. package/dist/index.cjs +2007 -0
  12. package/dist/index.cjs.map +1 -0
  13. package/dist/index.d.cts +580 -0
  14. package/dist/index.d.cts.map +1 -0
  15. package/dist/index.d.mts +580 -0
  16. package/dist/index.d.mts.map +1 -0
  17. package/dist/index.mjs +1956 -0
  18. package/dist/index.mjs.map +1 -0
  19. package/dist/llm/index.cjs +334 -0
  20. package/dist/llm/index.cjs.map +1 -0
  21. package/dist/llm/index.d.cts +258 -0
  22. package/dist/llm/index.d.cts.map +1 -0
  23. package/dist/llm/index.d.mts +258 -0
  24. package/dist/llm/index.d.mts.map +1 -0
  25. package/dist/llm/index.mjs +317 -0
  26. package/dist/llm/index.mjs.map +1 -0
  27. package/dist/parsers/index.cjs +11 -0
  28. package/dist/parsers/index.d.cts +2 -0
  29. package/dist/parsers/index.d.mts +2 -0
  30. package/dist/parsers/index.mjs +3 -0
  31. package/dist/parsers-Bneuws8x.cjs +569 -0
  32. package/dist/parsers-Bneuws8x.cjs.map +1 -0
  33. package/dist/parsers-CwkYnyWY.mjs +482 -0
  34. package/dist/parsers-CwkYnyWY.mjs.map +1 -0
  35. package/dist/types-CadAXrme.d.mts +674 -0
  36. package/dist/types-CadAXrme.d.mts.map +1 -0
  37. package/dist/types-DPEtPihB.d.cts +674 -0
  38. package/dist/types-DPEtPihB.d.cts.map +1 -0
  39. package/package.json +79 -100
  40. package/dist/index.d.ts +0 -45
  41. package/dist/index.js +0 -8
  42. package/dist/scrapex.cjs.development.js +0 -1130
  43. package/dist/scrapex.cjs.development.js.map +0 -1
  44. package/dist/scrapex.cjs.production.min.js +0 -2
  45. package/dist/scrapex.cjs.production.min.js.map +0 -1
  46. package/dist/scrapex.esm.js +0 -1122
  47. package/dist/scrapex.esm.js.map +0 -1
package/package.json CHANGED
@@ -1,119 +1,98 @@
1
1
  {
2
2
  "name": "scrapex",
3
- "version": "0.5.3",
4
- "main": "dist/index.js",
5
- "module": "dist/scrapex.esm.js",
6
- "typings": "dist/index.d.ts",
3
+ "version": "1.0.0-beta.1",
4
+ "description": "Modern web scraper with LLM-enhanced extraction, extensible pipeline, and pluggable parsers",
5
+ "type": "module",
6
+ "exports": {
7
+ ".": {
8
+ "types": "./dist/index.d.mts",
9
+ "import": "./dist/index.mjs",
10
+ "require": "./dist/index.cjs"
11
+ },
12
+ "./parsers": {
13
+ "types": "./dist/parsers/index.d.mts",
14
+ "import": "./dist/parsers/index.mjs",
15
+ "require": "./dist/parsers/index.cjs"
16
+ },
17
+ "./llm": {
18
+ "types": "./dist/llm/index.d.mts",
19
+ "import": "./dist/llm/index.mjs",
20
+ "require": "./dist/llm/index.cjs"
21
+ },
22
+ "./embeddings": {
23
+ "types": "./dist/embeddings/index.d.mts",
24
+ "import": "./dist/embeddings/index.mjs",
25
+ "require": "./dist/embeddings/index.cjs"
26
+ }
27
+ },
28
+ "main": "./dist/index.cjs",
29
+ "module": "./dist/index.mjs",
30
+ "types": "./dist/index.d.mts",
7
31
  "files": [
8
32
  "dist"
9
33
  ],
10
34
  "scripts": {
11
- "start": "tsdx watch",
12
- "build": "tsdx build",
13
- "test": "tsdx test",
14
- "test:watch": "npm run test -- --watch",
15
- "prepare": "npm run build",
16
- "release": "npx np"
35
+ "dev": "tsdown --watch",
36
+ "build": "tsdown",
37
+ "test": "vitest run",
38
+ "test:watch": "vitest",
39
+ "test:coverage": "vitest run --coverage",
40
+ "type-check": "tsc --noEmit",
41
+ "lint": "biome lint ./src",
42
+ "lint:fix": "biome lint --write ./src",
43
+ "format": "biome format ./src",
44
+ "format:fix": "biome format --write ./src",
45
+ "check": "biome check ./src",
46
+ "check:fix": "biome check --write ./src",
47
+ "prepublishOnly": "npm run build"
17
48
  },
18
49
  "repository": {
19
50
  "type": "git",
20
51
  "url": "https://github.com/developer-rakeshpaul/scrapex"
21
52
  },
22
- "husky": {
23
- "hooks": {
24
- "pre-commit": "pretty-quick --staged"
25
- }
26
- },
27
- "np": {
28
- "yarn": false,
29
- "contents": "dist"
30
- },
53
+ "keywords": [
54
+ "scraper",
55
+ "web-scraping",
56
+ "metadata",
57
+ "llm",
58
+ "extraction",
59
+ "readability",
60
+ "markdown",
61
+ "parser",
62
+ "embeddings",
63
+ "vector-search"
64
+ ],
65
+ "author": "Rakesh Paul <https://binaryroute.com/authors/rk-paul/>",
31
66
  "license": "MIT",
32
- "prettier": {
33
- "printWidth": 80,
34
- "semi": true,
35
- "singleQuote": true,
36
- "trailingComma": "es5"
67
+ "engines": {
68
+ "node": ">=20"
37
69
  },
38
70
  "dependencies": {
39
- "@metascraper/helpers": "^5.24.6",
40
- "@mozilla/readability": "^0.4.1",
41
- "@types/got": "^9.6.12",
42
- "agentkeepalive": "^4.1.4",
43
- "cheerio": "^1.0.0-rc.10",
44
- "domino": "^2.1.6",
45
- "got": "^11.8.2",
46
- "jsdom": "^16.7.0",
47
- "lodash.get": "^4.4.2",
48
- "lodash.uniq": "^4.5.0",
49
- "metascraper": "^5.24.6",
50
- "metascraper-amazon": "^5.24.6",
51
- "metascraper-audio": "^5.24.6",
52
- "metascraper-author": "^5.24.6",
53
- "metascraper-clearbit": "^5.24.6",
54
- "metascraper-date": "^5.24.6",
55
- "metascraper-description": "^5.24.6",
56
- "metascraper-iframe": "^5.24.6",
57
- "metascraper-image": "^5.24.6",
58
- "metascraper-lang": "^5.24.6",
59
- "metascraper-logo": "^5.24.6",
60
- "metascraper-logo-favicon": "^5.24.6",
61
- "metascraper-media-provider": "^5.24.6",
62
- "metascraper-publisher": "^5.24.6",
63
- "metascraper-readability": "^5.24.6",
64
- "metascraper-soundcloud": "^5.24.6",
65
- "metascraper-spotify": "^5.24.6",
66
- "metascraper-title": "^5.24.6",
67
- "metascraper-url": "^5.24.6",
68
- "metascraper-video": "^5.24.6",
69
- "metascraper-youtube": "^5.24.6",
70
- "node-fetch": "^3.0.0",
71
- "page-metadata-parser": "^1.1.4",
72
- "robots-parser": "^2.3.0",
73
- "sanitize-html": "^2.5.0",
74
- "valid-url": "^1.0.9"
71
+ "@mozilla/readability": "^0.6.0",
72
+ "cheerio": "^1.1.2",
73
+ "jsdom": "^27.4.0",
74
+ "mdast-util-from-markdown": "^2.0.2",
75
+ "mdast-util-to-string": "^4.0.0",
76
+ "turndown": "^7.2.2",
77
+ "unist-util-visit": "^5.0.0",
78
+ "zod": "^4.3.4"
75
79
  },
76
80
  "devDependencies": {
77
- "@types/cheerio": "^0.22.30",
78
- "@types/jest": "^27.0.1",
79
- "@types/jsdom": "^16.2.13",
80
- "@types/lodash.get": "^4.4.6",
81
- "@types/lodash.uniq": "^4.5.6",
82
- "@types/metascraper": "^5.14.1",
83
- "@types/metascraper-amazon": "^5.14.0",
84
- "@types/metascraper-audio": "^5.14.0",
85
- "@types/metascraper-author": "^5.14.0",
86
- "@types/metascraper-clearbit": "^5.14.1",
87
- "@types/metascraper-date": "^5.14.0",
88
- "@types/metascraper-description": "^5.14.1",
89
- "@types/metascraper-image": "^5.14.0",
90
- "@types/metascraper-lang": "^5.14.0",
91
- "@types/metascraper-logo": "^5.14.0",
92
- "@types/metascraper-logo-favicon": "^5.14.1",
93
- "@types/metascraper-media-provider": "^5.14.1",
94
- "@types/metascraper-publisher": "^5.14.0",
95
- "@types/metascraper-readability": "^5.14.0",
96
- "@types/metascraper-soundcloud": "^5.14.0",
97
- "@types/metascraper-spotify": "^5.14.0",
98
- "@types/metascraper-title": "^5.14.0",
99
- "@types/metascraper-url": "^5.14.0",
100
- "@types/metascraper-video": "^5.14.0",
101
- "@types/metascraper-youtube": "^5.14.0",
102
- "@types/mozilla-readability": "^0.2.1",
103
- "@types/node": "^16.9.1",
104
- "@types/sanitize-html": "^2.3.2",
105
- "@types/valid-url": "^1.0.3",
106
- "husky": "^7.0.2",
107
- "jest-extended": "^0.11.5",
108
- "prettier": "^2.4.0",
109
- "pretty-quick": "^3.1.1",
110
- "tsdx": "^0.14.1",
111
- "tslib": "^2.3.1",
112
- "typescript": "^4.4.3"
81
+ "@biomejs/biome": "^2.3.10",
82
+ "@types/jsdom": "^27.0.0",
83
+ "@types/mdast": "^4.0.4",
84
+ "@types/node": "^22.10.0",
85
+ "@types/turndown": "^5.0.6",
86
+ "tsdown": "^0.18.4",
87
+ "typescript": "^5.9.3",
88
+ "vitest": "^4.0.16"
113
89
  },
114
- "jest": {
115
- "setupFilesAfterEnv": [
116
- "jest-extended"
117
- ]
90
+ "peerDependencies": {
91
+ "puppeteer": "^24.34.0"
92
+ },
93
+ "peerDependenciesMeta": {
94
+ "puppeteer": {
95
+ "optional": true
96
+ }
118
97
  }
119
98
  }
package/dist/index.d.ts DELETED
@@ -1,45 +0,0 @@
1
- /// <reference types="cheerio" />
2
- import { IOptions as SanitizeHtmlOptions } from 'sanitize-html';
3
- export interface ILink {
4
- text?: string;
5
- href?: string;
6
- }
7
- export interface IMetadata {
8
- url: string;
9
- date?: string;
10
- image?: string;
11
- publisher?: string;
12
- title?: string;
13
- author?: string;
14
- description?: string;
15
- audio?: string;
16
- logo?: string;
17
- lang?: string;
18
- text?: string;
19
- favicon?: string;
20
- tags: Array<string>;
21
- keywords: Array<string>;
22
- links?: ILink[];
23
- content?: string;
24
- html?: string;
25
- source: string;
26
- video?: string;
27
- code?: string[];
28
- embeds?: Array<Record<string, string | undefined>>;
29
- twitter: Record<string, string | undefined>;
30
- }
31
- export declare function getEmbedAttrs(el: cheerio.TagElement): {
32
- src: string;
33
- height: string;
34
- width: string;
35
- title: string;
36
- };
37
- declare type MetaScraperRules = 'audio' | 'amazon' | 'iframe' | 'media-provider' | 'soundcloud' | 'uol' | 'spotify' | 'video' | 'youtube';
38
- export declare type ScrapeOptions = {
39
- timeout?: number;
40
- metascraperRules?: Array<MetaScraperRules>;
41
- sanitizeOptions?: SanitizeHtmlOptions;
42
- };
43
- export declare const scrape: (url: string, options?: ScrapeOptions | undefined) => Promise<IMetadata | null>;
44
- export declare const scrapeHtml: (url: string, html: string, options?: ScrapeOptions | undefined) => Promise<IMetadata | null>;
45
- export {};
package/dist/index.js DELETED
@@ -1,8 +0,0 @@
1
-
2
- 'use strict'
3
-
4
- if (process.env.NODE_ENV === 'production') {
5
- module.exports = require('./scrapex.cjs.production.min.js')
6
- } else {
7
- module.exports = require('./scrapex.cjs.development.js')
8
- }