scrapex 0.5.2 → 1.0.0-alpha.1

Files changed (43)
  1. package/LICENSE +1 -1
  2. package/README.md +392 -145
  3. package/dist/enhancer-Q6CSc1gA.mjs +220 -0
  4. package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
  5. package/dist/enhancer-oM4BhYYS.cjs +268 -0
  6. package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
  7. package/dist/index.cjs +852 -0
  8. package/dist/index.cjs.map +1 -0
  9. package/dist/index.d.cts +264 -0
  10. package/dist/index.d.cts.map +1 -0
  11. package/dist/index.d.mts +264 -0
  12. package/dist/index.d.mts.map +1 -0
  13. package/dist/index.mjs +798 -0
  14. package/dist/index.mjs.map +1 -0
  15. package/dist/llm/index.cjs +316 -0
  16. package/dist/llm/index.cjs.map +1 -0
  17. package/dist/llm/index.d.cts +211 -0
  18. package/dist/llm/index.d.cts.map +1 -0
  19. package/dist/llm/index.d.mts +211 -0
  20. package/dist/llm/index.d.mts.map +1 -0
  21. package/dist/llm/index.mjs +310 -0
  22. package/dist/llm/index.mjs.map +1 -0
  23. package/dist/parsers/index.cjs +200 -0
  24. package/dist/parsers/index.cjs.map +1 -0
  25. package/dist/parsers/index.d.cts +133 -0
  26. package/dist/parsers/index.d.cts.map +1 -0
  27. package/dist/parsers/index.d.mts +133 -0
  28. package/dist/parsers/index.d.mts.map +1 -0
  29. package/dist/parsers/index.mjs +192 -0
  30. package/dist/parsers/index.mjs.map +1 -0
  31. package/dist/types-CNQZVW36.d.mts +150 -0
  32. package/dist/types-CNQZVW36.d.mts.map +1 -0
  33. package/dist/types-D0HYR95H.d.cts +150 -0
  34. package/dist/types-D0HYR95H.d.cts.map +1 -0
  35. package/package.json +80 -100
  36. package/dist/index.d.ts +0 -45
  37. package/dist/index.js +0 -8
  38. package/dist/scrapex.cjs.development.js +0 -1128
  39. package/dist/scrapex.cjs.development.js.map +0 -1
  40. package/dist/scrapex.cjs.production.min.js +0 -2
  41. package/dist/scrapex.cjs.production.min.js.map +0 -1
  42. package/dist/scrapex.esm.js +0 -1120
  43. package/dist/scrapex.esm.js.map +0 -1
package/dist/types-D0HYR95H.d.cts ADDED
@@ -0,0 +1,150 @@
+ import { CheerioAPI } from "cheerio";
+
+ //#region src/core/types.d.ts
+
+ /**
+  * Content type classification for scraped URLs
+  */
+ type ContentType = 'article' | 'repo' | 'docs' | 'package' | 'video' | 'tool' | 'product' | 'unknown';
+ /**
+  * Extracted link from content
+  */
+ interface ExtractedLink {
+   url: string;
+   text: string;
+   isExternal: boolean;
+ }
+ /**
+  * Extracted entities from LLM enhancement
+  */
+ interface ExtractedEntities {
+   people: string[];
+   organizations: string[];
+   technologies: string[];
+   locations: string[];
+   concepts: string[];
+ }
+ /**
+  * Main result of metadata scraping - optimized for LLM consumption
+  */
+ interface ScrapedData {
+   url: string;
+   canonicalUrl: string;
+   domain: string;
+   title: string;
+   description: string;
+   image?: string;
+   favicon?: string;
+   content: string;
+   textContent: string;
+   excerpt: string;
+   wordCount: number;
+   author?: string;
+   publishedAt?: string;
+   modifiedAt?: string;
+   siteName?: string;
+   language?: string;
+   contentType: ContentType;
+   keywords: string[];
+   jsonLd?: Record<string, unknown>[];
+   links?: ExtractedLink[];
+   summary?: string;
+   suggestedTags?: string[];
+   entities?: ExtractedEntities;
+   extracted?: Record<string, unknown>;
+   custom?: Record<string, unknown>;
+   scrapedAt: string;
+   scrapeTimeMs: number;
+   error?: string;
+ }
+ /**
+  * LLM enhancement types
+  */
+ type EnhancementType = 'summarize' | 'tags' | 'entities' | 'classify';
+ /**
+  * Schema for structured LLM extraction
+  */
+ type ExtractionSchemaType = 'string' | 'number' | 'boolean' | 'string[]' | 'number[]' | `${string}?`;
+ type ExtractionSchema = Record<string, ExtractionSchemaType>;
+ /**
+  * Forward declaration for LLM provider (defined in llm/types.ts)
+  */
+ interface LLMProvider {
+   readonly name: string;
+   complete(prompt: string, options?: CompletionOptions): Promise<string>;
+   completeJSON<T>(prompt: string, schema: unknown): Promise<T>;
+ }
+ interface CompletionOptions {
+   maxTokens?: number;
+   temperature?: number;
+   systemPrompt?: string;
+ }
+ /**
+  * Forward declaration for Fetcher (defined in fetchers/types.ts)
+  */
+ interface Fetcher {
+   readonly name: string;
+   fetch(url: string, options: FetchOptions): Promise<FetchResult>;
+ }
+ interface FetchOptions {
+   timeout?: number;
+   userAgent?: string;
+   headers?: Record<string, string>;
+ }
+ interface FetchResult {
+   html: string;
+   finalUrl: string;
+   statusCode: number;
+   contentType: string;
+   headers?: Record<string, string>;
+ }
+ /**
+  * Forward declaration for Extractor (defined in extractors/types.ts)
+  */
+ interface Extractor {
+   readonly name: string;
+   readonly priority?: number;
+   extract(context: ExtractionContext): Promise<Partial<ScrapedData>>;
+ }
+ /**
+  * Shared context passed to all extractors
+  */
+ interface ExtractionContext {
+   url: string;
+   finalUrl: string;
+   html: string;
+   $: CheerioAPI;
+   getDocument(): Document;
+   results: Partial<ScrapedData>;
+   options: ScrapeOptions;
+ }
+ /**
+  * Options for scraping
+  */
+ interface ScrapeOptions {
+   /** Timeout in milliseconds (default: 10000) */
+   timeout?: number;
+   /** User agent string */
+   userAgent?: string;
+   /** Whether to extract full content (default: true) */
+   extractContent?: boolean;
+   /** Maximum content length in characters (default: 50000) */
+   maxContentLength?: number;
+   /** Custom fetcher (for Puppeteer/Playwright) */
+   fetcher?: Fetcher;
+   /** Custom extractors to run */
+   extractors?: Extractor[];
+   /** If true, only run custom extractors (replace defaults) */
+   replaceDefaultExtractors?: boolean;
+   /** Check robots.txt before scraping (default: false) */
+   respectRobots?: boolean;
+   /** LLM provider for enhancements */
+   llm?: LLMProvider;
+   /** LLM enhancement types to run */
+   enhance?: EnhancementType[];
+   /** Schema for structured LLM extraction */
+   extract?: ExtractionSchema;
+ }
+ //#endregion
+ export { ExtractedLink as a, ExtractionSchemaType as c, FetchResult as d, Fetcher as f, ScrapedData as h, ExtractedEntities as i, Extractor as l, ScrapeOptions as m, ContentType as n, ExtractionContext as o, LLMProvider as p, EnhancementType as r, ExtractionSchema as s, CompletionOptions as t, FetchOptions as u };
+ //# sourceMappingURL=types-D0HYR95H.d.cts.map
package/dist/types-D0HYR95H.d.cts.map ADDED
@@ -0,0 +1 @@
+ {"version":3,"file":"types-D0HYR95H.d.cts","names":[],"sources":["../src/core/types.ts"],"sourcesContent":[],"mappings":";;;;;;AAKA;AAaiB,KAbL,WAAA,GAakB,SAAA,GAAA,MAAA,GAAA,MAAA,GAAA,SAAA,GAAA,OAAA,GAAA,MAAA,GAAA,SAAA,GAAA,SAAA;AAS9B;AAWA;;AA8BW,UAlDM,aAAA,CAkDN;EAGD,GAAA,EAAA,MAAA;EAKG,IAAA,EAAA,MAAA;EACC,UAAA,EAAA,OAAA;;;AAcd;AAKA;AAQY,UA7EK,iBAAA,CA6E6B;EAK7B,MAAA,EAAA,MAAW,EAAA;EAES,aAAA,EAAA,MAAA,EAAA;EAAoB,YAAA,EAAA,MAAA,EAAA;EACG,SAAA,EAAA,MAAA,EAAA;EAAR,QAAA,EAAA,MAAA,EAAA;;AAGpD;AASA;;AAEqD,UAxFpC,WAAA,CAwFoC;EAAR,GAAA,EAAA,MAAA;EAAO,YAAA,EAAA,MAAA;EAGnC,MAAA,EAAA,MAAA;EAMA,KAAA,EAAA,MAAA;EAWA,WAAA,EAAS,MAAA;EAGP,KAAA,CAAA,EAAA,MAAA;EAAoC,OAAA,CAAA,EAAA,MAAA;EAAR,OAAA,EAAA,MAAA;EAAR,WAAA,EAAA,MAAA;EAAO,OAAA,EAAA,MAAA;EAM7B,SAAA,EAAA,MAAA;EAOZ,MAAA,CAAA,EAAA,MAAA;EAGY,WAAA,CAAA,EAAA,MAAA;EAGE,UAAA,CAAA,EAAA,MAAA;EAAR,QAAA,CAAA,EAAA,MAAA;EAGA,QAAA,CAAA,EAAA,MAAA;EAAa,WAAA,EA3GT,WA2GS;EAMP,QAAA,EAAA,MAAa,EAAA;EAclB,MAAA,CAAA,EA3HD,MA2HC,CAAA,MAAA,EAAA,OAAA,CAAA,EAAA;EAGG,KAAA,CAAA,EA3HL,aA2HK,EAAA;EASP,OAAA,CAAA,EAAA,MAAA;EAGI,aAAA,CAAA,EAAA,MAAA,EAAA;EAGA,QAAA,CAAA,EArIC,iBAqID;EAAgB,SAAA,CAAA,EApId,MAoIc,CAAA,MAAA,EAAA,OAAA,CAAA;WAjIjB;;;;;;;;KAWC,eAAA;;;;KAKA,oBAAA;KAQA,gBAAA,GAAmB,eAAe;;;;UAK7B,WAAA;;qCAEoB,oBAAoB;oDACL,QAAQ;;UAG3C,iBAAA;;;;;;;;UASA,OAAA;;8BAEa,eAAe,QAAQ;;UAGpC,YAAA;;;YAGL;;UAGK,WAAA;;;;;YAKL;;;;;UAMK,SAAA;;;mBAGE,oBAAoB,QAAQ,QAAQ;;;;;UAMtC,iBAAA;;;;KAOZ;iBAGY;WAGN,QAAQ;WAGR;;;;;UAMM,aAAA;;;;;;;;;;YAcL;;eAGG;;;;;;QASP;;YAGI;;YAGA"}
package/package.json CHANGED
@@ -1,119 +1,99 @@
  {
    "name": "scrapex",
-   "version": "0.5.2",
-   "main": "dist/index.js",
-   "module": "dist/scrapex.esm.js",
-   "typings": "dist/index.d.ts",
+   "version": "1.0.0-alpha.1",
+   "description": "Modern web scraper with LLM-enhanced extraction, extensible pipeline, and pluggable parsers",
+   "type": "module",
+   "exports": {
+     ".": {
+       "types": "./dist/index.d.mts",
+       "import": "./dist/index.mjs",
+       "require": "./dist/index.cjs"
+     },
+     "./parsers": {
+       "types": "./dist/parsers/index.d.mts",
+       "import": "./dist/parsers/index.mjs",
+       "require": "./dist/parsers/index.cjs"
+     },
+     "./llm": {
+       "types": "./dist/llm/index.d.mts",
+       "import": "./dist/llm/index.mjs",
+       "require": "./dist/llm/index.cjs"
+     }
+   },
+   "main": "./dist/index.cjs",
+   "module": "./dist/index.mjs",
+   "types": "./dist/index.d.mts",
    "files": [
      "dist"
    ],
    "scripts": {
-     "start": "tsdx watch",
-     "build": "tsdx build",
-     "test": "tsdx test",
-     "test:watch": "npm run test -- --watch",
-     "prepare": "npm run build",
-     "release": "npx np"
+     "dev": "tsdown --watch",
+     "build": "tsdown",
+     "test": "vitest run",
+     "test:watch": "vitest",
+     "test:coverage": "vitest run --coverage",
+     "type-check": "tsc --noEmit",
+     "lint": "biome lint ./src",
+     "lint:fix": "biome lint --write ./src",
+     "format": "biome format ./src",
+     "format:fix": "biome format --write ./src",
+     "check": "biome check ./src",
+     "check:fix": "biome check --write ./src",
+     "prepublishOnly": "npm run build"
    },
    "repository": {
      "type": "git",
      "url": "https://github.com/developer-rakeshpaul/scrapex"
    },
-   "husky": {
-     "hooks": {
-       "pre-commit": "pretty-quick --staged"
-     }
-   },
-   "np": {
-     "yarn": false,
-     "contents": "dist"
-   },
+   "keywords": [
+     "scraper",
+     "web-scraping",
+     "metadata",
+     "llm",
+     "extraction",
+     "readability",
+     "markdown",
+     "parser"
+   ],
+   "author": "Rakesh Paul <https://binaryroute.com/authors/rk-paul/>",
    "license": "MIT",
-   "prettier": {
-     "printWidth": 80,
-     "semi": true,
-     "singleQuote": true,
-     "trailingComma": "es5"
+   "engines": {
+     "node": ">=20"
    },
    "dependencies": {
-     "@metascraper/helpers": "^5.24.6",
-     "@mozilla/readability": "^0.4.1",
-     "@types/got": "^9.6.12",
-     "agentkeepalive": "^4.1.4",
-     "cheerio": "^1.0.0-rc.10",
-     "domino": "^2.1.6",
-     "got": "^11.8.2",
-     "jsdom": "^16.7.0",
-     "lodash.get": "^4.4.2",
-     "lodash.uniq": "^4.5.0",
-     "metascraper": "^5.24.6",
-     "metascraper-amazon": "^5.24.6",
-     "metascraper-audio": "^5.24.6",
-     "metascraper-author": "^5.24.6",
-     "metascraper-clearbit": "^5.24.6",
-     "metascraper-date": "^5.24.6",
-     "metascraper-description": "^5.24.6",
-     "metascraper-iframe": "^5.24.6",
-     "metascraper-image": "^5.24.6",
-     "metascraper-lang": "^5.24.6",
-     "metascraper-logo": "^5.24.6",
-     "metascraper-logo-favicon": "^5.24.6",
-     "metascraper-media-provider": "^5.24.6",
-     "metascraper-publisher": "^5.24.6",
-     "metascraper-readability": "^5.24.6",
-     "metascraper-soundcloud": "^5.24.6",
-     "metascraper-spotify": "^5.24.6",
-     "metascraper-title": "^5.24.6",
-     "metascraper-url": "^5.24.6",
-     "metascraper-video": "^5.24.6",
-     "metascraper-youtube": "^5.24.6",
-     "node-fetch": "^3.0.0",
-     "page-metadata-parser": "^1.1.4",
-     "robots-parser": "^2.3.0",
-     "sanitize-html": "^2.5.0",
-     "valid-url": "^1.0.9"
+     "@mozilla/readability": "^0.6.0",
+     "cheerio": "^1.1.2",
+     "jsdom": "^27.2.0",
+     "mdast-util-from-markdown": "^2.0.2",
+     "mdast-util-to-string": "^4.0.0",
+     "turndown": "^7.2.2",
+     "unist-util-visit": "^5.0.0",
+     "zod": "^4.1.13"
    },
    "devDependencies": {
-     "@types/cheerio": "^0.22.30",
-     "@types/jest": "^27.0.1",
-     "@types/jsdom": "^16.2.13",
-     "@types/lodash.get": "^4.4.6",
-     "@types/lodash.uniq": "^4.5.6",
-     "@types/metascraper": "^5.14.1",
-     "@types/metascraper-amazon": "^5.14.0",
-     "@types/metascraper-audio": "^5.14.0",
-     "@types/metascraper-author": "^5.14.0",
-     "@types/metascraper-clearbit": "^5.14.1",
-     "@types/metascraper-date": "^5.14.0",
-     "@types/metascraper-description": "^5.14.1",
-     "@types/metascraper-image": "^5.14.0",
-     "@types/metascraper-lang": "^5.14.0",
-     "@types/metascraper-logo": "^5.14.0",
-     "@types/metascraper-logo-favicon": "^5.14.1",
-     "@types/metascraper-media-provider": "^5.14.1",
-     "@types/metascraper-publisher": "^5.14.0",
-     "@types/metascraper-readability": "^5.14.0",
-     "@types/metascraper-soundcloud": "^5.14.0",
-     "@types/metascraper-spotify": "^5.14.0",
-     "@types/metascraper-title": "^5.14.0",
-     "@types/metascraper-url": "^5.14.0",
-     "@types/metascraper-video": "^5.14.0",
-     "@types/metascraper-youtube": "^5.14.0",
-     "@types/mozilla-readability": "^0.2.1",
-     "@types/node": "^16.9.1",
-     "@types/sanitize-html": "^2.3.2",
-     "@types/valid-url": "^1.0.3",
-     "husky": "^7.0.2",
-     "jest-extended": "^0.11.5",
-     "prettier": "^2.4.0",
-     "pretty-quick": "^3.1.1",
-     "tsdx": "^0.14.1",
-     "tslib": "^2.3.1",
-     "typescript": "^4.4.3"
+     "@biomejs/biome": "^2.3.8",
+     "@types/jsdom": "^27.0.0",
+     "@types/mdast": "^4.0.4",
+     "@types/node": "^22.10.0",
+     "@types/turndown": "^5.0.6",
+     "tsdown": "^0.17.0",
+     "typescript": "^5.9.3",
+     "vitest": "^4.0.15"
    },
-   "jest": {
-     "setupFilesAfterEnv": [
-       "jest-extended"
-     ]
+   "peerDependencies": {
+     "@anthropic-ai/sdk": ">=0.30.0",
+     "openai": ">=4.0.0",
+     "puppeteer": ">=23.0.0"
+   },
+   "peerDependenciesMeta": {
+     "@anthropic-ai/sdk": {
+       "optional": true
+     },
+     "openai": {
+       "optional": true
+     },
+     "puppeteer": {
+       "optional": true
+     }
    }
  }
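The new "exports" map is the functional replacement for the deleted NODE_ENV entry switch shown further down: Node now resolves `import` to the .mjs builds and `require()` to the .cjs builds, per subpath. A sketch of the three entry points it exposes; the subpaths are taken from the diff above, but the diff does not show which symbols each entry point exports, so namespace imports stand in as placeholders.

```ts
import * as scrapex from "scrapex";         // resolves to ./dist/index.mjs (ESM)
import * as parsers from "scrapex/parsers"; // resolves to ./dist/parsers/index.mjs
import * as llm from "scrapex/llm";         // resolves to ./dist/llm/index.mjs

// From CommonJS, the "require" condition applies instead:
// const scrapex = require("scrapex");      // resolves to ./dist/index.cjs
```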
package/dist/index.d.ts DELETED
@@ -1,45 +0,0 @@
- /// <reference types="cheerio" />
- import { IOptions as SanitizeHtmlOptions } from 'sanitize-html';
- export interface ILink {
-   text?: string;
-   href?: string;
- }
- export interface IMetadata {
-   url: string;
-   date?: string;
-   image?: string;
-   publisher?: string;
-   title?: string;
-   author?: string;
-   description?: string;
-   audio?: string;
-   logo?: string;
-   lang?: string;
-   text?: string;
-   favicon?: string;
-   tags: Array<string>;
-   keywords: Array<string>;
-   links?: ILink[];
-   content?: string;
-   html?: string;
-   source: string;
-   video?: string;
-   code?: string[];
-   embeds?: Array<Record<string, string | undefined>>;
-   twitter: Record<string, string | undefined>;
- }
- export declare function getEmbedAttrs(el: cheerio.TagElement): {
-   src: string;
-   height: string;
-   width: string;
-   title: string;
- };
- declare type MetaScraperRules = 'audio' | 'amazon' | 'iframe' | 'media-provider' | 'soundcloud' | 'uol' | 'spotify' | 'video' | 'youtube';
- export declare type ScrapeOptions = {
-   timeout?: number;
-   metascraperRules?: Array<MetaScraperRules>;
-   sanitizeOptions?: SanitizeHtmlOptions;
- };
- export declare const scrape: (url: string, options?: ScrapeOptions | undefined) => Promise<IMetadata | null>;
- export declare const scrapeHtml: (url: string, html: string, options?: ScrapeOptions | undefined) => Promise<IMetadata | null>;
- export {};
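For migration context, this is roughly what the deleted 0.5.2 surface looked like in use, per the declarations above. The URL and option values are illustrative, and the call assumes an async context.

```ts
import { scrape } from "scrapex"; // 0.5.2 API, removed in 1.0.0-alpha.1

const meta = await scrape("https://example.com/post", {
  timeout: 5000,
  metascraperRules: ["video", "youtube"],       // subset of MetaScraperRules
  sanitizeOptions: { allowedTags: ["p", "a"] }, // sanitize-html IOptions
});
if (meta) {
  console.log(meta.title, meta.tags, meta.twitter); // IMetadata fields
}
```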
package/dist/index.js DELETED
@@ -1,8 +0,0 @@
- 
- 'use strict'
- 
- if (process.env.NODE_ENV === 'production') {
-   module.exports = require('./scrapex.cjs.production.min.js')
- } else {
-   module.exports = require('./scrapex.cjs.development.js')
- }