@crawlee/linkedom 3.13.1-beta.35 → 3.13.1-beta.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -2,7 +2,7 @@ import type { IncomingMessage } from 'node:http';
|
|
|
2
2
|
import type { EnqueueLinksOptions, ErrorHandler, GetUserDataFromRequest, HttpCrawlerOptions, InternalHttpCrawlingContext, InternalHttpHook, RequestHandler, RequestProvider, RouterRoutes } from '@crawlee/http';
|
|
3
3
|
import { HttpCrawler } from '@crawlee/http';
|
|
4
4
|
import type { Dictionary } from '@crawlee/types';
|
|
5
|
-
import { type CheerioRoot } from '@crawlee/utils';
|
|
5
|
+
import { type CheerioRoot, type RobotsTxtFile } from '@crawlee/utils';
|
|
6
6
|
export type LinkeDOMErrorHandler<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
7
7
|
JSONData extends Dictionary = any> = ErrorHandler<LinkeDOMCrawlingContext<UserData, JSONData>>;
|
|
8
8
|
export interface LinkeDOMCrawlerOptions<UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
|
|
@@ -131,12 +131,13 @@ interface EnqueueLinksInternalOptions {
|
|
|
131
131
|
options?: LinkeDOMCrawlerEnqueueLinksOptions;
|
|
132
132
|
window: Window | null;
|
|
133
133
|
requestQueue: RequestProvider;
|
|
134
|
+
robotsTxtFile?: RobotsTxtFile;
|
|
134
135
|
originalRequestUrl: string;
|
|
135
136
|
finalRequestUrl?: string;
|
|
136
137
|
}
|
|
137
138
|
/** @internal */
|
|
138
139
|
// @ts-ignore optional peer dependency or compatibility with es2022
|
|
139
|
-
export declare function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, originalRequestUrl, finalRequestUrl, }: EnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>;
|
|
140
|
+
export declare function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, originalRequestUrl, finalRequestUrl, }: EnqueueLinksInternalOptions): Promise<import("@crawlee/types").BatchAddRequestsResult>;
|
|
140
141
|
/**
|
|
141
142
|
* Creates new {@apilink Router} instance that works based on request labels.
|
|
142
143
|
* This instance can then serve as a `requestHandler` of your {@apilink LinkeDOMCrawler}.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"linkedom-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/linkedom-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAEjD,OAAO,KAAK,EACR,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACf,MAAM,eAAe,CAAC;AACvB,OAAO,EAEH,WAAW,EAId,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAAS,MAAM,gBAAgB,CAAC;
|
|
1
|
+
{"version":3,"file":"linkedom-crawler.d.ts","sourceRoot":"","sources":["../../src/internals/linkedom-crawler.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,WAAW,CAAC;AAEjD,OAAO,KAAK,EACR,mBAAmB,EACnB,YAAY,EACZ,sBAAsB,EACtB,kBAAkB,EAClB,2BAA2B,EAC3B,gBAAgB,EAChB,cAAc,EACd,eAAe,EACf,YAAY,EACf,MAAM,eAAe,CAAC;AACvB,OAAO,EAEH,WAAW,EAId,MAAM,eAAe,CAAC;AACvB,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACjD,OAAO,EAAE,KAAK,WAAW,EAAE,KAAK,aAAa,EAAS,MAAM,gBAAgB,CAAC;AAO7E,MAAM,MAAM,oBAAoB,CAC5B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,YAAY,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAE9D,MAAM,WAAW,sBAAsB,CACnC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,kBAAkB,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;CAAG;AAE5E,MAAM,WAAW,kCAAmC,SAAQ,IAAI,CAAC,mBAAmB,EAAE,MAAM,GAAG,cAAc,CAAC;CAAG;AAEjH,MAAM,MAAM,YAAY,CACpB,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,gBAAgB,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAElE,MAAM,WAAW,uBAAuB,CACpC,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,CACnC,SAAQ,2BAA2B,CAAC,QAAQ,EAAE,QAAQ,EAAE,eAAe,CAAC;IACtE,MAAM,EAAE,MAAM,CAAC;IAMf,QAAQ,EAAE,QAAQ,CAAC;IAEnB;;;;;;;;;;;;OAYG;IACH,eAAe,CAAC,QAAQ,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC;IAErE;;;;;;;;;;;OAWG;IACH,gBAAgB,CAAC,QAAQ,CAAC,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;CACjF;AAED,MAAM,MAAM,sBAAsB,CAC9B,QAAQ,SAAS,UAAU,GAAG,GAAG,EAAE,2EAA2E;AAC9G,QAAQ,SAAS,UAAU,GAAG,GAAG,IACjC,cAAc,CAAC,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAEhE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqEG;AAEH,qBAAa,eAAgB,SAAQ,WAAW,CAAC,uBAAuB,CAAC;IACrE,OAAO,CAAC,MAAM,CAAC,MAAM,CAAmB;cAEf,UAAU,CAC/B,QAAQ,EAAE,eAAe,EACzB,KAAK,EAAE,OAAO,EACd,eAAe,EAAE,uBAAuB;;;2BAaF,QAAQ;wCAEJ,kCAAkC;;IAajE,kBAAkB,CAAC,OAAO,EAAE,uBAAuB;CA0BrE;AAED,UAAU,2BAA2B;IACjC,OAAO,CAAC,EAAE,kCAAkC,CAAC;IAC7C,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,Y
AAY,EAAE,eAAe,CAAC;IAC9B,aAAa,CAAC,EAAE,aAAa,CAAC;IAC9B,kBAAkB,EAAE,MAAM,CAAC;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED,gBAAgB;AAChB,wBAAsB,2BAA2B,CAAC,EAC9C,OAAO,EACP,MAAM,EACN,YAAY,EACZ,aAAa,EACb,kBAAkB,EAClB,eAAe,GAClB,EAAE,2BAA2B,4DAyB7B;AAmBD;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,wBAAgB,oBAAoB,CAChC,OAAO,SAAS,uBAAuB,GAAG,uBAAuB,EACjE,QAAQ,SAAS,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,EAC1E,MAAM,CAAC,EAAE,YAAY,CAAC,OAAO,EAAE,QAAQ,CAAC,kDAEzC"}
|
|
@@ -98,6 +98,7 @@ class LinkeDOMCrawler extends http_1.HttpCrawler {
|
|
|
98
98
|
options: enqueueOptions,
|
|
99
99
|
window: document.defaultView,
|
|
100
100
|
requestQueue: await this.getRequestQueue(),
|
|
101
|
+
robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
|
|
101
102
|
originalRequestUrl: crawlingContext.request.url,
|
|
102
103
|
finalRequestUrl: crawlingContext.request.loadedUrl,
|
|
103
104
|
});
|
|
@@ -134,7 +135,7 @@ Object.defineProperty(LinkeDOMCrawler, "parser", {
|
|
|
134
135
|
value: new cached_1.DOMParser()
|
|
135
136
|
});
|
|
136
137
|
/** @internal */
|
|
137
|
-
async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, originalRequestUrl, finalRequestUrl, }) {
|
|
138
|
+
async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, robotsTxtFile, originalRequestUrl, finalRequestUrl, }) {
|
|
138
139
|
if (!window) {
|
|
139
140
|
throw new Error('Cannot enqueue links because the DOM is not available.');
|
|
140
141
|
}
|
|
@@ -147,6 +148,7 @@ async function linkedomCrawlerEnqueueLinks({ options, window, requestQueue, orig
|
|
|
147
148
|
const urls = extractUrlsFromWindow(window, options?.selector ?? 'a', options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl);
|
|
148
149
|
return (0, http_1.enqueueLinks)({
|
|
149
150
|
requestQueue,
|
|
151
|
+
robotsTxtFile,
|
|
150
152
|
urls,
|
|
151
153
|
baseUrl,
|
|
152
154
|
...options,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"linkedom-crawler.js","sourceRoot":"","sources":["../../src/internals/linkedom-crawler.ts"],"names":[],"mappings":";;;
|
|
1
|
+
{"version":3,"file":"linkedom-crawler.js","sourceRoot":"","sources":["../../src/internals/linkedom-crawler.ts"],"names":[],"mappings":";;;AA2OA,kEAgCC;AA2CD,oDAKC;;AA9SD,wCAMuB;AAEvB,0CAA6E;AAC7E,yDAAmC;AACnC,mHAAmH;AACnH,4CAA4C;AAE5C,gDAAwD;AAkExD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAqEG;AAEH,MAAa,eAAgB,SAAQ,kBAAoC;IAGlD,KAAK,CAAC,UAAU,CAC/B,QAAyB,EACzB,KAAc,EACd,eAAwC;QAExC,MAAM,IAAI,GAAG,MAAM,IAAA,gCAAoB,EAAC,QAAQ,CAAC,CAAC;QAElD,MAAM,QAAQ,GAAG,eAAe,CAAC,MAAM,CAAC,eAAe,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC;QAE3G,OAAO;YACH,MAAM,EAAE,QAAQ,CAAC,WAAW;YAC5B,IAAI,IAAI;gBACJ,OAAO,QAAQ,CAAC,eAAe,CAAC,SAAS,CAAC;YAC9C,CAAC;YACD,IAAI,QAAQ;gBACR,iEAAiE;gBACjE,OAAO,QAA+B,CAAC;YAC3C,CAAC;YACD,YAAY,EAAE,KAAK,EAAE,cAAmD,EAAE,EAAE;gBACxE,OAAO,2BAA2B,CAAC;oBAC/B,OAAO,EAAE,cAAc;oBACvB,MAAM,EAAE,QAAQ,CAAC,WAAW;oBAC5B,YAAY,EAAE,MAAM,IAAI,CAAC,eAAe,EAAE;oBAC1C,aAAa,EAAE,MAAM,IAAI,CAAC,sBAAsB,CAAC,eAAe,CAAC,OAAO,CAAC,GAAG,CAAC;oBAC7E,kBAAkB,EAAE,eAAe,CAAC,OAAO,CAAC,GAAG;oBAC/C,eAAe,EAAE,eAAe,CAAC,OAAO,CAAC,SAAS;iBACrD,CAAC,CAAC;YACP,CAAC;SACJ,CAAC;IACN,CAAC;IAEQ,KAAK,CAAC,kBAAkB,CAAC,OAAgC;QAC9D,OAAO,CAAC,eAAe,GAAG,KAAK,EAAE,QAAgB,EAAE,SAAS,GAAG,IAAK,EAAE,EAAE;YACpE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAErC,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBACjC,IAAI,SAAS,EAAE,CAAC;oBACZ,MAAM,IAAA,aAAK,EAAC,EAAE,CAAC,CAAC;oBAChB,MAAM,OAAO,CAAC,eAAe,CAAC,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,SAAS,GAAG,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;oBACrE,OAAO;gBACX,CAAC;gBAED,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,cAAc,CAAC,CAAC;YACzD,CAAC;QACL,CAAC,CAAC;QACF,OAAO,CAAC,gBAAgB,GAAG,KAAK,EAAE,QAAiB,EAAE,UAAU,GAAG,IAAK,EAAE,EAAE;YACvE,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;YAErC,IAAI,QAAQ,IAAI,CAAC,CAAC,QAAQ,CAAC,CAAC,GAAG,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC7C,MAAM,IAAI,KAAK,CAAC,aAAa,QAAQ,cAAc,CAAC,CAAC;YACzD,CAAC;YAED,OAAO,CAAC,CAAC;QACb,CAAC,CAAC;QAEF,MAAM,KAAK,CAAC,kBAAkB
,CAAC,OAAO,CAAC,CAAC;IAC5C,CAAC;;AA3DL,0CA4DC;AA3DkB;;;;WAAS,IAAI,kBAAS,EAAE;GAAC;AAsE5C,gBAAgB;AACT,KAAK,UAAU,2BAA2B,CAAC,EAC9C,OAAO,EACP,MAAM,EACN,YAAY,EACZ,aAAa,EACb,kBAAkB,EAClB,eAAe,GACW;IAC1B,IAAI,CAAC,MAAM,EAAE,CAAC;QACV,MAAM,IAAI,KAAK,CAAC,wDAAwD,CAAC,CAAC;IAC9E,CAAC;IAED,MAAM,OAAO,GAAG,IAAA,6CAAsC,EAAC;QACnD,eAAe,EAAE,OAAO,EAAE,QAAQ;QAClC,eAAe;QACf,kBAAkB;QAClB,mBAAmB,EAAE,OAAO,EAAE,OAAO;KACxC,CAAC,CAAC;IAEH,MAAM,IAAI,GAAG,qBAAqB,CAC9B,MAAM,EACN,OAAO,EAAE,QAAQ,IAAI,GAAG,EACxB,OAAO,EAAE,OAAO,IAAI,eAAe,IAAI,kBAAkB,CAC5D,CAAC;IAEF,OAAO,IAAA,mBAAY,EAAC;QAChB,YAAY;QACZ,aAAa;QACb,IAAI;QACJ,OAAO;QACP,GAAG,OAAO;KACb,CAAC,CAAC;AACP,CAAC;AAED;;;GAGG;AACH,SAAS,qBAAqB,CAAC,MAAc,EAAE,QAAgB,EAAE,OAAe;IAC5E,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;SACxD,GAAG,CAAC,CAAC,CAAM,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC;SACvB,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,KAAK,EAAE,CAAC;SACnD,GAAG,CAAC,CAAC,IAAwB,EAAE,EAAE;QAC9B,IAAI,IAAI,KAAK,SAAS,EAAE,CAAC;YACrB,OAAO,SAAS,CAAC;QACrB,CAAC;QACD,OAAO,IAAA,qBAAc,EAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACzC,CAAC,CAAC;SACD,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,KAAK,EAAE,CAAa,CAAC;AACzE,CAAC;AAED;;;;;;;;;;;;;;;;;;;;;;;GAuBG;AACH,SAAgB,oBAAoB,CAGlC,MAAwC;IACtC,OAAO,aAAM,CAAC,MAAM,CAAU,MAAM,CAAC,CAAC;AAC1C,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@crawlee/linkedom",
|
|
3
|
-
"version": "3.13.1-beta.35",
|
|
3
|
+
"version": "3.13.1-beta.37",
|
|
4
4
|
"description": "The scalable web crawling and scraping library for JavaScript/Node.js. Enables development of data extraction and web automation jobs (not only) with headless Chrome and Puppeteer.",
|
|
5
5
|
"engines": {
|
|
6
6
|
"node": ">=16.0.0"
|
|
@@ -55,8 +55,8 @@
|
|
|
55
55
|
"dependencies": {
|
|
56
56
|
"@apify/timeout": "^0.3.0",
|
|
57
57
|
"@apify/utilities": "^2.7.10",
|
|
58
|
-
"@crawlee/http": "3.13.1-beta.35",
|
|
59
|
-
"@crawlee/types": "3.13.1-beta.35",
|
|
58
|
+
"@crawlee/http": "3.13.1-beta.37",
|
|
59
|
+
"@crawlee/types": "3.13.1-beta.37",
|
|
60
60
|
"linkedom": "^0.18.0",
|
|
61
61
|
"ow": "^0.28.2",
|
|
62
62
|
"tslib": "^2.4.0"
|
|
@@ -68,5 +68,5 @@
|
|
|
68
68
|
}
|
|
69
69
|
}
|
|
70
70
|
},
|
|
71
|
-
"gitHead": "
|
|
71
|
+
"gitHead": "f69267d6ef883e536245e482b7af10e39ea4008d"
|
|
72
72
|
}
|