crawlee-one 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +81 -0
  2. package/dist/cjs/cli/cli.d.ts +1 -0
  3. package/dist/cjs/cli/cli.js +61 -0
  4. package/dist/cjs/cli/cli.js.map +1 -0
  5. package/dist/cjs/cli/index.d.ts +2 -0
  6. package/dist/cjs/cli/index.js +6 -0
  7. package/dist/cjs/cli/index.js.map +1 -0
  8. package/dist/cjs/index.d.ts +24 -0
  9. package/dist/cjs/index.js +43 -0
  10. package/dist/cjs/index.js.map +1 -0
  11. package/dist/cjs/lib/actions/dom.d.ts +102 -0
  12. package/dist/cjs/lib/actions/dom.js +743 -0
  13. package/dist/cjs/lib/actions/dom.js.map +1 -0
  14. package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
  15. package/dist/cjs/lib/actions/domUtils.js +126 -0
  16. package/dist/cjs/lib/actions/domUtils.js.map +1 -0
  17. package/dist/cjs/lib/actions/page.d.ts +69 -0
  18. package/dist/cjs/lib/actions/page.js +205 -0
  19. package/dist/cjs/lib/actions/page.js.map +1 -0
  20. package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
  21. package/dist/cjs/lib/actions/scrapeListing.js +242 -0
  22. package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
  23. package/dist/cjs/lib/actor/actor.d.ts +90 -0
  24. package/dist/cjs/lib/actor/actor.js +306 -0
  25. package/dist/cjs/lib/actor/actor.js.map +1 -0
  26. package/dist/cjs/lib/actor/types.d.ts +162 -0
  27. package/dist/cjs/lib/actor/types.js +3 -0
  28. package/dist/cjs/lib/actor/types.js.map +1 -0
  29. package/dist/cjs/lib/actor.d.ts +189 -0
  30. package/dist/cjs/lib/actor.js +225 -0
  31. package/dist/cjs/lib/actor.js.map +1 -0
  32. package/dist/cjs/lib/actorSpec.d.ts +20 -0
  33. package/dist/cjs/lib/actorSpec.js +3 -0
  34. package/dist/cjs/lib/actorSpec.js.map +1 -0
  35. package/dist/cjs/lib/config.d.ts +561 -0
  36. package/dist/cjs/lib/config.js +707 -0
  37. package/dist/cjs/lib/config.js.map +1 -0
  38. package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
  39. package/dist/cjs/lib/dataset/maxCount.js +55 -0
  40. package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
  41. package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
  42. package/dist/cjs/lib/dataset/pushData.js +182 -0
  43. package/dist/cjs/lib/dataset/pushData.js.map +1 -0
  44. package/dist/cjs/lib/dataset.d.ts +98 -0
  45. package/dist/cjs/lib/dataset.js +122 -0
  46. package/dist/cjs/lib/dataset.js.map +1 -0
  47. package/dist/cjs/lib/dom.d.ts +78 -0
  48. package/dist/cjs/lib/dom.js +243 -0
  49. package/dist/cjs/lib/dom.js.map +1 -0
  50. package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
  51. package/dist/cjs/lib/error/errorHandler.js +164 -0
  52. package/dist/cjs/lib/error/errorHandler.js.map +1 -0
  53. package/dist/cjs/lib/error/sentry.d.ts +11 -0
  54. package/dist/cjs/lib/error/sentry.js +60 -0
  55. package/dist/cjs/lib/error/sentry.js.map +1 -0
  56. package/dist/cjs/lib/integrations/apify.d.ts +67 -0
  57. package/dist/cjs/lib/integrations/apify.js +106 -0
  58. package/dist/cjs/lib/integrations/apify.js.map +1 -0
  59. package/dist/cjs/lib/integrations/types.d.ts +274 -0
  60. package/dist/cjs/lib/integrations/types.js +3 -0
  61. package/dist/cjs/lib/integrations/types.js.map +1 -0
  62. package/dist/cjs/lib/io/dataset.d.ts +67 -0
  63. package/dist/cjs/lib/io/dataset.js +86 -0
  64. package/dist/cjs/lib/io/dataset.js.map +1 -0
  65. package/dist/cjs/lib/io/maxCount.d.ts +30 -0
  66. package/dist/cjs/lib/io/maxCount.js +55 -0
  67. package/dist/cjs/lib/io/maxCount.js.map +1 -0
  68. package/dist/cjs/lib/io/pushData.d.ts +124 -0
  69. package/dist/cjs/lib/io/pushData.js +193 -0
  70. package/dist/cjs/lib/io/pushData.js.map +1 -0
  71. package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
  72. package/dist/cjs/lib/io/pushRequests.js +63 -0
  73. package/dist/cjs/lib/io/pushRequests.js.map +1 -0
  74. package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
  75. package/dist/cjs/lib/io/requestQueue.js +40 -0
  76. package/dist/cjs/lib/io/requestQueue.js.map +1 -0
  77. package/dist/cjs/lib/log.d.ts +38 -0
  78. package/dist/cjs/lib/log.js +54 -0
  79. package/dist/cjs/lib/log.js.map +1 -0
  80. package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
  81. package/dist/cjs/lib/migrate/localMigrator.js +57 -0
  82. package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
  83. package/dist/cjs/lib/migrate/localState.d.ts +7 -0
  84. package/dist/cjs/lib/migrate/localState.js +43 -0
  85. package/dist/cjs/lib/migrate/localState.js.map +1 -0
  86. package/dist/cjs/lib/migrate/types.d.ts +6 -0
  87. package/dist/cjs/lib/migrate/types.js +3 -0
  88. package/dist/cjs/lib/migrate/types.js.map +1 -0
  89. package/dist/cjs/lib/readme/readme.d.ts +65 -0
  90. package/dist/cjs/lib/readme/readme.js +534 -0
  91. package/dist/cjs/lib/readme/readme.js.map +1 -0
  92. package/dist/cjs/lib/readme/types.d.ts +260 -0
  93. package/dist/cjs/lib/readme/types.js +54 -0
  94. package/dist/cjs/lib/readme/types.js.map +1 -0
  95. package/dist/cjs/lib/router.d.ts +132 -0
  96. package/dist/cjs/lib/router.js +165 -0
  97. package/dist/cjs/lib/router.js.map +1 -0
  98. package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
  99. package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
  100. package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
  101. package/dist/cjs/lib/test/actor.d.ts +21 -0
  102. package/dist/cjs/lib/test/actor.js +56 -0
  103. package/dist/cjs/lib/test/actor.js.map +1 -0
  104. package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
  105. package/dist/cjs/lib/test/mockApifyClient.js +176 -0
  106. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
  107. package/dist/cjs/types.d.ts +31 -0
  108. package/dist/cjs/types.js +3 -0
  109. package/dist/cjs/types.js.map +1 -0
  110. package/dist/cjs/utils/async.d.ts +19 -0
  111. package/dist/cjs/utils/async.js +74 -0
  112. package/dist/cjs/utils/async.js.map +1 -0
  113. package/dist/cjs/utils/error.d.ts +1 -0
  114. package/dist/cjs/utils/error.js +10 -0
  115. package/dist/cjs/utils/error.js.map +1 -0
  116. package/dist/cjs/utils/format.d.ts +9 -0
  117. package/dist/cjs/utils/format.js +19 -0
  118. package/dist/cjs/utils/format.js.map +1 -0
  119. package/dist/cjs/utils/package.d.ts +15 -0
  120. package/dist/cjs/utils/package.js +25 -0
  121. package/dist/cjs/utils/package.js.map +1 -0
  122. package/dist/cjs/utils/types.d.ts +6 -0
  123. package/dist/cjs/utils/types.js +9 -0
  124. package/dist/cjs/utils/types.js.map +1 -0
  125. package/dist/cjs/utils/url.d.ts +9 -0
  126. package/dist/cjs/utils/url.js +32 -0
  127. package/dist/cjs/utils/url.js.map +1 -0
  128. package/dist/cjs/utils/valueMonitor.d.ts +31 -0
  129. package/dist/cjs/utils/valueMonitor.js +91 -0
  130. package/dist/cjs/utils/valueMonitor.js.map +1 -0
  131. package/package.json +85 -0
@@ -0,0 +1,98 @@
1
+ import type { CrawlingContext } from 'crawlee';
2
+ export interface ActorEntryMetadata {
3
+ actorId: string | null;
4
+ actorRunId: string | null;
5
+ actorRunUrl: string | null;
6
+ contextId: string;
7
+ requestId: string | null;
8
+ /** The URL given to the crawler */
9
+ originalUrl: string | null;
10
+ /** The URL given to the crawler after possible redirects */
11
+ loadedUrl: string | null;
12
+ /** ISO datetime string that indicates the time when the request has been processed. */
13
+ dateHandled: string;
14
+ numberOfRetries: number;
15
+ }
16
+ /** Add metadata to the object */
17
+ export type WithActorEntryMetadata<T> = T & {
18
+ metadata: ActorEntryMetadata;
19
+ };
20
+ /** Function that generates a "redacted" version of a value */
21
+ export type PrivateValueGen<V, K, O> = (val: V, key: K, obj: O) => any;
22
+ /**
23
+ * Given a property value (and its position) this function
24
+ * determines if the property is considered private (and
25
+ * hence should be hidden for privacy reasons).
26
+ *
27
+ * Property is private if the function returns truthy value.
28
+ */
29
+ export type PrivacyFilter<V, K, O> = (val: V, key: K, obj: O, options?: {
30
+ setCustomPrivateValue: (val: V) => any;
31
+ privateValueGen: PrivateValueGen<V, K, O>;
32
+ }) => any;
33
+ /**
34
+ * PrivacyMask determines which (potentially nested) properties
35
+ * of an object are considered private.
36
+ *
37
+ * PrivacyMask copies the structure of another object, but each
38
+ * non-object property on PrivacyMask is a PrivacyFilter - function
39
+ * that determines if the property is considered private.
40
+ *
41
+ * Property is private if the function returns truthy value.
42
+ */
43
+ export type PrivacyMask<T extends object> = {
44
+ [Key in keyof T]?: T[Key] extends Date | any[] ? PrivacyFilter<T[Key], Key, T> : T[Key] extends object ? PrivacyMask<T[Key]> : PrivacyFilter<T[Key], Key, T>;
45
+ };
46
+ export interface PushDataOptions<T extends object> {
47
+ /**
48
+ * Whether items should be enriched with request and run metadata.
49
+ *
50
+ * If truthy, the metadata is set under the `metadata` property.
51
+ */
52
+ includeMetadata?: boolean;
53
+ /**
54
+ * Whether properties that are considered personal data should be shown as is.
55
+ *
56
+ * If falsy or not set, these properties are redacted to hide the actual information.
57
+ *
58
+ * Which properties are personal data is determined by `privacyMask`.
59
+ */
60
+ showPrivate?: boolean;
61
+ /**
62
+ * Determine which properties are considered personal data.
63
+ *
64
+ * See {@link PrivacyMask}.
65
+ **/
66
+ privacyMask: PrivacyMask<T>;
67
+ /**
68
+ * Option to select which keys (fields) of an entry to keep (discarding the rest)
69
+ * before pushing the entries to the dataset.
70
+ *
71
+ * This serves mainly to allow users to select the keys from actor input UI.
72
+ *
73
+ * This is done before `remapKeys`.
74
+ *
75
+ * Keys can be nested, e.g. `"someProp.value[0]"`. Nested path is
76
+ * resolved using Lodash.get().
77
+ */
78
+ pickKeys?: string[];
79
+ /**
80
+ * Option to remap the keys before pushing the entries to the dataset.
81
+ *
82
+ * This serves mainly to allow users to remap the keys from actor input UI.
83
+ *
84
+ * Keys can be nested, e.g. `"someProp.value[0]"`. Nested path is
85
+ * resolved using Lodash.get().
86
+ */
87
+ remapKeys?: Record<string, string>;
88
+ /** ID or name of the dataset to which the data should be pushed */
89
+ datasetIdOrName?: string;
90
+ }
91
+ /**
92
+ * `Actor.pushData` with extra features:
93
+ *
94
+ * - (Optionally) Add metadata to entries before they are pushed to dataset.
95
+ * - (Optionally) Set which (nested) properties are personal data and allow to
96
+ * redact them for privacy compliance.
97
+ */
98
+ export declare const pushData: <Ctx extends CrawlingContext<unknown, import("crawlee").Dictionary>, T extends Record<any, any> = Record<any, any>>(oneOrManyItems: T | T[], ctx: Ctx, options: PushDataOptions<T>) => Promise<Pick<T | WithActorEntryMetadata<Record<any, any>>, string>[]>;
@@ -0,0 +1,122 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.pushData = void 0;
13
+ const apify_1 = require("apify");
14
+ const lodash_1 = require("lodash");
15
+ const createMetadataMapper = (ctx) => {
16
+ const { actorId, actorRunId } = apify_1.Actor.getEnv();
17
+ const actorRunUrl = actorId != null && actorRunId != null
18
+ ? `https://console.apify.com/actors/${actorId}/runs/${actorRunId}`
19
+ : null;
20
+ const handledAt = new Date().toISOString();
21
+ const addMetadataToData = (item) => {
22
+ var _a, _b, _c;
23
+ return (Object.assign(Object.assign({}, item), { metadata: {
24
+ actorId,
25
+ actorRunId,
26
+ actorRunUrl,
27
+ contextId: ctx.id,
28
+ requestId: (_a = ctx.request.id) !== null && _a !== void 0 ? _a : null,
29
+ originalUrl: (_b = ctx.request.url) !== null && _b !== void 0 ? _b : null,
30
+ loadedUrl: (_c = ctx.request.loadedUrl) !== null && _c !== void 0 ? _c : null,
31
+ dateHandled: ctx.request.handledAt || handledAt,
32
+ numberOfRetries: ctx.request.retryCount,
33
+ } }));
34
+ };
35
+ return addMetadataToData;
36
+ };
37
+ const applyPrivacyMask = (item, options) => {
38
+ const { showPrivate, privacyMask, privateValueGen = (_, key) => `<Redacted property "${key}">`, } = options;
39
+ const resolvePrivateValue = (key, val) => {
40
+ // Allow to set custom "redacted" value by calling
41
+ // `setCustomPrivateValue` from inside the filter function.
42
+ let customPrivateValue;
43
+ let setCustomPrivateValueCalled = false;
44
+ const setCustomPrivateValue = (val) => {
45
+ customPrivateValue = val;
46
+ setCustomPrivateValueCalled = true;
47
+ };
48
+ const privacyFilter = privacyMask[key];
49
+ const isPrivate = privacyFilter
50
+ ? privacyFilter(val, key, item, { setCustomPrivateValue, privateValueGen })
51
+ : false;
52
+ // prettier-ignore
53
+ const privateValue = (
54
+ // Don't redact anything if we're asked to show private data
55
+ showPrivate ? val
56
+ // Otherwise, if custom value was given, use that
57
+ : setCustomPrivateValueCalled ? customPrivateValue
58
+ // Otherwise, decide based on filter truthiness
59
+ : isPrivate ? privateValueGen(key, val, item) : val);
60
+ return privateValue;
61
+ };
62
+ const redactedObj = Object.entries(item).reduce((agg, [key, val]) => {
63
+ var _a;
64
+ const isNestedObj = typeof val === 'object' && val != null && !(val instanceof Date) && !Array.isArray(val);
65
+ if (isNestedObj) {
66
+ // Recursively process nested objects
67
+ const subObj = applyPrivacyMask(val, {
68
+ showPrivate,
69
+ privacyMask: ((_a = privacyMask[key]) !== null && _a !== void 0 ? _a : {}),
70
+ privateValueGen,
71
+ });
72
+ agg[key] = subObj;
73
+ }
74
+ else {
75
+ agg[key] = resolvePrivateValue(key, val);
76
+ }
77
+ return agg;
78
+ }, {});
79
+ return redactedObj;
80
+ };
81
+ /** Rename object properties in place */
82
+ const renameKeys = (item, keyNameMap) => {
83
+ Object.entries(keyNameMap || {}).forEach(([oldPath, newPath]) => {
84
+ if (oldPath === newPath)
85
+ return;
86
+ const val = (0, lodash_1.get)(item, oldPath);
87
+ (0, lodash_1.set)(item, newPath, val);
88
+ (0, lodash_1.unset)(item, oldPath);
89
+ });
90
+ return item;
91
+ };
92
/**
 * `Actor.pushData` with extra features:
 *
 * - (Optionally) Add metadata to entries before they are pushed to dataset.
 * - (Optionally) Set which (nested) properties are personal data and allow to
 *   redact them for privacy compliance.
 *
 * Each entry goes through, in this order: metadata enrichment, privacy masking,
 * `pickKeys` selection and `remapKeys` renaming. The result is pushed either to
 * the dataset named by `datasetIdOrName` or, when that is not set, via `Actor`.
 *
 * @param oneOrManyItems A single entry or an array of entries.
 * @param ctx Crawling context; used for logging and for entry metadata.
 * @param options See `PushDataOptions`.
 * @returns Promise of the array of adjusted entries that were pushed.
 */
const pushData = (oneOrManyItems, ctx, options) => __awaiter(void 0, void 0, void 0, function* () {
    const { includeMetadata, showPrivate, privacyMask, remapKeys, pickKeys, datasetIdOrName } = options;
    const entries = Array.isArray(oneOrManyItems) ? oneOrManyItems : [oneOrManyItems];
    ctx.log.debug(`Preparing entries before pushing ${entries.length} items to dataset`);
    const withMetadata = createMetadataMapper(ctx);
    // Placeholder shown instead of a private value; it tells the user how to opt in.
    const redactionNotice = (val, key) => `<Redacted property "${key}". To include the actual value, toggle ON the Actor input option "Include personal data">`;
    const prepareEntry = (entry) => {
        let prepared = entry;
        if (includeMetadata) {
            prepared = withMetadata(prepared);
        }
        prepared = applyPrivacyMask(prepared, {
            showPrivate,
            privacyMask,
            privateValueGen: redactionNotice,
        });
        // Key selection happens before key renaming.
        if (pickKeys) {
            prepared = (0, lodash_1.pick)(prepared, pickKeys);
        }
        if (remapKeys) {
            prepared = renameKeys(prepared, remapKeys);
        }
        return prepared;
    };
    const adjustedItems = entries.map((entry) => prepareEntry(entry));
    ctx.log.info(`Pushing ${adjustedItems.length} entries to dataset`);
    let target;
    if (datasetIdOrName) {
        target = yield apify_1.Actor.openDataset(datasetIdOrName);
    }
    else {
        target = apify_1.Actor;
    }
    yield target.pushData(adjustedItems);
    ctx.log.info(`Done pushing ${adjustedItems.length} entries to dataset`);
    return adjustedItems;
});
121
+ exports.pushData = pushData;
122
+ //# sourceMappingURL=dataset.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dataset.js","sourceRoot":"","sources":["../../../src/lib/dataset.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,iCAA8B;AAE9B,mCAA+C;AA0G/C,MAAM,oBAAoB,GAAG,CAA8B,GAAQ,EAAE,EAAE;IACrE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,GAAG,aAAK,CAAC,MAAM,EAAE,CAAC;IAC/C,MAAM,WAAW,GACf,OAAO,IAAI,IAAI,IAAI,UAAU,IAAI,IAAI;QACnC,CAAC,CAAC,oCAAoC,OAAO,SAAS,UAAU,EAAE;QAClE,CAAC,CAAC,IAAI,CAAC;IACX,MAAM,SAAS,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;IAE3C,MAAM,iBAAiB,GAAG,CACxB,IAAI,EACuB,EAAE;;QAAC,OAAA,iCAC3B,IAAI,KACP,QAAQ,EAAE;gBACR,OAAO;gBACP,UAAU;gBACV,WAAW;gBACX,SAAS,EAAE,GAAG,CAAC,EAAE;gBACjB,SAAS,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,EAAE,mCAAI,IAAI;gBAEjC,WAAW,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,GAAG,mCAAI,IAAI;gBACpC,SAAS,EAAE,MAAA,GAAG,CAAC,OAAO,CAAC,SAAS,mCAAI,IAAI;gBAExC,WAAW,EAAE,GAAG,CAAC,OAAO,CAAC,SAAS,IAAI,SAAS;gBAC/C,eAAe,EAAE,GAAG,CAAC,OAAO,CAAC,UAAU;aACxC,IACD,CAAA;KAAA,CAAC;IACH,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAC;AAEF,MAAM,gBAAgB,GAAG,CACvB,IAAO,EACP,OAIC,EACD,EAAE;IACF,MAAM,EACJ,WAAW,EACX,WAAW,EACX,eAAe,GAAG,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,uBAAuB,GAAG,IAAI,GAC7D,GAAG,OAAO,CAAC;IAEZ,MAAM,mBAAmB,GAAG,CAAC,GAAW,EAAE,GAAQ,EAAE,EAAE;QACpD,kDAAkD;QAClD,2DAA2D;QAC3D,IAAI,kBAAkB,CAAC;QACvB,IAAI,2BAA2B,GAAG,KAAK,CAAC;QACxC,MAAM,qBAAqB,GAAG,CAAC,GAAQ,EAAE,EAAE;YACzC,kBAAkB,GAAG,GAAG,CAAC;YACzB,2BAA2B,GAAG,IAAI,CAAC;QACrC,CAAC,CAAC;QAEF,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAA6C,CAAC;QACnF,MAAM,SAAS,GAAG,aAAa;YAC7B,CAAC,CAAC,aAAa,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,qBAAqB,EAAE,eAAe,EAAE,CAAC;YAC3E,CAAC,CAAC,KAAK,CAAC;QAEV,kBAAkB;QAClB,MAAM,YAAY,GAAG;QACnB,4DAA4D;QAC5D,WAAW,CAAC,CAAC,CAAC,GAAG;YACjB,iDAAiD;YACjD,CAAC,CAAC,2BAA2B,CAAC,CAAC,CAAC,kBAAkB;gBAClD,+CAA+C;gBAC/C,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CACpD,CAAC;QACF,OAAO,YAAY,CAAC;IACtB,CAAC,CAAC;IAEF,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,EAAE;;QAClE,MAAM,WAAW,GACf,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,IAAI,IAAI,I
AAI,CAAC,CAAC,GAAG,YAAY,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAE1F,IAAI,WAAW,EAAE;YACf,qCAAqC;YACrC,MAAM,MAAM,GAAG,gBAAgB,CAAC,GAAG,EAAE;gBACnC,WAAW;gBACX,WAAW,EAAE,CAAC,MAAA,WAAW,CAAC,GAAG,CAAC,mCAAI,EAAE,CAAQ;gBAC5C,eAAe;aAChB,CAAC,CAAC;YACH,GAAG,CAAC,GAAc,CAAC,GAAG,MAAa,CAAC;SACrC;aAAM;YACL,GAAG,CAAC,GAAc,CAAC,GAAG,mBAAmB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;SACrD;QACD,OAAO,GAAG,CAAC;IACb,CAAC,EAAE,EAAO,CAAC,CAAC;IAEZ,OAAO,WAAW,CAAC;AACrB,CAAC,CAAC;AAEF,wCAAwC;AACxC,MAAM,UAAU,GAAG,CAAmB,IAAO,EAAE,UAA4C,EAAE,EAAE;IAC7F,MAAM,CAAC,OAAO,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE;QAC9D,IAAI,OAAO,KAAK,OAAO;YAAE,OAAO;QAChC,MAAM,GAAG,GAAG,IAAA,YAAG,EAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAC/B,IAAA,YAAG,EAAC,IAAI,EAAE,OAAiB,EAAE,GAAG,CAAC,CAAC;QAClC,IAAA,cAAK,EAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACvB,CAAC,CAAC,CAAC;IACH,OAAO,IAAI,CAAC;AACd,CAAC,CAAC;AAEF;;;;;;GAMG;AACI,MAAM,QAAQ,GAAG,CAItB,cAAuB,EACvB,GAAQ,EACR,OAA2B,EAC3B,EAAE;IACF,MAAM,EAAE,eAAe,EAAE,WAAW,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,eAAe,EAAE,GACvF,OAAO,CAAC;IAEV,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC;IAEhF,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,oCAAoC,KAAK,CAAC,MAAM,mBAAmB,CAAC,CAAC,CAAC,kBAAkB;IACtG,MAAM,iBAAiB,GAAG,oBAAoB,CAAC,GAAG,CAAC,CAAC;IACpD,MAAM,aAAa,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;QACvC,MAAM,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QAC1E,MAAM,UAAU,GAAG,gBAAgB,CAAC,gBAAgB,EAAE;YACpD,WAAW;YACX,WAAW;YACX,eAAe,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAC5B,uBAAuB,GAAG,2FAA2F;SACxH,CAAC,CAAC;QAEH,MAAM,UAAU,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAA,aAAI,EAAC,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC;QACtE,MAAM,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC;QAE/E,OAAO,WAAW,CAAC;IACrB,CAAC,CAAC,CAAC;IAEH,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,WAAW,aAAa,CAAC,MAAM,qBAAqB,CAAC,CAAC;IACnE,MAAM,OAAO,GAAG,eAAe,CAAC,CAAC,CAAC,MAAM,aAAK,CAAC,WAAW,CAAC,eAAe,CAAC,CAAC,CAAC
,CAAC,aAAK,CAAC;IACnF,MAAM,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC;IACtC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,gBAAgB,aAAa,CAAC,MAAM,qBAAqB,CAAC,CAAC;IAExE,OAAO,aAAa,CAAC;AACvB,CAAC,CAAA,CAAC;AApCW,QAAA,QAAQ,YAoCnB","sourcesContent":["import { Actor } from 'apify';\nimport type { CrawlingContext } from 'crawlee';\nimport { get, pick, set, unset } from 'lodash';\n\nexport interface ActorEntryMetadata {\n actorId: string | null;\n actorRunId: string | null;\n actorRunUrl: string | null;\n contextId: string;\n requestId: string | null;\n\n /** The URL given to the crawler */\n originalUrl: string | null;\n /** The URL given to the crawler after possible redirects */\n loadedUrl: string | null;\n\n /** ISO datetime string that indicates the time when the request has been processed. */\n dateHandled: string;\n numberOfRetries: number;\n}\n\n/** Add metadata to the object */\nexport type WithActorEntryMetadata<T> = T & { metadata: ActorEntryMetadata };\n\n/** Functions that generates a \"redacted\" version of a value */\nexport type PrivateValueGen<V, K, O> = (val: V, key: K, obj: O) => any;\n\n/**\n * Given a property value (and its position) this function\n * determines if the property is considered private (and\n * hence should be hidden for privacy reasons).\n *\n * Property is private if the function returns truthy value.\n */\nexport type PrivacyFilter<V, K, O> = (\n val: V,\n key: K,\n obj: O,\n options?: {\n setCustomPrivateValue: (val: V) => any;\n privateValueGen: PrivateValueGen<V, K, O>;\n }\n) => any;\n\n/**\n * PrivacyMask determines which (potentally nested) properties\n * of an object are considered private.\n *\n * PrivacyMask copies the structure of another object, but each\n * non-object property on PrivacyMask is a PrivacyFilter - function\n * that determines if the property is considered private.\n *\n * Property is private if the function returns truthy value.\n */\nexport type PrivacyMask<T extends object> = {\n [Key in keyof T]?: T[Key] extends Date | any[] // 
Consider Data and Array as non-objects\n ? PrivacyFilter<T[Key], Key, T>\n : T[Key] extends object\n ? PrivacyMask<T[Key]>\n : PrivacyFilter<T[Key], Key, T>;\n};\n\nexport interface PushDataOptions<T extends object> {\n /**\n * Whether items should be enriched with request and run metadata.\n *\n * If truthy, the metadata is set under the `metadata` property.\n */\n includeMetadata?: boolean;\n /**\n * Whether properties that are considered personal data should be shown as is.\n *\n * If falsy or not set, these properties are redacted to hide the actual information.\n *\n * Which properties are personal data is determined by `privacyMask`.\n */\n showPrivate?: boolean;\n /**\n * Determine which properties are considered personal data.\n *\n * See {@link PrivacyMask}.\n **/\n privacyMask: PrivacyMask<T>;\n /**\n * Option to select which keys (fields) of an entry to keep (discarding the rest)\n * before pushing the entries to the dataset.\n *\n * This serves mainly to allow users to select the keys from actor input UI.\n *\n * This is done before `remapKeys`.\n *\n * Keys can be nested, e.g. `\"someProp.value[0]\"`. Nested path is\n * resolved using Lodash.get().\n */\n pickKeys?: string[];\n /**\n * Option to remap the keys before pushing the entries to the dataset.\n *\n * This serves mainly to allow users to remap the keys from actor input UI.\n *\n * Keys can be nested, e.g. `\"someProp.value[0]\"`. Nested path is\n * resolved using Lodash.get().\n */\n remapKeys?: Record<string, string>;\n /** ID or name of the dataset to which the data should be pushed */\n datasetIdOrName?: string;\n}\n\nconst createMetadataMapper = <Ctx extends CrawlingContext>(ctx: Ctx) => {\n const { actorId, actorRunId } = Actor.getEnv();\n const actorRunUrl =\n actorId != null && actorRunId != null\n ? 
`https://console.apify.com/actors/${actorId}/runs/${actorRunId}`\n : null;\n const handledAt = new Date().toISOString();\n\n const addMetadataToData = <T extends Record<any, any> = Record<any, any>>(\n item\n ): WithActorEntryMetadata<T> => ({\n ...item,\n metadata: {\n actorId,\n actorRunId,\n actorRunUrl,\n contextId: ctx.id,\n requestId: ctx.request.id ?? null,\n\n originalUrl: ctx.request.url ?? null,\n loadedUrl: ctx.request.loadedUrl ?? null,\n\n dateHandled: ctx.request.handledAt || handledAt,\n numberOfRetries: ctx.request.retryCount,\n },\n });\n return addMetadataToData;\n};\n\nconst applyPrivacyMask = <T extends Record<any, any> = Record<any, any>>(\n item: T,\n options: {\n showPrivate?: boolean;\n privacyMask: PrivacyMask<T>;\n privateValueGen?: (val: any, key: string, item: T) => any;\n }\n) => {\n const {\n showPrivate,\n privacyMask,\n privateValueGen = (_, key) => `<Redacted property \"${key}\">`,\n } = options;\n\n const resolvePrivateValue = (key: string, val: any) => {\n // Allow to set custom \"redacted\" value by calling\n // `setCustomPrivateValue` from inside the filter function.\n let customPrivateValue;\n let setCustomPrivateValueCalled = false;\n const setCustomPrivateValue = (val: any) => {\n customPrivateValue = val;\n setCustomPrivateValueCalled = true;\n };\n\n const privacyFilter = privacyMask[key] as PrivacyFilter<any, any, any> | undefined;\n const isPrivate = privacyFilter\n ? privacyFilter(val, key, item, { setCustomPrivateValue, privateValueGen })\n : false;\n\n // prettier-ignore\n const privateValue = (\n // Don't redact anything if we're asked to show private data\n showPrivate ? val\n // Otherwise, if custom value was given, use that\n : setCustomPrivateValueCalled ? customPrivateValue\n // Otherwise, decide based on filter truthiness\n : isPrivate ? 
privateValueGen(key, val, item) : val\n );\n return privateValue;\n };\n\n const redactedObj = Object.entries(item).reduce((agg, [key, val]) => {\n const isNestedObj =\n typeof val === 'object' && val != null && !(val instanceof Date) && !Array.isArray(val);\n\n if (isNestedObj) {\n // Recursively process nested objects\n const subObj = applyPrivacyMask(val, {\n showPrivate,\n privacyMask: (privacyMask[key] ?? {}) as any,\n privateValueGen,\n });\n agg[key as keyof T] = subObj as any;\n } else {\n agg[key as keyof T] = resolvePrivateValue(key, val);\n }\n return agg;\n }, {} as T);\n\n return redactedObj;\n};\n\n/** Rename object properties in place */\nconst renameKeys = <T extends object>(item: T, keyNameMap: Partial<Record<keyof T, string>>) => {\n Object.entries(keyNameMap || {}).forEach(([oldPath, newPath]) => {\n if (oldPath === newPath) return;\n const val = get(item, oldPath);\n set(item, newPath as string, val);\n unset(item, oldPath);\n });\n return item;\n};\n\n/**\n * `Actor.pushData` with extra features:\n *\n * - (Optionally) Add metadata to entries before they are pushed to dataset.\n * - (Optionally) Set which (nested) properties are personal data and allow to\n * redact them for privacy compliance.\n */\nexport const pushData = async <\n Ctx extends CrawlingContext,\n T extends Record<any, any> = Record<any, any>\n>(\n oneOrManyItems: T | T[],\n ctx: Ctx,\n options: PushDataOptions<T>\n) => {\n const { includeMetadata, showPrivate, privacyMask, remapKeys, pickKeys, datasetIdOrName } =\n options;\n\n const items = Array.isArray(oneOrManyItems) ? oneOrManyItems : [oneOrManyItems];\n\n ctx.log.debug(`Preparing entries before pushing ${items.length} items to dataset`); // prettier-ignore\n const addMetadataToData = createMetadataMapper(ctx);\n const adjustedItems = items.map((item) => {\n const itemWithMetadata = includeMetadata ? 
addMetadataToData(item) : item;\n const maskedItem = applyPrivacyMask(itemWithMetadata, {\n showPrivate,\n privacyMask,\n privateValueGen: (val, key) =>\n `<Redacted property \"${key}\". To include the actual value, toggle ON the Actor input option \"Include personal data\">`,\n });\n\n const pickedItem = pickKeys ? pick(maskedItem, pickKeys) : maskedItem;\n const renamedItem = remapKeys ? renameKeys(pickedItem, remapKeys) : pickedItem;\n\n return renamedItem;\n });\n\n ctx.log.info(`Pushing ${adjustedItems.length} entries to dataset`);\n const dataset = datasetIdOrName ? await Actor.openDataset(datasetIdOrName) : Actor;\n await dataset.pushData(adjustedItems);\n ctx.log.info(`Done pushing ${adjustedItems.length} entries to dataset`);\n\n return adjustedItems;\n};\n"]}
@@ -0,0 +1,78 @@
1
+ import { AnyNode, Cheerio } from 'cheerio';
2
+ import { StrAsNumOptions } from '../utils/format';
3
+ import { FormatUrlOptions } from '../utils/url';
4
+ /**
5
+ * Common interface for working with the DOM across different environments.
6
+ *
7
+ * Consider these environments:
8
+ * 1) Browser (via Playwright & Chromium) - uses Browser API to work with DOM
9
+ * 2) Cheerio - uses own API to work with DOM
10
+ *
11
+ * This common interface makes the scraping code more portable between the two.
12
+ */
13
+ export interface DOMLib<El extends BaseEl, BaseEl> {
14
+ node: El | null;
15
+ /** Get element's text (trimmed) */
16
+ text: (options?: {
17
+ allowEmpty?: boolean;
18
+ }) => string | null;
19
+ /** Get element's text as uppercase (trimmed) */
20
+ textAsUpper: (options?: {
21
+ allowEmpty?: boolean;
22
+ }) => string | null;
23
+ /** Get element's text as lowercase (trimmed) */
24
+ textAsLower: (options?: {
25
+ allowEmpty?: boolean;
26
+ }) => string | null;
27
+ /** Get element's text and convert it to number */
28
+ textAsNumber: (options?: StrAsNumOptions) => number | null;
29
+ /** Get element's attribute */
30
+ attr: (attrName: string, options?: {
31
+ allowEmpty?: boolean;
32
+ }) => string | null;
33
+ /** Get element's property */
34
+ prop: (propName: string, options?: {
35
+ allowEmpty?: boolean;
36
+ }) => string | null;
37
+ /** Get element's href */
38
+ href: (options?: {
39
+ allowEmpty?: boolean;
40
+ } & FormatUrlOptions) => string | null;
41
+ /** Get element's src */
42
+ src: (options?: {
43
+ allowEmpty?: boolean;
44
+ } & FormatUrlOptions) => string | null;
45
+ /** Get element's nodeName */
46
+ nodeName: () => string | null;
47
+ /** Get URL of website associated with the DOM */
48
+ url: () => string | null;
49
+ /** Freely modify the underlying DOM node */
50
+ map: <TVal>(map: (node: El | null) => TVal) => TVal;
51
+ /** Get a single descendant matching the selector */
52
+ findOne: <TNewEl extends BaseEl = El>(selector: string) => DOMLib<TNewEl, BaseEl> | null;
53
+ /** Get all descendants matching the selector */
54
+ findMany: <TNewEl extends BaseEl = El>(selector: string) => DOMLib<TNewEl, BaseEl>[];
55
+ /** Get element's parent */
56
+ parent: <TNewEl extends BaseEl = El>() => DOMLib<TNewEl, BaseEl> | null;
57
+ /** Get element's children */
58
+ children: <TNewEl extends BaseEl = El>() => DOMLib<TNewEl, BaseEl>[];
59
+ /** Remove the element */
60
+ remove: () => void;
61
+ /** Get root element */
62
+ root: <TNewEl extends BaseEl = El>() => DOMLib<TNewEl, BaseEl> | null;
63
+ }
64
+ export type BrowserDOMLib<T extends Element = Element> = DOMLib<T, Element>;
65
+ /** Implementation of DOMLib in browser (using Browser API) */
66
+ export declare const browserDOMLib: <T extends Element>(node: T) => BrowserDOMLib<T>;
67
+ export type CheerioDOMLib<T extends Cheerio<AnyNode> = Cheerio<AnyNode>> = DOMLib<T, Cheerio<AnyNode>>;
68
+ /**
69
+ * Given a Cheerio selection, split it into an array of Cheerio selections,
70
+ * where each has only one element.
71
+ *
72
+ * From `Cheerio[el, el, el, el]`
73
+ *
74
+ * To `[Cheerio[el], Cheerio[el], Cheerio[el], Cheerio[el]]`
75
+ */
76
+ export declare const splitCheerioSelection: (cheerioSel: Cheerio<AnyNode>) => Cheerio<import("cheerio").Document | import("cheerio").Element | import("domhandler").CDATA | import("domhandler").Text | import("domhandler").Comment | import("domhandler").ProcessingInstruction>[];
77
+ /** Implementation of DOMLib in Cheerio */
78
+ export declare const cheerioDOMLib: <T extends Cheerio<AnyNode>>(cheerioNode: T, srcUrl: string | null) => CheerioDOMLib<T>;
@@ -0,0 +1,243 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.cheerioDOMLib = exports.splitCheerioSelection = exports.browserDOMLib = void 0;
4
+ const cheerio_1 = require("cheerio");
5
+ const format_1 = require("../utils/format");
6
+ const url_1 = require("../utils/url");
7
+ /** Implementation of DOMLib in browser (using Browser API) */
8
+ const browserDOMLib = (node) => {
9
+ ///////////////////////
10
+ // SCALAR OPERATIONS
11
+ ///////////////////////
12
+ const text = ({ allowEmpty } = {}) => {
13
+ var _a, _b;
14
+ const txt = (_b = (_a = node.textContent) === null || _a === void 0 ? void 0 : _a.trim()) !== null && _b !== void 0 ? _b : null;
15
+ return (0, format_1.strOrNull)(txt, allowEmpty);
16
+ };
17
+ const textAsUpper = (options) => {
18
+ const txt = text(options);
19
+ return txt ? txt.toLocaleUpperCase() : txt;
20
+ };
21
+ const textAsLower = (options) => {
22
+ const txt = text(options);
23
+ return txt ? txt.toLocaleLowerCase() : txt;
24
+ };
25
+ const textAsNumber = (options) => {
26
+ const txt = text(options);
27
+ return (0, format_1.strAsNumber)(txt, options);
28
+ };
29
+ const prop = (propName, { allowEmpty } = {}) => {
30
+ var _a;
31
+ let propVal = (_a = node[propName]) !== null && _a !== void 0 ? _a : null;
32
+ propVal = typeof propVal === 'string' ? propVal.trim() : propVal;
33
+ return (0, format_1.strOrNull)(propVal, allowEmpty);
34
+ };
35
+ const attr = (propName, { allowEmpty } = {}) => {
36
+ var _a;
37
+ let attrVal = (_a = node.getAttribute(propName)) !== null && _a !== void 0 ? _a : null;
38
+ attrVal = typeof attrVal === 'string' ? attrVal.trim() : attrVal;
39
+ return (0, format_1.strOrNull)(attrVal, allowEmpty);
40
+ };
41
+ const href = ({ allowEmpty, allowRelative, baseUrl } = {}) => {
42
+ const val = prop('href', { allowEmpty });
43
+ return (0, url_1.formatUrl)(val, { allowRelative, baseUrl });
44
+ };
45
+ const src = ({ allowEmpty, allowRelative, baseUrl } = {}) => {
46
+ const val = prop('src', { allowEmpty });
47
+ return (0, url_1.formatUrl)(val, { allowRelative, baseUrl });
48
+ };
49
+ const nodeName = () => {
50
+ // On UPPER- vs lower-case https://stackoverflow.com/questions/27223756/
51
+ const val = prop('nodeName');
52
+ return typeof val === 'string' ? val.toLocaleUpperCase() : val;
53
+ };
54
+ const url = () => {
55
+ var _a, _b;
56
+ const doc = node.ownerDocument;
57
+ // See https://stackoverflow.com/a/16010322/9788634
58
+ const urlVal = ((_b = (_a = doc.defaultView) === null || _a === void 0 ? void 0 : _a.location) === null || _b === void 0 ? void 0 : _b.href) || null;
59
+ return urlVal;
60
+ };
61
+ const map = (mapFn) => {
62
+ return mapFn(node);
63
+ };
64
+ ///////////////////////
65
+ // NODE OPERATIONS
66
+ ///////////////////////
67
+ const findOne = (selector) => {
68
+ var _a;
69
+ if (![Node.ELEMENT_NODE, Node.DOCUMENT_NODE].includes(node.nodeType))
70
+ return null;
71
+ const resultEl = ((_a = node.querySelector(selector)) !== null && _a !== void 0 ? _a : null);
72
+ return resultEl ? (0, exports.browserDOMLib)(resultEl) : null;
73
+ };
74
+ const findMany = (selector) => {
75
+ if (![Node.ELEMENT_NODE, Node.DOCUMENT_NODE].includes(node.nodeType))
76
+ return [];
77
+ const resultEls = [...node.querySelectorAll(selector)]; // prettier-ignore
78
+ return resultEls.map((el) => (0, exports.browserDOMLib)(el));
79
+ };
80
+ const parent = () => {
81
+ const parentEl = (node.parentNode || null);
82
+ return parentEl ? (0, exports.browserDOMLib)(parentEl) : null;
83
+ };
84
+ const children = () => {
85
+ const childEls = [...node.childNodes];
86
+ return childEls.map((el) => (0, exports.browserDOMLib)(el));
87
+ };
88
+ const root = () => {
89
+ var _a;
90
+ const rootEl = (((_a = node.ownerDocument) === null || _a === void 0 ? void 0 : _a.documentElement) || null);
91
+ return rootEl ? (0, exports.browserDOMLib)(rootEl) : null;
92
+ };
93
+ const remove = () => {
94
+ var _a;
95
+ const parentEl = parent();
96
+ (_a = parentEl === null || parentEl === void 0 ? void 0 : parentEl.node) === null || _a === void 0 ? void 0 : _a.removeChild(node);
97
+ };
98
+ return {
99
+ node,
100
+ text,
101
+ textAsLower,
102
+ textAsUpper,
103
+ textAsNumber,
104
+ attr,
105
+ prop,
106
+ href,
107
+ src,
108
+ nodeName,
109
+ url,
110
+ map,
111
+ findOne,
112
+ findMany,
113
+ parent,
114
+ children,
115
+ root,
116
+ remove,
117
+ };
118
+ };
119
+ exports.browserDOMLib = browserDOMLib;
120
+ /**
121
+ * Given a Cheerio selection, split it into an array of Cheerio selections,
122
+ * where each has only one element.
123
+ *
124
+ * From `Cheerio[el, el, el, el]`
125
+ *
126
+ * To `[Cheerio[el], Cheerio[el], Cheerio[el], Cheerio[el]]`
127
+ */
128
+ const splitCheerioSelection = (cheerioSel) => {
129
+ return cheerioSel.toArray().map((el) => {
130
+ const cheerioInst = (0, cheerio_1.load)(el);
131
+ return cheerioInst(el);
132
+ });
133
+ };
134
+ exports.splitCheerioSelection = splitCheerioSelection;
135
+ /** Implementation of DOMLib in Cheerio */
136
+ const cheerioDOMLib = (cheerioNode, srcUrl) => {
137
+ ///////////////////////
138
+ // SCALAR OPERATIONS
139
+ ///////////////////////
140
+ const text = ({ allowEmpty } = {}) => {
141
+ var _a, _b;
142
+ const txt = (_b = (_a = cheerioNode.text()) === null || _a === void 0 ? void 0 : _a.trim()) !== null && _b !== void 0 ? _b : null;
143
+ return (0, format_1.strOrNull)(txt, allowEmpty);
144
+ };
145
+ const textAsUpper = (options) => {
146
+ const txt = text(options);
147
+ return txt ? txt.toLocaleUpperCase() : txt;
148
+ };
149
+ const textAsLower = (options) => {
150
+ const txt = text(options);
151
+ return txt ? txt.toLocaleLowerCase() : txt;
152
+ };
153
+ const textAsNumber = (options) => {
154
+ const txt = text(options);
155
+ return (0, format_1.strAsNumber)(txt, options);
156
+ };
157
+ const attr = (attrName, { allowEmpty } = {}) => {
158
+ var _a;
159
+ let attrVal = (_a = cheerioNode.attr(attrName)) !== null && _a !== void 0 ? _a : null;
160
+ attrVal = typeof attrVal === 'string' ? attrVal.trim() : attrVal;
161
+ return (0, format_1.strOrNull)(attrVal, allowEmpty);
162
+ };
163
+ const prop = (propName, { allowEmpty } = {}) => {
164
+ var _a;
165
+ let propVal = (_a = cheerioNode.prop(propName)) !== null && _a !== void 0 ? _a : null;
166
+ propVal = typeof propVal === 'string' ? propVal.trim() : propVal;
167
+ return (0, format_1.strOrNull)(propVal, allowEmpty);
168
+ };
169
+ const href = ({ allowEmpty, allowRelative, baseUrl } = {}) => {
170
+ const val = prop('href', { allowEmpty });
171
+ return (0, url_1.formatUrl)(val, { allowRelative, baseUrl });
172
+ };
173
+ const src = ({ allowEmpty, allowRelative, baseUrl } = {}) => {
174
+ const val = prop('src', { allowEmpty });
175
+ return (0, url_1.formatUrl)(val, { allowRelative, baseUrl });
176
+ };
177
+ const nodeName = () => {
178
+ // On UPPER- vs lower-case https://stackoverflow.com/questions/27223756/
179
+ const val = prop('nodeName');
180
+ return typeof val === 'string' ? val.toLocaleUpperCase() : val;
181
+ };
182
+ const url = () => {
183
+ return srcUrl !== null && srcUrl !== void 0 ? srcUrl : null;
184
+ };
185
+ const map = (mapFn) => {
186
+ return mapFn(cheerioNode);
187
+ };
188
+ ///////////////////////
189
+ // NODE OPERATIONS
190
+ ///////////////////////
191
+ const findOne = (selector) => {
192
+ const resultEl = cheerioNode.find(selector).first();
193
+ if (!resultEl.get(0))
194
+ return null;
195
+ return (0, exports.cheerioDOMLib)(resultEl, srcUrl);
196
+ };
197
+ const findMany = (selector) => {
198
+ const resultEls = (0, exports.splitCheerioSelection)(cheerioNode.find(selector));
199
+ return resultEls.map((ch) => (0, exports.cheerioDOMLib)(ch, srcUrl));
200
+ };
201
+ const parent = () => {
202
+ const parentEl = cheerioNode.parent().first();
203
+ if (!parentEl.get(0))
204
+ return null;
205
+ return (0, exports.cheerioDOMLib)(parentEl, srcUrl);
206
+ };
207
+ const children = () => {
208
+ const childEls = (0, exports.splitCheerioSelection)(cheerioNode.children());
209
+ return childEls.map((ch) => (0, exports.cheerioDOMLib)(ch, srcUrl));
210
+ };
211
+ const root = () => {
212
+ var _a;
213
+ const rootEl = (_a = cheerioNode._root) === null || _a === void 0 ? void 0 : _a.first();
214
+ if (!(rootEl === null || rootEl === void 0 ? void 0 : rootEl.get(0)))
215
+ return null;
216
+ return (0, exports.cheerioDOMLib)(rootEl, srcUrl);
217
+ };
218
+ const remove = () => {
219
+ cheerioNode.remove();
220
+ };
221
+ return {
222
+ node: cheerioNode,
223
+ text,
224
+ textAsLower,
225
+ textAsUpper,
226
+ textAsNumber,
227
+ attr,
228
+ prop,
229
+ href,
230
+ src,
231
+ nodeName,
232
+ url,
233
+ map,
234
+ findOne,
235
+ findMany,
236
+ parent,
237
+ children,
238
+ root,
239
+ remove,
240
+ };
241
+ };
242
+ exports.cheerioDOMLib = cheerioDOMLib;
243
+ //# sourceMappingURL=dom.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"dom.js","sourceRoot":"","sources":["../../../src/lib/dom.ts"],"names":[],"mappings":";;;AAAA,qCAAgE;AAEhE,4CAA0E;AAC1E,sCAA2D;AA6D3D,8DAA8D;AACvD,MAAM,aAAa,GAAG,CAAoB,IAAO,EAAoB,EAAE;IAC5E,uBAAuB;IACvB,oBAAoB;IACpB,uBAAuB;IAEvB,MAAM,IAAI,GAA6B,CAAC,EAAE,UAAU,EAAE,GAAG,EAAE,EAAE,EAAE;;QAC7D,MAAM,GAAG,GAAG,MAAA,MAAA,IAAI,CAAC,WAAW,0CAAE,IAAI,EAAE,mCAAI,IAAI,CAAC;QAC7C,OAAO,IAAA,kBAAS,EAAC,GAAG,EAAE,UAAU,CAAC,CAAC;IACpC,CAAC,CAAC;IAEF,MAAM,WAAW,GAAoC,CAAC,OAAO,EAAE,EAAE;QAC/D,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;QAC1B,OAAO,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;IAC7C,CAAC,CAAC;IAEF,MAAM,WAAW,GAAoC,CAAC,OAAO,EAAE,EAAE;QAC/D,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;QAC1B,OAAO,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;IAC7C,CAAC,CAAC;IAEF,MAAM,YAAY,GAAqC,CAAC,OAAO,EAAE,EAAE;QACjE,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;QAC1B,OAAO,IAAA,oBAAW,EAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IACnC,CAAC,CAAC;IAEF,MAAM,IAAI,GAA6B,CAAC,QAAQ,EAAE,EAAE,UAAU,EAAE,GAAG,EAAE,EAAE,EAAE;;QACvE,IAAI,OAAO,GAAG,MAAA,IAAI,CAAC,QAAQ,CAAC,mCAAI,IAAI,CAAC;QACrC,OAAO,GAAG,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;QACjE,OAAO,IAAA,kBAAS,EAAC,OAAO,EAAE,UAAU,CAAC,CAAC;IACxC,CAAC,CAAC;IAEF,MAAM,IAAI,GAA6B,CAAC,QAAQ,EAAE,EAAE,UAAU,EAAE,GAAG,EAAE,EAAE,EAAE;;QACvE,IAAI,OAAO,GAAG,MAAA,IAAI,CAAC,YAAY,CAAC,QAAQ,CAAC,mCAAI,IAAI,CAAC;QAClD,OAAO,GAAG,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;QACjE,OAAO,IAAA,kBAAS,EAAC,OAAO,EAAE,UAAU,CAAC,CAAC;IACxC,CAAC,CAAC;IAEF,MAAM,IAAI,GAA6B,CAAC,EAAE,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE;QACrF,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,UAAU,EAAE,CAAC,CAAC;QACzC,OAAO,IAAA,eAAS,EAAC,GAAG,EAAE,EAAE,aAAa,EAAE,OAAO,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC;IAEF,MAAM,GAAG,GAA4B,CAAC,EAAE,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE;QACnF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,EAAE,UAAU,EAAE,CAAC,CAAC;QACxC,OAAO,IAAA,eAAS,EAAC,GAAG,EAAE,EAAE,a
AAa,EAAE,OAAO,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC;IAEF,MAAM,QAAQ,GAAiC,GAAG,EAAE;QAClD,wEAAwE;QACxE,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC;QAC7B,OAAO,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;IACjE,CAAC,CAAC;IAEF,MAAM,GAAG,GAA4B,GAAG,EAAE;;QACxC,MAAM,GAAG,GAAG,IAAI,CAAC,aAAa,CAAC;QAC/B,mDAAmD;QACnD,MAAM,MAAM,GAAG,CAAA,MAAA,MAAA,GAAG,CAAC,WAAW,0CAAE,QAAQ,0CAAE,IAAI,KAAI,IAAI,CAAC;QACvD,OAAO,MAAM,CAAC;IAChB,CAAC,CAAC;IAEF,MAAM,GAAG,GAA4B,CAAO,KAAwB,EAAE,EAAE;QACtE,OAAO,KAAK,CAAC,IAAI,CAAC,CAAC;IACrB,CAAC,CAAC;IAEF,uBAAuB;IACvB,kBAAkB;IAClB,uBAAuB;IAEvB,MAAM,OAAO,GAAgC,CAA6B,QAAQ,EAAE,EAAE;;QACpF,IAAI,CAAC,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,QAAe,CAAC;YAAE,OAAO,IAAI,CAAC;QACzF,MAAM,QAAQ,GAAG,CAAC,MAAA,IAAI,CAAC,aAAa,CAAC,QAAQ,CAAC,mCAAI,IAAI,CAAkB,CAAC;QACzE,OAAO,QAAQ,CAAC,CAAC,CAAC,IAAA,qBAAa,EAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IACnD,CAAC,CAAC;IAEF,MAAM,QAAQ,GAAiC,CAA6B,QAAQ,EAAE,EAAE;QACtF,IAAI,CAAC,CAAC,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,aAAa,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,QAAe,CAAC;YAAE,OAAO,EAAE,CAAC;QACvF,MAAM,SAAS,GAAG,CAAC,GAAG,IAAI,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAa,CAAC,CAAC,kBAAkB;QACtF,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,IAAA,qBAAa,EAAC,EAAE,CAAC,CAAC,CAAC;IAClD,CAAC,CAAC;IAEF,MAAM,MAAM,GAA+B,GAA+B,EAAE;QAC1E,MAAM,QAAQ,GAAG,CAAC,IAAI,CAAC,UAAU,IAAI,IAAI,CAAkB,CAAC;QAC5D,OAAO,QAAQ,CAAC,CAAC,CAAC,IAAA,qBAAa,EAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IACnD,CAAC,CAAC;IAEF,MAAM,QAAQ,GAAiC,GAA+B,EAAE;QAC9E,MAAM,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,UAAU,CAAa,CAAC;QAClD,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,IAAA,qBAAa,EAAC,EAAE,CAAC,CAAC,CAAC;IACjD,CAAC,CAAC;IAEF,MAAM,IAAI,GAA6B,GAA+B,EAAE;;QACtE,MAAM,MAAM,GAAG,CAAC,CAAC,MAAA,IAAI,CAAC,aAAa,0CAAE,eAAuB,KAAI,IAAI,CAAkB,CAAC;QACvF,OAAO,MAAM,CAAC,CAAC,CAAC,IAAA,qBAAa,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;IAC/C,CAAC,CAAC;IAEF,MAAM,MAAM,GAA+B,GAAG,EAAE;;QAC9C,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC;QAC1B,MAAA,QAAQ,aAAR,QAAQ,uBAAR,QAAQ,CAAE,IAAI,0CA
AE,WAAW,CAAC,IAAI,CAAC,CAAC;IACpC,CAAC,CAAC;IAEF,OAAO;QACL,IAAI;QAEJ,IAAI;QACJ,WAAW;QACX,WAAW;QACX,YAAY;QACZ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,GAAG;QACH,QAAQ;QACR,GAAG;QACH,GAAG;QAEH,OAAO;QACP,QAAQ;QACR,MAAM;QACN,QAAQ;QACR,IAAI;QACJ,MAAM;KACsB,CAAC;AACjC,CAAC,CAAC;AA1HW,QAAA,aAAa,iBA0HxB;AAOF;;;;;;;GAOG;AACI,MAAM,qBAAqB,GAAG,CAAC,UAA4B,EAAE,EAAE;IACpE,OAAO,UAAU,CAAC,OAAO,EAAE,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE;QACrC,MAAM,WAAW,GAAG,IAAA,cAAW,EAAC,EAAE,CAAC,CAAC;QACpC,OAAO,WAAW,CAAC,EAAE,CAAC,CAAC;IACzB,CAAC,CAAC,CAAC;AACL,CAAC,CAAC;AALW,QAAA,qBAAqB,yBAKhC;AAEF,0CAA0C;AACnC,MAAM,aAAa,GAAG,CAC3B,WAAc,EACd,MAAqB,EACH,EAAE;IACpB,uBAAuB;IACvB,oBAAoB;IACpB,uBAAuB;IAEvB,MAAM,IAAI,GAA6B,CAAC,EAAE,UAAU,EAAE,GAAG,EAAE,EAAE,EAAE;;QAC7D,MAAM,GAAG,GAAG,MAAA,MAAA,WAAW,CAAC,IAAI,EAAE,0CAAE,IAAI,EAAE,mCAAI,IAAI,CAAC;QAC/C,OAAO,IAAA,kBAAS,EAAC,GAAG,EAAE,UAAU,CAAC,CAAC;IACpC,CAAC,CAAC;IAEF,MAAM,WAAW,GAAoC,CAAC,OAAO,EAAE,EAAE;QAC/D,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;QAC1B,OAAO,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;IAC7C,CAAC,CAAC;IAEF,MAAM,WAAW,GAAoC,CAAC,OAAO,EAAE,EAAE;QAC/D,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;QAC1B,OAAO,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;IAC7C,CAAC,CAAC;IAEF,MAAM,YAAY,GAAqC,CAAC,OAAO,EAAE,EAAE;QACjE,MAAM,GAAG,GAAG,IAAI,CAAC,OAAO,CAAC,CAAC;QAC1B,OAAO,IAAA,oBAAW,EAAC,GAAG,EAAE,OAAO,CAAC,CAAC;IACnC,CAAC,CAAC;IAEF,MAAM,IAAI,GAA6B,CAAC,QAAQ,EAAE,EAAE,UAAU,EAAE,GAAG,EAAE,EAAE,EAAE;;QACvE,IAAI,OAAO,GAAG,MAAA,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,mCAAI,IAAI,CAAC;QACjD,OAAO,GAAG,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;QACjE,OAAO,IAAA,kBAAS,EAAC,OAAO,EAAE,UAAU,CAAC,CAAC;IACxC,CAAC,CAAC;IAEF,MAAM,IAAI,GAA6B,CAAC,QAAQ,EAAE,EAAE,UAAU,EAAE,GAAG,EAAE,EAAE,EAAE;;QACvE,IAAI,OAAO,GAAG,MAAA,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,mCAAI,IAAI,CAAC;QACjD,OAAO,GAAG,OAAO,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;QACjE,OAAO,IAAA,kBAAS,EAAC,OAAO,EAAE,UAAU,CAAC,CAAC;IACxC,CA
AC,CAAC;IAEF,MAAM,IAAI,GAA6B,CAAC,EAAE,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE;QACrF,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,EAAE,UAAU,EAAE,CAAC,CAAC;QACzC,OAAO,IAAA,eAAS,EAAC,GAAG,EAAE,EAAE,aAAa,EAAE,OAAO,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC;IAEF,MAAM,GAAG,GAA4B,CAAC,EAAE,UAAU,EAAE,aAAa,EAAE,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE;QACnF,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,EAAE,EAAE,UAAU,EAAE,CAAC,CAAC;QACxC,OAAO,IAAA,eAAS,EAAC,GAAG,EAAE,EAAE,aAAa,EAAE,OAAO,EAAE,CAAC,CAAC;IACpD,CAAC,CAAC;IAEF,MAAM,QAAQ,GAAiC,GAAG,EAAE;QAClD,wEAAwE;QACxE,MAAM,GAAG,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC;QAC7B,OAAO,OAAO,GAAG,KAAK,QAAQ,CAAC,CAAC,CAAC,GAAG,CAAC,iBAAiB,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;IACjE,CAAC,CAAC;IAEF,MAAM,GAAG,GAA4B,GAAG,EAAE;QACxC,OAAO,MAAM,aAAN,MAAM,cAAN,MAAM,GAAI,IAAI,CAAC;IACxB,CAAC,CAAC;IAEF,MAAM,GAAG,GAA4B,CAAO,KAAwB,EAAE,EAAE;QACtE,OAAO,KAAK,CAAC,WAAW,CAAC,CAAC;IAC5B,CAAC,CAAC;IAEF,uBAAuB;IACvB,kBAAkB;IAClB,uBAAuB;IAEvB,MAAM,OAAO,GAAgC,CAAsC,QAAQ,EAAE,EAAE;QAC7F,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,KAAK,EAAY,CAAC;QAC9D,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC;YAAE,OAAO,IAAI,CAAC;QAClC,OAAO,IAAA,qBAAa,EAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IACzC,CAAC,CAAC;IAEF,MAAM,QAAQ,GAAiC,CAC7C,QAAQ,EACR,EAAE;QACF,MAAM,SAAS,GAAG,IAAA,6BAAqB,EAAC,WAAW,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAa,CAAC;QAChF,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,IAAA,qBAAa,EAAC,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC;IAC1D,CAAC,CAAC;IAEF,MAAM,MAAM,GAA+B,GAAwC,EAAE;QACnF,MAAM,QAAQ,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,KAAK,EAAY,CAAC;QACxD,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC;YAAE,OAAO,IAAI,CAAC;QAClC,OAAO,IAAA,qBAAa,EAAC,QAAQ,EAAE,MAAM,CAAC,CAAC;IACzC,CAAC,CAAC;IAEF,MAAM,QAAQ,GAAiC,GAAwC,EAAE;QACvF,MAAM,QAAQ,GAAG,IAAA,6BAAqB,EAAC,WAAW,CAAC,QAAQ,EAAE,CAAa,CAAC;QAC3E,OAAO,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,IAAA,qBAAa,EAAC,EAAE,EAAE,MAAM,CAAC,CAAC,CAAC;IACzD,CAAC,CAAC;IAEF,MAAM,IAAI,GAA6B,GAAwC,EAAE;;QAC/E,MAAM,MAAM,GAAG,MAAA,WAAW,CAAC,KAAK,0CAAE,KAAK,EAAmB,CAAC;QAC3D,IAAI,CAAC,CAAA,MAAM,aAAN,MAAM,uBAAN,MAAM,CAAE,GAAG,CAAC,CAAC,CA
AC,CAAA;YAAE,OAAO,IAAI,CAAC;QACjC,OAAO,IAAA,qBAAa,EAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACvC,CAAC,CAAC;IAEF,MAAM,MAAM,GAA+B,GAAG,EAAE;QAC9C,WAAW,CAAC,MAAM,EAAE,CAAC;IACvB,CAAC,CAAC;IAEF,OAAO;QACL,IAAI,EAAE,WAAW;QAEjB,IAAI;QACJ,WAAW;QACX,WAAW;QACX,YAAY;QACZ,IAAI;QACJ,IAAI;QACJ,IAAI;QACJ,GAAG;QACH,QAAQ;QACR,GAAG;QACH,GAAG;QAEH,OAAO;QACP,QAAQ;QACR,MAAM;QACN,QAAQ;QACR,IAAI;QACJ,MAAM;KACoB,CAAC;AAC/B,CAAC,CAAC;AA5HW,QAAA,aAAa,iBA4HxB","sourcesContent":["import { load as loadCheerio, AnyNode, Cheerio } from 'cheerio';\n\nimport { StrAsNumOptions, strAsNumber, strOrNull } from '../utils/format';\nimport { FormatUrlOptions, formatUrl } from '../utils/url';\n\n/**\n * Common interface for work working with DOM despite different environments.\n *\n * Consider these environments:\n * 1) Browser (via Playwright & Chromium) - uses Browser API to work with DOM\n * 2) Cheerio - uses own API to work with DOM\n *\n * This common interfaces makes the scraping code more portable between the two.\n */\nexport interface DOMLib<El extends BaseEl, BaseEl> {\n node: El | null;\n\n ///////////////////////\n // SCALAR OPERATIONS\n ///////////////////////\n\n /** Get element's text (trimmed) */\n text: (options?: { allowEmpty?: boolean }) => string | null;\n /** Get element's text as uppercase (trimmed) */\n textAsUpper: (options?: { allowEmpty?: boolean }) => string | null;\n /** Get element's text as lowercase (trimmed) */\n textAsLower: (options?: { allowEmpty?: boolean }) => string | null;\n /** Get element's text and convert it to number */\n textAsNumber: (options?: StrAsNumOptions) => number | null;\n /** Get element's attribute */\n attr: (attrName: string, options?: { allowEmpty?: boolean }) => string | null;\n /** Get element's property */\n prop: (propName: string, options?: { allowEmpty?: boolean }) => string | null;\n /** Get element's href */\n href: (options?: { allowEmpty?: boolean } & FormatUrlOptions) => string | null;\n /** Get element's src */\n src: (options?: { allowEmpty?: 
boolean } & FormatUrlOptions) => string | null;\n /** Get element's nodeName */\n nodeName: () => string | null;\n /** Get URL of website associated with the DOM */\n url: () => string | null;\n /** Freely modify the underlying DOM node */\n map: <TVal>(map: (node: El | null) => TVal) => TVal;\n\n ///////////////////////\n // NODE OPERATIONS\n ///////////////////////\n\n /** Get a single descendant matching the selector */\n findOne: <TNewEl extends BaseEl = El>(selector: string) => DOMLib<TNewEl, BaseEl> | null;\n /** Get all descendants matching the selector */\n findMany: <TNewEl extends BaseEl = El>(selector: string) => DOMLib<TNewEl, BaseEl>[];\n /** Get element's parent */\n parent: <TNewEl extends BaseEl = El>() => DOMLib<TNewEl, BaseEl> | null;\n /** Get element's children */\n children: <TNewEl extends BaseEl = El>() => DOMLib<TNewEl, BaseEl>[];\n /** Get remove the element */\n remove: () => void;\n /** Get root element */\n root: <TNewEl extends BaseEl = El>() => DOMLib<TNewEl, BaseEl> | null;\n}\n\nexport type BrowserDOMLib<T extends Element = Element> = DOMLib<T, Element>;\n\n/** Implementation of DOMLib in browser (using Browser API) */\nexport const browserDOMLib = <T extends Element>(node: T): BrowserDOMLib<T> => {\n ///////////////////////\n // SCALAR OPERATIONS\n ///////////////////////\n\n const text: BrowserDOMLib<T>['text'] = ({ allowEmpty } = {}) => {\n const txt = node.textContent?.trim() ?? null;\n return strOrNull(txt, allowEmpty);\n };\n\n const textAsUpper: BrowserDOMLib<T>['textAsUpper'] = (options) => {\n const txt = text(options);\n return txt ? txt.toLocaleUpperCase() : txt;\n };\n\n const textAsLower: BrowserDOMLib<T>['textAsLower'] = (options) => {\n const txt = text(options);\n return txt ? 
txt.toLocaleLowerCase() : txt;\n };\n\n const textAsNumber: BrowserDOMLib<T>['textAsNumber'] = (options) => {\n const txt = text(options);\n return strAsNumber(txt, options);\n };\n\n const prop: BrowserDOMLib<T>['prop'] = (propName, { allowEmpty } = {}) => {\n let propVal = node[propName] ?? null;\n propVal = typeof propVal === 'string' ? propVal.trim() : propVal;\n return strOrNull(propVal, allowEmpty);\n };\n\n const attr: BrowserDOMLib<T>['attr'] = (propName, { allowEmpty } = {}) => {\n let attrVal = node.getAttribute(propName) ?? null;\n attrVal = typeof attrVal === 'string' ? attrVal.trim() : attrVal;\n return strOrNull(attrVal, allowEmpty);\n };\n\n const href: BrowserDOMLib<T>['href'] = ({ allowEmpty, allowRelative, baseUrl } = {}) => {\n const val = prop('href', { allowEmpty });\n return formatUrl(val, { allowRelative, baseUrl });\n };\n\n const src: BrowserDOMLib<T>['src'] = ({ allowEmpty, allowRelative, baseUrl } = {}) => {\n const val = prop('src', { allowEmpty });\n return formatUrl(val, { allowRelative, baseUrl });\n };\n\n const nodeName: BrowserDOMLib<T>['nodeName'] = () => {\n // On UPPER- vs lower-case https://stackoverflow.com/questions/27223756/\n const val = prop('nodeName');\n return typeof val === 'string' ? val.toLocaleUpperCase() : val;\n };\n\n const url: BrowserDOMLib<T>['url'] = () => {\n const doc = node.ownerDocument;\n // See https://stackoverflow.com/a/16010322/9788634\n const urlVal = doc.defaultView?.location?.href || null;\n return urlVal;\n };\n\n const map: BrowserDOMLib<T>['map'] = <TVal>(mapFn: (node: T) => TVal) => {\n return mapFn(node);\n };\n\n ///////////////////////\n // NODE OPERATIONS\n ///////////////////////\n\n const findOne: BrowserDOMLib<T>['findOne'] = <TNewEl extends Element = T>(selector) => {\n if (![Node.ELEMENT_NODE, Node.DOCUMENT_NODE].includes(node.nodeType as any)) return null;\n const resultEl = (node.querySelector(selector) ?? null) as TNewEl | null;\n return resultEl ? 
browserDOMLib(resultEl) : null;\n };\n\n const findMany: BrowserDOMLib<T>['findMany'] = <TNewEl extends Element = T>(selector) => {\n if (![Node.ELEMENT_NODE, Node.DOCUMENT_NODE].includes(node.nodeType as any)) return [];\n const resultEls = [...node.querySelectorAll(selector)] as TNewEl[]; // prettier-ignore\n return resultEls.map((el) => browserDOMLib(el));\n };\n\n const parent: BrowserDOMLib<T>['parent'] = <TNewEl extends Element = T>() => {\n const parentEl = (node.parentNode || null) as TNewEl | null;\n return parentEl ? browserDOMLib(parentEl) : null;\n };\n\n const children: BrowserDOMLib<T>['children'] = <TNewEl extends Element = T>() => {\n const childEls = [...node.childNodes] as TNewEl[];\n return childEls.map((el) => browserDOMLib(el));\n };\n\n const root: BrowserDOMLib<T>['root'] = <TNewEl extends Element = T>() => {\n const rootEl = ((node.ownerDocument?.documentElement as any) || null) as TNewEl | null;\n return rootEl ? browserDOMLib(rootEl) : null;\n };\n\n const remove: BrowserDOMLib<T>['remove'] = () => {\n const parentEl = parent();\n parentEl?.node?.removeChild(node);\n };\n\n return {\n node,\n\n text,\n textAsLower,\n textAsUpper,\n textAsNumber,\n attr,\n prop,\n href,\n src,\n nodeName,\n url,\n map,\n\n findOne,\n findMany,\n parent,\n children,\n root,\n remove,\n } satisfies DOMLib<T, Element>;\n};\n\nexport type CheerioDOMLib<T extends Cheerio<AnyNode> = Cheerio<AnyNode>> = DOMLib<\n T,\n Cheerio<AnyNode>\n>;\n\n/**\n * Given a Cheerio selection, split it into an array of Cheerio selections,\n * where each has only one element.\n *\n * From `Cheerio[el, el, el, el]`\n *\n * To `[Cheerio[el], Cheerio[el], Cheerio[el], Cheerio[el]]`\n */\nexport const splitCheerioSelection = (cheerioSel: Cheerio<AnyNode>) => {\n return cheerioSel.toArray().map((el) => {\n const cheerioInst = loadCheerio(el);\n return cheerioInst(el);\n });\n};\n\n/** Implementation of DOMLib in Cheerio */\nexport const cheerioDOMLib = <T extends Cheerio<AnyNode>>(\n 
cheerioNode: T,\n srcUrl: string | null\n): CheerioDOMLib<T> => {\n ///////////////////////\n // SCALAR OPERATIONS\n ///////////////////////\n\n const text: CheerioDOMLib<T>['text'] = ({ allowEmpty } = {}) => {\n const txt = cheerioNode.text()?.trim() ?? null;\n return strOrNull(txt, allowEmpty);\n };\n\n const textAsUpper: CheerioDOMLib<T>['textAsUpper'] = (options) => {\n const txt = text(options);\n return txt ? txt.toLocaleUpperCase() : txt;\n };\n\n const textAsLower: CheerioDOMLib<T>['textAsLower'] = (options) => {\n const txt = text(options);\n return txt ? txt.toLocaleLowerCase() : txt;\n };\n\n const textAsNumber: CheerioDOMLib<T>['textAsNumber'] = (options) => {\n const txt = text(options);\n return strAsNumber(txt, options);\n };\n\n const attr: CheerioDOMLib<T>['attr'] = (attrName, { allowEmpty } = {}) => {\n let attrVal = cheerioNode.attr(attrName) ?? null;\n attrVal = typeof attrVal === 'string' ? attrVal.trim() : attrVal;\n return strOrNull(attrVal, allowEmpty);\n };\n\n const prop: CheerioDOMLib<T>['prop'] = (propName, { allowEmpty } = {}) => {\n let propVal = cheerioNode.prop(propName) ?? null;\n propVal = typeof propVal === 'string' ? propVal.trim() : propVal;\n return strOrNull(propVal, allowEmpty);\n };\n\n const href: CheerioDOMLib<T>['href'] = ({ allowEmpty, allowRelative, baseUrl } = {}) => {\n const val = prop('href', { allowEmpty });\n return formatUrl(val, { allowRelative, baseUrl });\n };\n\n const src: CheerioDOMLib<T>['src'] = ({ allowEmpty, allowRelative, baseUrl } = {}) => {\n const val = prop('src', { allowEmpty });\n return formatUrl(val, { allowRelative, baseUrl });\n };\n\n const nodeName: CheerioDOMLib<T>['nodeName'] = () => {\n // On UPPER- vs lower-case https://stackoverflow.com/questions/27223756/\n const val = prop('nodeName');\n return typeof val === 'string' ? val.toLocaleUpperCase() : val;\n };\n\n const url: CheerioDOMLib<T>['url'] = () => {\n return srcUrl ?? 
null;\n };\n\n const map: CheerioDOMLib<T>['map'] = <TVal>(mapFn: (node: T) => TVal) => {\n return mapFn(cheerioNode);\n };\n\n ///////////////////////\n // NODE OPERATIONS\n ///////////////////////\n\n const findOne: CheerioDOMLib<T>['findOne'] = <TNewEl extends Cheerio<AnyNode> = T>(selector) => {\n const resultEl = cheerioNode.find(selector).first() as TNewEl;\n if (!resultEl.get(0)) return null;\n return cheerioDOMLib(resultEl, srcUrl);\n };\n\n const findMany: CheerioDOMLib<T>['findMany'] = <TNewEl extends Cheerio<AnyNode> = T>(\n selector\n ) => {\n const resultEls = splitCheerioSelection(cheerioNode.find(selector)) as TNewEl[];\n return resultEls.map((ch) => cheerioDOMLib(ch, srcUrl));\n };\n\n const parent: CheerioDOMLib<T>['parent'] = <TNewEl extends Cheerio<AnyNode> = T>() => {\n const parentEl = cheerioNode.parent().first() as TNewEl;\n if (!parentEl.get(0)) return null;\n return cheerioDOMLib(parentEl, srcUrl);\n };\n\n const children: CheerioDOMLib<T>['children'] = <TNewEl extends Cheerio<AnyNode> = T>() => {\n const childEls = splitCheerioSelection(cheerioNode.children()) as TNewEl[];\n return childEls.map((ch) => cheerioDOMLib(ch, srcUrl));\n };\n\n const root: CheerioDOMLib<T>['root'] = <TNewEl extends Cheerio<AnyNode> = T>() => {\n const rootEl = cheerioNode._root?.first() as TNewEl | null;\n if (!rootEl?.get(0)) return null;\n return cheerioDOMLib(rootEl, srcUrl);\n };\n\n const remove: CheerioDOMLib<T>['remove'] = () => {\n cheerioNode.remove();\n };\n\n return {\n node: cheerioNode,\n\n text,\n textAsLower,\n textAsUpper,\n textAsNumber,\n attr,\n prop,\n href,\n src,\n nodeName,\n url,\n map,\n\n findOne,\n findMany,\n parent,\n children,\n root,\n remove,\n } satisfies CheerioDOMLib<T>;\n};\n"]}