crawlee-one 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +81 -0
  2. package/dist/cjs/cli/cli.d.ts +1 -0
  3. package/dist/cjs/cli/cli.js +61 -0
  4. package/dist/cjs/cli/cli.js.map +1 -0
  5. package/dist/cjs/cli/index.d.ts +2 -0
  6. package/dist/cjs/cli/index.js +6 -0
  7. package/dist/cjs/cli/index.js.map +1 -0
  8. package/dist/cjs/index.d.ts +24 -0
  9. package/dist/cjs/index.js +43 -0
  10. package/dist/cjs/index.js.map +1 -0
  11. package/dist/cjs/lib/actions/dom.d.ts +102 -0
  12. package/dist/cjs/lib/actions/dom.js +743 -0
  13. package/dist/cjs/lib/actions/dom.js.map +1 -0
  14. package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
  15. package/dist/cjs/lib/actions/domUtils.js +126 -0
  16. package/dist/cjs/lib/actions/domUtils.js.map +1 -0
  17. package/dist/cjs/lib/actions/page.d.ts +69 -0
  18. package/dist/cjs/lib/actions/page.js +205 -0
  19. package/dist/cjs/lib/actions/page.js.map +1 -0
  20. package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
  21. package/dist/cjs/lib/actions/scrapeListing.js +242 -0
  22. package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
  23. package/dist/cjs/lib/actor/actor.d.ts +90 -0
  24. package/dist/cjs/lib/actor/actor.js +306 -0
  25. package/dist/cjs/lib/actor/actor.js.map +1 -0
  26. package/dist/cjs/lib/actor/types.d.ts +162 -0
  27. package/dist/cjs/lib/actor/types.js +3 -0
  28. package/dist/cjs/lib/actor/types.js.map +1 -0
  29. package/dist/cjs/lib/actor.d.ts +189 -0
  30. package/dist/cjs/lib/actor.js +225 -0
  31. package/dist/cjs/lib/actor.js.map +1 -0
  32. package/dist/cjs/lib/actorSpec.d.ts +20 -0
  33. package/dist/cjs/lib/actorSpec.js +3 -0
  34. package/dist/cjs/lib/actorSpec.js.map +1 -0
  35. package/dist/cjs/lib/config.d.ts +561 -0
  36. package/dist/cjs/lib/config.js +707 -0
  37. package/dist/cjs/lib/config.js.map +1 -0
  38. package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
  39. package/dist/cjs/lib/dataset/maxCount.js +55 -0
  40. package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
  41. package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
  42. package/dist/cjs/lib/dataset/pushData.js +182 -0
  43. package/dist/cjs/lib/dataset/pushData.js.map +1 -0
  44. package/dist/cjs/lib/dataset.d.ts +98 -0
  45. package/dist/cjs/lib/dataset.js +122 -0
  46. package/dist/cjs/lib/dataset.js.map +1 -0
  47. package/dist/cjs/lib/dom.d.ts +78 -0
  48. package/dist/cjs/lib/dom.js +243 -0
  49. package/dist/cjs/lib/dom.js.map +1 -0
  50. package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
  51. package/dist/cjs/lib/error/errorHandler.js +164 -0
  52. package/dist/cjs/lib/error/errorHandler.js.map +1 -0
  53. package/dist/cjs/lib/error/sentry.d.ts +11 -0
  54. package/dist/cjs/lib/error/sentry.js +60 -0
  55. package/dist/cjs/lib/error/sentry.js.map +1 -0
  56. package/dist/cjs/lib/integrations/apify.d.ts +67 -0
  57. package/dist/cjs/lib/integrations/apify.js +106 -0
  58. package/dist/cjs/lib/integrations/apify.js.map +1 -0
  59. package/dist/cjs/lib/integrations/types.d.ts +274 -0
  60. package/dist/cjs/lib/integrations/types.js +3 -0
  61. package/dist/cjs/lib/integrations/types.js.map +1 -0
  62. package/dist/cjs/lib/io/dataset.d.ts +67 -0
  63. package/dist/cjs/lib/io/dataset.js +86 -0
  64. package/dist/cjs/lib/io/dataset.js.map +1 -0
  65. package/dist/cjs/lib/io/maxCount.d.ts +30 -0
  66. package/dist/cjs/lib/io/maxCount.js +55 -0
  67. package/dist/cjs/lib/io/maxCount.js.map +1 -0
  68. package/dist/cjs/lib/io/pushData.d.ts +124 -0
  69. package/dist/cjs/lib/io/pushData.js +193 -0
  70. package/dist/cjs/lib/io/pushData.js.map +1 -0
  71. package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
  72. package/dist/cjs/lib/io/pushRequests.js +63 -0
  73. package/dist/cjs/lib/io/pushRequests.js.map +1 -0
  74. package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
  75. package/dist/cjs/lib/io/requestQueue.js +40 -0
  76. package/dist/cjs/lib/io/requestQueue.js.map +1 -0
  77. package/dist/cjs/lib/log.d.ts +38 -0
  78. package/dist/cjs/lib/log.js +54 -0
  79. package/dist/cjs/lib/log.js.map +1 -0
  80. package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
  81. package/dist/cjs/lib/migrate/localMigrator.js +57 -0
  82. package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
  83. package/dist/cjs/lib/migrate/localState.d.ts +7 -0
  84. package/dist/cjs/lib/migrate/localState.js +43 -0
  85. package/dist/cjs/lib/migrate/localState.js.map +1 -0
  86. package/dist/cjs/lib/migrate/types.d.ts +6 -0
  87. package/dist/cjs/lib/migrate/types.js +3 -0
  88. package/dist/cjs/lib/migrate/types.js.map +1 -0
  89. package/dist/cjs/lib/readme/readme.d.ts +65 -0
  90. package/dist/cjs/lib/readme/readme.js +534 -0
  91. package/dist/cjs/lib/readme/readme.js.map +1 -0
  92. package/dist/cjs/lib/readme/types.d.ts +260 -0
  93. package/dist/cjs/lib/readme/types.js +54 -0
  94. package/dist/cjs/lib/readme/types.js.map +1 -0
  95. package/dist/cjs/lib/router.d.ts +132 -0
  96. package/dist/cjs/lib/router.js +165 -0
  97. package/dist/cjs/lib/router.js.map +1 -0
  98. package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
  99. package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
  100. package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
  101. package/dist/cjs/lib/test/actor.d.ts +21 -0
  102. package/dist/cjs/lib/test/actor.js +56 -0
  103. package/dist/cjs/lib/test/actor.js.map +1 -0
  104. package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
  105. package/dist/cjs/lib/test/mockApifyClient.js +176 -0
  106. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
  107. package/dist/cjs/types.d.ts +31 -0
  108. package/dist/cjs/types.js +3 -0
  109. package/dist/cjs/types.js.map +1 -0
  110. package/dist/cjs/utils/async.d.ts +19 -0
  111. package/dist/cjs/utils/async.js +74 -0
  112. package/dist/cjs/utils/async.js.map +1 -0
  113. package/dist/cjs/utils/error.d.ts +1 -0
  114. package/dist/cjs/utils/error.js +10 -0
  115. package/dist/cjs/utils/error.js.map +1 -0
  116. package/dist/cjs/utils/format.d.ts +9 -0
  117. package/dist/cjs/utils/format.js +19 -0
  118. package/dist/cjs/utils/format.js.map +1 -0
  119. package/dist/cjs/utils/package.d.ts +15 -0
  120. package/dist/cjs/utils/package.js +25 -0
  121. package/dist/cjs/utils/package.js.map +1 -0
  122. package/dist/cjs/utils/types.d.ts +6 -0
  123. package/dist/cjs/utils/types.js +9 -0
  124. package/dist/cjs/utils/types.js.map +1 -0
  125. package/dist/cjs/utils/url.d.ts +9 -0
  126. package/dist/cjs/utils/url.js +32 -0
  127. package/dist/cjs/utils/url.js.map +1 -0
  128. package/dist/cjs/utils/valueMonitor.d.ts +31 -0
  129. package/dist/cjs/utils/valueMonitor.js +91 -0
  130. package/dist/cjs/utils/valueMonitor.js.map +1 -0
  131. package/package.json +85 -0
@@ -0,0 +1,124 @@
1
+ import type { CrawlingContext } from 'crawlee';
2
+ import type { CrawleeOneIO } from '../integrations/types';
3
+ /** Function that generates a "redacted" version of a value */
4
+ export type PrivateValueGen<V, K, O> = (val: V, key: K, obj: O) => any;
5
+ /**
6
+ * Given a property value (and its position) this function
7
+ * determines if the property is considered private (and
8
+ * hence should be hidden for privacy reasons).
9
+ *
10
+ * Property is private if the function returns truthy value.
11
+ */
12
+ export type PrivacyFilter<V, K, O> = (val: V, key: K, obj: O, options?: {
13
+ setCustomPrivateValue: (val: V) => any;
14
+ privateValueGen: PrivateValueGen<V, K, O>;
15
+ }) => any;
16
+ /**
17
+ * PrivacyMask determines which (potentially nested) properties
18
+ * of an object are considered private.
19
+ *
20
+ * PrivacyMask copies the structure of another object, but each
21
+ * non-object property on PrivacyMask is a PrivacyFilter - function
22
+ * that determines if the property is considered private.
23
+ *
24
+ * Property is private if the function returns truthy value.
25
+ */
26
+ export type PrivacyMask<T extends object> = {
27
+ [Key in keyof T]?: T[Key] extends Date | any[] ? PrivacyFilter<T[Key], Key, T> : T[Key] extends object ? PrivacyMask<T[Key]> : PrivacyFilter<T[Key], Key, T>;
28
+ };
29
+ export interface PushDataOptions<T extends object> {
30
+ io?: CrawleeOneIO<any, any>;
31
+ /**
32
+ * If set, only at most this many entries will be scraped.
33
+ *
34
+ * The count is determined from the Dataset that's used for the crawler run.
35
+ *
36
+ * This means that if `maxCount` is set to 50, but the
37
+ * associated Dataset already has 40 items in it, then only 10 new entries
38
+ * will be saved.
39
+ */
40
+ maxCount?: number;
41
+ /**
42
+ * Whether items should be enriched with request and run metadata.
43
+ *
44
+ * If truthy, the metadata is set under the `metadata` property.
45
+ */
46
+ includeMetadata?: boolean;
47
+ /**
48
+ * Whether properties that are considered personal data should be shown as is.
49
+ *
50
+ * If falsy or not set, these properties are redacted to hide the actual information.
51
+ *
52
+ * Which properties are personal data is determined by `privacyMask`.
53
+ */
54
+ showPrivate?: boolean;
55
+ /**
56
+ * Determine which properties are considered personal data.
57
+ *
58
+ * See {@link PrivacyMask}.
59
+ **/
60
+ privacyMask: PrivacyMask<T>;
61
+ /**
62
+ * Option to select which keys (fields) of an entry to keep (discarding the rest)
63
+ * before pushing the entries to the dataset.
64
+ *
65
+ * This serves mainly to allow users to select the keys from actor input UI.
66
+ *
67
+ * This is done after `remapKeys`, so the keys listed here must use the remapped names.
68
+ *
69
+ * Keys can be nested, e.g. `"someProp.value[0]"`. Nested path is
70
+ * resolved using Lodash.get().
71
+ */
72
+ pickKeys?: string[];
73
+ /**
74
+ * Option to remap the keys before pushing the entries to the dataset.
75
+ *
76
+ * This serves mainly to allow users to remap the keys from actor input UI.
77
+ *
78
+ * Keys can be nested, e.g. `"someProp.value[0]"`. Nested path is
79
+ * resolved using Lodash.get().
80
+ */
81
+ remapKeys?: Record<string, string>;
82
+ /**
83
+ * Option to freely transform an entry before pushing it to the dataset.
84
+ *
85
+ * This serves mainly to allow users to transform the entries from actor input UI.
86
+ */
87
+ transform?: (item: any) => any;
88
+ /**
89
+ * Option to filter an entry before pushing it to the dataset.
90
+ *
91
+ * This serves mainly to allow users to filter the entries from actor input UI.
92
+ */
93
+ filter?: (item: any) => any;
94
+ /** ID or name of the dataset to which the data should be pushed */
95
+ datasetId?: string;
96
+ /** ID of the RequestQueue that stores remaining requests */
97
+ requestQueueId?: string;
98
+ /** ID or name of the key-value store used as cache */
99
+ cacheStoreId?: string;
100
+ /** Define fields that uniquely identify entries for caching */
101
+ cachePrimaryKeys?: string[];
102
+ /** Define whether we want to add, remove, or overwrite cached entries with results from the actor run */
103
+ cacheActionOnResult?: 'add' | 'remove' | 'overwrite' | null;
104
+ }
105
+ /**
106
+ * Serialize dataset item to fixed-length hash.
107
+ *
108
+ * NOTE: Apify (around which this lib is designed) allows the key-value store key
109
+ * to be max 256 char long.
110
+ * https://docs.apify.com/sdk/js/reference/class/KeyValueStore#setValue
111
+ */
112
+ export declare const itemCacheKey: (item: any, primaryKeys?: string[]) => string;
113
+ /**
114
+ * Apify's `Actor.pushData` with extra features:
115
+ *
116
+ * - Data can be sent elsewhere, not just to Apify. This is set by the `io` option. By default data is sent using Apify (cloud/local).
117
+ * - Limit the max size of the Dataset. No entries are added when Dataset is at or above the limit.
118
+ * - Redact "private" fields
119
+ * - Add metadata to entries before they are pushed to dataset.
120
+ * - Select and rename (nested) properties
121
+ * - Transform and filter entries. Entries that did not pass the filter are not added to the dataset.
122
+ * - Add/remove entries to/from KeyValueStore. Entries are saved to the store by hash generated from entry fields set by `cachePrimaryKeys`.
123
+ */
124
+ export declare const pushData: <Ctx extends CrawlingContext<unknown, import("crawlee").Dictionary>, T extends Record<any, any> = Record<any, any>>(oneOrManyItems: T | T[], ctx: Ctx, options: PushDataOptions<T>) => Promise<unknown[]>;
@@ -0,0 +1,193 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.pushData = exports.itemCacheKey = void 0;
13
+ const lodash_1 = require("lodash");
14
+ const async_1 = require("../../utils/async");
15
+ const apify_1 = require("../integrations/apify");
16
+ const dataset_1 = require("./dataset");
17
/**
 * Build a mapper that attaches entry metadata to dataset items.
 *
 * The metadata is generated once, by awaiting `io.generateEntryMetadata(ctx)`,
 * and the returned mapper reuses that same value for every item it is given.
 * `io` falls back to `apifyIO` when `options` or `options.io` is not provided.
 *
 * Resolves to a function `(item) => ({ ...item, metadata })`.
 */
const createMetadataMapper = (ctx, options) => __awaiter(void 0, void 0, void 0, function* () {
    const opts = options == null ? {} : options;
    const givenIo = opts.io;
    const io = givenIo === undefined ? apify_1.apifyIO : givenIo;
    const metadata = yield io.generateEntryMetadata(ctx);
    // Shallow copy, so the incoming item is left untouched
    return (item) => Object.assign({}, item, { metadata });
});
23
/**
 * Create a copy of `item` in which private properties are replaced with
 * redacted placeholders.
 *
 * `options.privacyMask` mirrors the shape of `item`. For every value that is
 * NOT a plain nested object (primitives, `null`, `Date`s and arrays) the mask
 * entry under the same key is called as
 * `filter(val, key, item, { setCustomPrivateValue, privateValueGen })`
 * and a truthy result marks the property as private. Nested objects are
 * processed recursively against the nested mask stored under the same key.
 *
 * The output value is chosen in this order:
 * 1. `options.showPrivate` is truthy - the original value is kept.
 * 2. The filter called `setCustomPrivateValue(x)` - `x` is used.
 * 3. The filter returned a truthy value - `options.privateValueGen(val, key, item)`
 *    is used (defaults to `<Redacted property "KEY">`).
 * 4. Otherwise the original value is kept.
 *
 * Note that the filter is invoked even when `showPrivate` is truthy.
 *
 * Mask entries that are not functions are ignored for non-object values. This
 * happens e.g. when a property that normally holds a nested object (and hence
 * has a nested mask) is `null` on a particular item. A missing `privacyMask`
 * is treated as an empty mask.
 *
 * The input `item` is not modified; nested objects are copied too, while
 * arrays and `Date`s are carried over by reference.
 */
const applyPrivacyMask = (item, options) => {
    const { showPrivate, privateValueGen = (_, key) => `<Redacted property "${key}">` } = options;
    // Tolerate a missing mask instead of failing on the first property lookup
    const privacyMask = options.privacyMask != null ? options.privacyMask : {};
    const resolvePrivateValue = (key, val) => {
        // Allow to set custom "redacted" value by calling
        // `setCustomPrivateValue` from inside the filter function.
        let customPrivateValue;
        let setCustomPrivateValueCalled = false;
        const setCustomPrivateValue = (customVal) => {
            customPrivateValue = customVal;
            setCustomPrivateValueCalled = true;
        };
        const privacyFilter = privacyMask[key];
        // Only functions can act as filters. Anything else (e.g. a nested mask
        // whose value happens to be `null` on this item) means "not private".
        const isPrivate = typeof privacyFilter === 'function'
            ? privacyFilter(val, key, item, { setCustomPrivateValue, privateValueGen })
            : false;
        // Don't redact anything if we're asked to show private data
        if (showPrivate)
            return val;
        // Otherwise, if custom value was given, use that
        if (setCustomPrivateValueCalled)
            return customPrivateValue;
        // Otherwise, decide based on filter truthiness
        return isPrivate ? privateValueGen(val, key, item) : val;
    };
    return Object.entries(item).reduce((agg, [key, val]) => {
        const isNestedObj = typeof val === 'object' && val != null && !(val instanceof Date) && !Array.isArray(val);
        if (isNestedObj) {
            // Recursively process nested objects
            const nestedMask = privacyMask[key];
            agg[key] = applyPrivacyMask(val, {
                showPrivate,
                privacyMask: nestedMask != null ? nestedMask : {},
                privateValueGen,
            });
        }
        else {
            agg[key] = resolvePrivateValue(key, val);
        }
        return agg;
    }, {});
};
67
/**
 * Move values of `item` from old paths to new paths, mutating `item` itself.
 *
 * `keyNameMap` maps an old path to a new path; the paths are resolved with
 * Lodash `get` / `set` / `unset`, so they may be nested
 * (e.g. `"someProp.value[0]"`). Pairs whose old and new path are equal are
 * skipped. A missing `keyNameMap` leaves `item` unchanged.
 *
 * Returns the same `item` instance.
 */
const renameKeys = (item, keyNameMap) => {
    const renames = Object.entries(keyNameMap || {});
    for (const [fromPath, toPath] of renames) {
        if (fromPath !== toPath) {
            // Write to the new location first, then drop the old one
            const moved = (0, lodash_1.get)(item, fromPath);
            (0, lodash_1.set)(item, toPath, moved);
            (0, lodash_1.unset)(item, fromPath);
        }
    }
    return item;
};
78
/** Create a new object with the same own enumerable entries as `obj`, inserted in sorted key order */
const sortObjectKeys = (obj) => {
    const sortedKeys = (0, lodash_1.sortBy)(Object.keys(obj));
    const pairs = sortedKeys.map((key) => [key, obj[key]]);
    return (0, lodash_1.fromPairs)(pairs);
};
79
/**
 * Serialize dataset item to a short numeric hash (a decimal string of at most
 * 16 digits, as produced by `cyrb53`).
 *
 * NOTE: Apify (around which this lib is designed) allows the key-value store key
 * to be max 256 char long.
 * https://docs.apify.com/sdk/js/reference/class/KeyValueStore#setValue
 *
 * How the item is serialized before hashing:
 * - If `primaryKeys` contains at least one non-blank key, the keys are trimmed,
 *   de-duplicated and sorted, and the corresponding top-level values of `item`
 *   are joined with `:`.
 * - Otherwise the whole item is serialized with `JSON.stringify` (plain objects
 *   have their top-level keys sorted first, so key order does not matter).
 *
 * @param item Dataset entry to compute the cache key for.
 * @param primaryKeys Optional names of the fields that identify the entry.
 * @returns The hash as a decimal string.
 */
const itemCacheKey = (item, primaryKeys) => {
    const usableKeys = primaryKeys
        ? (0, lodash_1.sortBy)((0, lodash_1.uniq)(primaryKeys.map((s) => s === null || s === void 0 ? void 0 : s.trim()).filter(Boolean)))
        : [];
    let serializedItem;
    if (usableKeys.length > 0) {
        serializedItem = usableKeys.map((k) => item === null || item === void 0 ? void 0 : item[k]).join(':');
    }
    else {
        // No usable primary keys (none given, or only blank ones). Hash the whole
        // item, otherwise every item would serialize to "" and share one cache key.
        const json = item && (0, lodash_1.isPlainObject)(item)
            ? JSON.stringify(sortObjectKeys(item)) // If possible sort the object's keys
            : JSON.stringify(item);
        // JSON.stringify returns `undefined` for `undefined`, functions and symbols
        serializedItem = json === undefined ? String(item) : json;
    }
    return cyrb53(serializedItem).toString();
};
exports.itemCacheKey = itemCacheKey;
99
/**
 * Hashing function used when calculating cache ID hash from entries.
 *
 * Folds the UTF-16 code units of `str` into two 32-bit states and combines
 * them into a single non-negative integer below 2^53.
 *
 * See https://stackoverflow.com/a/52171480/9788634.
 *
 * @param str String to hash.
 * @param seed Optional integer mixed into both initial states (defaults to 0).
 * @returns The hash as a number.
 */
const cyrb53 = (str, seed = 0) => {
    let lo = 0xdeadbeef ^ seed;
    let hi = 0x41c6ce57 ^ seed;
    let idx = 0;
    while (idx < str.length) {
        const code = str.charCodeAt(idx);
        lo = Math.imul(lo ^ code, 2654435761);
        hi = Math.imul(hi ^ code, 1597334677);
        idx += 1;
    }
    // Final mixing; note that the second line relies on the already updated `lo`
    lo = Math.imul(lo ^ (lo >>> 16), 2246822507) ^ Math.imul(hi ^ (hi >>> 13), 3266489909);
    hi = Math.imul(hi ^ (hi >>> 16), 2246822507) ^ Math.imul(lo ^ (lo >>> 13), 3266489909);
    // 21 bits of `hi` become the high part, 32 bits of `lo` the low part
    return (hi & 0x1fffff) * 0x100000000 + (lo >>> 0);
};
117
/**
 * Trim `entries` so that pushing them does not grow the dataset past `maxCount`.
 *
 * The current dataset size is checked through the monitor created by
 * `datasetSizeMonitor(maxCount, { datasetId, requestQueueId, io })`:
 * - If the monitor reports the dataset as full, nothing is kept and `[]` is returned.
 * - Otherwise the entries returned by the monitor's `shortenToSize(entries)` are
 *   kept (presumably the leading entries that still fit - confirm against
 *   `datasetSizeMonitor`). If that is fewer than what was given, a warning with
 *   the number of discarded entries is logged.
 *
 * @param entries Entries that are about to be pushed.
 * @param maxCount Max number of entries the dataset may hold.
 * @param options Optional `io`, `datasetId`, `requestQueueId` (forwarded to the
 *   size monitor) and `log` (used for warnings; skipped if missing).
 * @returns Promise of the entries that should be pushed.
 */
const shortenToSize = (entries, maxCount, options) => __awaiter(void 0, void 0, void 0, function* () {
    const { io, datasetId, requestQueueId, log } = options !== null && options !== void 0 ? options : {};
    const datasetName = datasetId ? `"${datasetId}"` : 'DEFAULT';
    const sizeMonitor = (0, dataset_1.datasetSizeMonitor)(maxCount, { datasetId, requestQueueId, io });
    // Ignore incoming entries if the dataset is already full
    const isDatasetFull = yield sizeMonitor.isFull();
    if (isDatasetFull) {
        log === null || log === void 0 ? void 0 : log.warning(`Dataset (${datasetName}) is already full (${maxCount} entries), ${entries.length} entries will be discarded.`);
        return [];
    }
    // Show warning when only part of the incoming data made it into the dataset.
    // The entries that still fit are kept - only the overflow is dropped.
    const slicedEntries = yield sizeMonitor.shortenToSize(entries);
    const discardedCount = entries.length - slicedEntries.length;
    if (discardedCount > 0) {
        log === null || log === void 0 ? void 0 : log.warning(`Dataset (${datasetName}) has become full (${maxCount} entries), ${discardedCount} entries will be discarded.`);
    }
    return slicedEntries;
});
135
/**
 * Apify's `Actor.pushData` with extra features:
 *
 * - Data can be sent elsewhere, not just to Apify. This is set by the `io` option. By default data is sent using Apify (cloud/local).
 * - Limit the max size of the Dataset. No entries are added when Dataset is at or above the limit.
 * - Redact "private" fields
 * - Add metadata to entries before they are pushed to dataset.
 * - Select and rename (nested) properties
 * - Transform and filter entries. Entries that did not pass the filter are not added to the dataset.
 * - Add/remove entries to/from KeyValueStore. Entries are saved to the store by hash generated from entry fields set by `cachePrimaryKeys`.
 *
 * Entries are processed one at a time, in this order: add metadata
 * (`includeMetadata`), redact private fields (`privacyMask` / `showPrivate`),
 * rename keys (`remapKeys`), pick keys (`pickKeys`), `transform`, `filter`.
 * `transform` and `filter` may return promises.
 *
 * @param oneOrManyItems A single entry or an array of entries.
 * @param ctx Crawling context. `ctx.log` is used for logging, and `ctx` is
 *   passed to `io.generateEntryMetadata` when `includeMetadata` is truthy.
 * @param options Options object (required); see `PushDataOptions`.
 * @returns Promise of the adjusted entries that were pushed to the dataset.
 */
const pushData = (oneOrManyItems, ctx, options) => __awaiter(void 0, void 0, void 0, function* () {
    const { io = apify_1.apifyIO, maxCount, includeMetadata, showPrivate, privacyMask, remapKeys, pickKeys, transform, filter, datasetId, requestQueueId, cacheStoreId, cachePrimaryKeys, cacheActionOnResult, } = options;
    const manyItems = Array.isArray(oneOrManyItems) ? oneOrManyItems : [oneOrManyItems];
    const items = maxCount != null
        ? yield shortenToSize(manyItems, maxCount, { io, datasetId, requestQueueId, log: ctx.log })
        : manyItems;
    ctx.log.debug(`Preparing to push ${items.length} entries to dataset`); // prettier-ignore
    // Generate the metadata only when it is going to be used
    const addMetadataToData = includeMetadata ? yield createMetadataMapper(ctx, { io }) : null;
    const privateValueGen = (val, key) => `<Redacted property "${key}". To include the actual value, toggle ON the input option "Include personal data">`;
    const adjustedItems = [];
    for (const item of items) {
        const itemWithMetadata = addMetadataToData ? addMetadataToData(item) : item;
        const maskedItem = applyPrivacyMask(itemWithMetadata, { showPrivate, privacyMask, privateValueGen });
        // NOTE: Keys are renamed BEFORE they are picked
        const renamedItem = remapKeys ? renameKeys(maskedItem, remapKeys) : maskedItem;
        const pickedItem = pickKeys ? (0, lodash_1.pick)(renamedItem, pickKeys) : renamedItem;
        const transformedItem = transform ? yield transform(pickedItem) : pickedItem;
        const passedFilter = filter ? yield filter(transformedItem) : true;
        if (passedFilter)
            adjustedItems.push(transformedItem);
    }
    // Push entries to primary dataset
    ctx.log.info(`Pushing ${adjustedItems.length} entries to dataset`);
    const dataset = yield io.openDataset(datasetId);
    yield dataset.pushData(adjustedItems);
    ctx.log.info(`Done pushing ${adjustedItems.length} entries to dataset`);
    // Update entries in cache
    if (cacheStoreId && cacheActionOnResult) {
        ctx.log.info(`Update ${adjustedItems.length} entries in cache`);
        const store = yield io.openKeyValueStore(cacheStoreId);
        yield (0, async_1.serialAsyncMap)(adjustedItems, (item) => __awaiter(void 0, void 0, void 0, function* () {
            const cacheId = (0, exports.itemCacheKey)(item, cachePrimaryKeys);
            if (['add', 'overwrite'].includes(cacheActionOnResult)) {
                yield store.setValue(cacheId, item);
            }
            else if (cacheActionOnResult === 'remove') {
                // The entry is "removed" by writing `null` under its key
                yield store.setValue(cacheId, null);
            }
        }));
        ctx.log.info(`Done updating ${adjustedItems.length} entries in cache`);
    }
    return adjustedItems;
});
exports.pushData = pushData;
193
+ //# sourceMappingURL=pushData.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pushData.js","sourceRoot":"","sources":["../../../../src/lib/io/pushData.ts"],"names":[],"mappings":";;;;;;;;;;;;AACA,mCAAuF;AAEvF,6CAAmD;AAEnD,iDAAmE;AACnE,uCAA+C;AAqH/C,MAAM,oBAAoB,GAAG,CAI3B,GAAQ,EACR,OAAoB,EACpB,EAAE;IACF,MAAM,EAAE,EAAE,GAAG,eAAO,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAEvC,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,qBAAqB,CAAC,GAAG,CAAC,CAAC;IACrD,MAAM,iBAAiB,GAAG,CAAmB,IAAO,EAAE,EAAE,CAAC,iCAAM,IAAI,KAAE,QAAQ,IAAG,CAAC;IACjF,OAAO,iBAAiB,CAAC;AAC3B,CAAC,CAAA,CAAC;AAEF,MAAM,gBAAgB,GAAG,CACvB,IAAO,EACP,OAIC,EACD,EAAE;IACF,MAAM,EACJ,WAAW,EACX,WAAW,EACX,eAAe,GAAG,CAAC,CAAC,EAAE,GAAG,EAAE,EAAE,CAAC,uBAAuB,GAAG,IAAI,GAC7D,GAAG,OAAO,CAAC;IAEZ,MAAM,mBAAmB,GAAG,CAAC,GAAW,EAAE,GAAQ,EAAE,EAAE;QACpD,kDAAkD;QAClD,2DAA2D;QAC3D,IAAI,kBAAkB,CAAC;QACvB,IAAI,2BAA2B,GAAG,KAAK,CAAC;QACxC,MAAM,qBAAqB,GAAG,CAAC,GAAQ,EAAE,EAAE;YACzC,kBAAkB,GAAG,GAAG,CAAC;YACzB,2BAA2B,GAAG,IAAI,CAAC;QACrC,CAAC,CAAC;QAEF,MAAM,aAAa,GAAG,WAAW,CAAC,GAAG,CAA6C,CAAC;QACnF,MAAM,SAAS,GAAG,aAAa;YAC7B,CAAC,CAAC,aAAa,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,EAAE,EAAE,qBAAqB,EAAE,eAAe,EAAE,CAAC;YAC3E,CAAC,CAAC,KAAK,CAAC;QAEV,kBAAkB;QAClB,MAAM,YAAY,GAAG;QACnB,4DAA4D;QAC5D,WAAW,CAAC,CAAC,CAAC,GAAG;YACjB,iDAAiD;YACjD,CAAC,CAAC,2BAA2B,CAAC,CAAC,CAAC,kBAAkB;gBAClD,+CAA+C;gBAC/C,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,eAAe,CAAC,GAAG,EAAE,GAAG,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,CACpD,CAAC;QACF,OAAO,YAAY,CAAC;IACtB,CAAC,CAAC;IAEF,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,GAAG,EAAE,GAAG,CAAC,EAAE,EAAE;;QAClE,MAAM,WAAW,GACf,OAAO,GAAG,KAAK,QAAQ,IAAI,GAAG,IAAI,IAAI,IAAI,CAAC,CAAC,GAAG,YAAY,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAE1F,IAAI,WAAW,EAAE;YACf,qCAAqC;YACrC,MAAM,MAAM,GAAG,gBAAgB,CAAC,GAAG,EAAE;gBACnC,WAAW;gBACX,WAAW,EAAE,CAAC,MAAA,WAAW,CAAC,GAAG,CAAC,mCAAI,EAAE,CAAQ;gBAC5C,eAAe;aAChB,CAAC,CAAC;YACH,GAAG,CAAC,GAAc,CAAC,GAAG,MAAa,CAAC;SACrC;aAAM;YACL,GAAG,CAAC,GAAc,CAAC,GAAG,mBAAmB,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC;SACrD;QACD,OAAO,GAAG,CAAC;IACb,CAAC,E
AAE,EAAO,CAAC,CAAC;IAEZ,OAAO,WAAW,CAAC;AACrB,CAAC,CAAC;AAEF,wCAAwC;AACxC,MAAM,UAAU,GAAG,CAAmB,IAAO,EAAE,UAAkC,EAAE,EAAE;IACnF,MAAM,CAAC,OAAO,CAAC,UAAU,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,EAAE,OAAO,CAAC,EAAE,EAAE;QAC9D,IAAI,OAAO,KAAK,OAAO;YAAE,OAAO;QAChC,MAAM,GAAG,GAAG,IAAA,YAAG,EAAC,IAAI,EAAE,OAAO,CAAC,CAAC;QAC/B,IAAA,YAAG,EAAC,IAAI,EAAE,OAAiB,EAAE,GAAG,CAAC,CAAC;QAClC,IAAA,cAAK,EAAC,IAAI,EAAE,OAAO,CAAC,CAAC;IACvB,CAAC,CAAC,CAAC;IACH,OAAO,IAAI,CAAC;AACd,CAAC,CAAC;AAEF,MAAM,cAAc,GAAG,CAAmB,GAAM,EAAE,EAAE,CAClD,IAAA,kBAAS,EAAC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,GAAG,EAAE,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;AAEpE;;;;;;GAMG;AACI,MAAM,YAAY,GAAG,CAAC,IAAS,EAAE,WAAsB,EAAE,EAAE;IAChE,MAAM,cAAc,GAAG,WAAW;QAChC,CAAC,CAAC,IAAA,eAAM,EAAC,IAAA,aAAI,EAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,aAAD,CAAC,uBAAD,CAAC,CAAE,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC;QACjE,CAAC,CAAC,IAAI,CAAC;IAET,MAAM,cAAc,GAAG,cAAc;QACnC,CAAC,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,aAAJ,IAAI,uBAAJ,IAAI,CAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC;QAChD,CAAC,CAAC,IAAI,IAAI,IAAA,sBAAa,EAAC,IAAI,CAAC;YAC7B,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,cAAc,CAAC,IAAI,CAAC,CAAC,CAAC,qCAAqC;YAC5E,CAAC,CAAC,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CAAC;IAEzB,MAAM,OAAO,GAAG,MAAM,CAAC,cAAc,CAAC,CAAC;IACvC,OAAO,OAAO,CAAC,QAAQ,EAAE,CAAC;AAC5B,CAAC,CAAC;AAbW,QAAA,YAAY,gBAavB;AAEF;;;;GAIG;AACH,MAAM,MAAM,GAAG,CAAC,GAAG,EAAE,IAAI,GAAG,CAAC,EAAE,EAAE;IAC/B,IAAI,EAAE,GAAG,UAAU,GAAG,IAAI,EACxB,EAAE,GAAG,UAAU,GAAG,IAAI,CAAC;IACzB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;QACvC,EAAE,GAAG,GAAG,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;QACvB,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,EAAE,UAAU,CAAC,CAAC;QACpC,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,EAAE,UAAU,CAAC,CAAC;KACrC;IACD,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,UAAU,CAAC,CAAC;IAC7C,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC,EAAE,GAAG,
CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,UAAU,CAAC,CAAC;IAC9C,EAAE,GAAG,IAAI,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,UAAU,CAAC,CAAC;IAC7C,EAAE,IAAI,IAAI,CAAC,IAAI,CAAC,EAAE,GAAG,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,UAAU,CAAC,CAAC;IAE9C,OAAO,UAAU,GAAG,CAAC,OAAO,GAAG,EAAE,CAAC,GAAG,CAAC,EAAE,KAAK,CAAC,CAAC,CAAC;AAClD,CAAC,CAAC;AAEF,MAAM,aAAa,GAAG,CACpB,OAAY,EACZ,QAAgB,EAChB,OAAsF,EACtF,EAAE;IACF,MAAM,EAAE,EAAE,EAAE,SAAS,EAAE,cAAc,EAAE,GAAG,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAC7D,MAAM,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,IAAI,SAAS,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IAE7D,MAAM,WAAW,GAAG,IAAA,4BAAkB,EAAC,QAAQ,EAAE,EAAE,SAAS,EAAE,cAAc,EAAE,EAAE,EAAE,CAAC,CAAC;IAEpF,yDAAyD;IACzD,MAAM,aAAa,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC;IACjD,IAAI,aAAa,EAAE;QACjB,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,YAAY,WAAW,sBAAsB,QAAQ,cAAc,OAAO,CAAC,MAAM,6BAA6B,CAAC,CAAC;QAC7H,OAAO,EAAE,CAAC;KACX,CAAC,kBAAkB;IAEpB,4EAA4E;IAC5E,MAAM,aAAa,GAAG,MAAM,WAAW,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAC/D,IAAI,aAAa,CAAC,MAAM,KAAK,OAAO,CAAC,MAAM,EAAE;QAC3C,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,YAAY,WAAW,sBAAsB,QAAQ,cAAc,OAAO,CAAC,MAAM,6BAA6B,CAAC,CAAC;QAC7H,OAAO,EAAE,CAAC;KACX,CAAC,kBAAkB;IAEpB,OAAO,aAAa,CAAC;AACvB,CAAC,CAAA,CAAC;AAEF;;;;;;;;;;GAUG;AACI,MAAM,QAAQ,GAAG,CAItB,cAAuB,EACvB,GAAQ,EACR,OAA2B,EAC3B,EAAE;IACF,MAAM,EACJ,EAAE,GAAG,eAAuB,EAC5B,QAAQ,EACR,eAAe,EACf,WAAW,EACX,WAAW,EACX,SAAS,EACT,QAAQ,EACR,SAAS,EACT,MAAM,EACN,SAAS,EACT,cAAc,EACd,YAAY,EACZ,gBAAgB,EAChB,mBAAmB,GACpB,GAAG,OAAO,CAAC;IAEZ,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC;IACpF,MAAM,KAAK,GACT,QAAQ,IAAI,IAAI;QACd,CAAC,CAAC,MAAM,aAAa,CAAC,SAAS,EAAE,QAAQ,EAAE,EAAE,EAAE,EAAE,SAAS,EAAE,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,CAAC;QAC3F,CAAC,CAAC,SAAS,CAAC;IAEhB,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,qBAAqB,KAAK,CAAC,MAAM,qBAAqB,CAAC,CAAC,CAAC,kBAAkB;IACzF,MAAM,iBAAiB,GAAG,MAAM,oBAAoB,CAAC,GAAG,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC;IAElE,MAAM,aAAa,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,CAAO,UAAU,EAAE,IAAI,EAAE,EAAE;
QAClE,MAAM,GAAG,GAAG,MAAM,UAAU,CAAC;QAE7B,MAAM,gBAAgB,GAAG,eAAe,CAAC,CAAC,CAAC,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QAC1E,MAAM,UAAU,GAAG,gBAAgB,CAAC,gBAAgB,EAAE;YACpD,WAAW;YACX,WAAW;YACX,eAAe,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE,CAC5B,uBAAuB,GAAG,qFAAqF;SAClH,CAAC,CAAC;QAEH,MAAM,WAAW,GAAG,SAAS,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC;QAC/E,MAAM,UAAU,GAAG,QAAQ,CAAC,CAAC,CAAC,IAAA,aAAI,EAAC,WAAW,EAAE,QAAQ,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC;QACxE,MAAM,eAAe,GAAG,SAAS,CAAC,CAAC,CAAC,MAAM,SAAS,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC;QAC7E,MAAM,YAAY,GAAG,MAAM,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QAEnE,IAAI,YAAY;YAAE,GAAG,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAE5C,OAAO,GAAG,CAAC;IACb,CAAC,CAAA,EAAE,OAAO,CAAC,OAAO,CAAC,EAAe,CAAC,CAAC,CAAC;IAErC,kCAAkC;IAClC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,WAAW,aAAa,CAAC,MAAM,qBAAqB,CAAC,CAAC;IACnE,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,WAAW,CAAC,SAAS,CAAC,CAAC;IAChD,MAAM,OAAO,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAC;IACtC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,gBAAgB,aAAa,CAAC,MAAM,qBAAqB,CAAC,CAAC;IAExE,0BAA0B;IAC1B,IAAI,YAAY,IAAI,mBAAmB,EAAE;QACvC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,aAAa,CAAC,MAAM,mBAAmB,CAAC,CAAC;QAChE,MAAM,KAAK,GAAG,MAAM,EAAE,CAAC,iBAAiB,CAAC,YAAY,CAAC,CAAC;QACvD,MAAM,IAAA,sBAAc,EAAC,aAAa,EAAE,CAAO,IAAS,EAAE,EAAE;YACtD,MAAM,OAAO,GAAG,IAAA,oBAAY,EAAC,IAAI,EAAE,gBAAgB,CAAC,CAAC;YAErD,IAAI,CAAC,KAAK,EAAE,WAAW,CAAC,CAAC,QAAQ,CAAC,mBAAmB,CAAC,EAAE;gBACtD,MAAM,KAAK,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;aACrC;iBAAM,IAAI,mBAAmB,KAAK,QAAQ,EAAE;gBAC3C,MAAM,KAAK,CAAC,QAAQ,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;aACrC;QACH,CAAC,CAAA,CAAC,CAAC;QACH,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,iBAAiB,aAAa,CAAC,MAAM,mBAAmB,CAAC,CAAC;KACxE;IAED,OAAO,aAAa,CAAC;AACvB,CAAC,CAAA,CAAC;AA9EW,QAAA,QAAQ,YA8EnB","sourcesContent":["import type { CrawlingContext, Log } from 'crawlee';\nimport { get, pick, set, unset, uniq, sortBy, isPlainObject, fromPairs } from 'lodash';\n\nimport { serialAsyncMap } from '../../utils/async';\nimport type { 
CrawleeOneIO } from '../integrations/types';\nimport { ApifyCrawleeOneIO, apifyIO } from '../integrations/apify';\nimport { datasetSizeMonitor } from './dataset';\n\n/** Functions that generates a \"redacted\" version of a value */\nexport type PrivateValueGen<V, K, O> = (val: V, key: K, obj: O) => any;\n\n/**\n * Given a property value (and its position) this function\n * determines if the property is considered private (and\n * hence should be hidden for privacy reasons).\n *\n * Property is private if the function returns truthy value.\n */\nexport type PrivacyFilter<V, K, O> = (\n val: V,\n key: K,\n obj: O,\n options?: {\n setCustomPrivateValue: (val: V) => any;\n privateValueGen: PrivateValueGen<V, K, O>;\n }\n) => any;\n\n/**\n * PrivacyMask determines which (potentally nested) properties\n * of an object are considered private.\n *\n * PrivacyMask copies the structure of another object, but each\n * non-object property on PrivacyMask is a PrivacyFilter - function\n * that determines if the property is considered private.\n *\n * Property is private if the function returns truthy value.\n */\nexport type PrivacyMask<T extends object> = {\n [Key in keyof T]?: T[Key] extends Date | any[] // Consider Data and Array as non-objects\n ? PrivacyFilter<T[Key], Key, T>\n : T[Key] extends object\n ? 
PrivacyMask<T[Key]>\n : PrivacyFilter<T[Key], Key, T>;\n};\n\nexport interface PushDataOptions<T extends object> {\n io?: CrawleeOneIO<any, any>;\n /**\n * If set, only at most this many entries will be scraped.\n *\n * The count is determined from the Dataset that's used for the crawler run.\n *\n * This means that if `maxCount` is set to 50, but the\n * associated Dataset already has 40 items in it, then only 10 new entries\n * will be saved.\n */\n maxCount?: number;\n /**\n * Whether items should be enriched with request and run metadata.\n *\n * If truthy, the metadata is set under the `metadata` property.\n */\n includeMetadata?: boolean;\n /**\n * Whether properties that are considered personal data should be shown as is.\n *\n * If falsy or not set, these properties are redacted to hide the actual information.\n *\n * Which properties are personal data is determined by `privacyMask`.\n */\n showPrivate?: boolean;\n /**\n * Determine which properties are considered personal data.\n *\n * See {@link PrivacyMask}.\n **/\n privacyMask: PrivacyMask<T>;\n /**\n * Option to select which keys (fields) of an entry to keep (discarding the rest)\n * before pushing the entries to the dataset.\n *\n * This serves mainly to allow users to select the keys from actor input UI.\n *\n * This is done before `remapKeys`.\n *\n * Keys can be nested, e.g. `\"someProp.value[0]\"`. Nested path is\n * resolved using Lodash.get().\n */\n pickKeys?: string[];\n /**\n * Option to remap the keys before pushing the entries to the dataset.\n *\n * This serves mainly to allow users to remap the keys from actor input UI.\n *\n * Keys can be nested, e.g. `\"someProp.value[0]\"`. 
Nested path is\n * resolved using Lodash.get().\n */\n remapKeys?: Record<string, string>;\n /**\n * Option to freely transform an entry before pushing it to the dataset.\n *\n * This serves mainly to allow users to transform the entries from actor input UI.\n */\n transform?: (item: any) => any;\n /**\n * Option to filter an entry before pushing it to the dataset.\n *\n * This serves mainly to allow users to filter the entries from actor input UI.\n */\n filter?: (item: any) => any;\n /** ID or name of the dataset to which the data should be pushed */\n datasetId?: string;\n /** ID of the RequestQueue that stores remaining requests */\n requestQueueId?: string;\n /** ID or name of the key-value store used as cache */\n cacheStoreId?: string;\n /** Define fields that uniquely identify entries for caching */\n cachePrimaryKeys?: string[];\n /** Define whether we want to add, remove, or overwrite cached entries with results from the actor run */\n cacheActionOnResult?: 'add' | 'remove' | 'overwrite' | null;\n}\n\nconst createMetadataMapper = async <\n Ctx extends CrawlingContext,\n TIO extends CrawleeOneIO<any, any> = ApifyCrawleeOneIO\n>(\n ctx: Ctx,\n options: { io: TIO }\n) => {\n const { io = apifyIO } = options ?? 
{};\n\n const metadata = await io.generateEntryMetadata(ctx);\n const addMetadataToData = <T extends object>(item: T) => ({ ...item, metadata });\n return addMetadataToData;\n};\n\nconst applyPrivacyMask = <T extends Record<any, any> = Record<any, any>>(\n item: T,\n options: {\n showPrivate?: boolean;\n privacyMask: PrivacyMask<T>;\n privateValueGen?: (val: any, key: string, item: T) => any;\n }\n) => {\n const {\n showPrivate,\n privacyMask,\n privateValueGen = (_, key) => `<Redacted property \"${key}\">`,\n } = options;\n\n const resolvePrivateValue = (key: string, val: any) => {\n // Allow to set custom \"redacted\" value by calling\n // `setCustomPrivateValue` from inside the filter function.\n let customPrivateValue;\n let setCustomPrivateValueCalled = false;\n const setCustomPrivateValue = (val: any) => {\n customPrivateValue = val;\n setCustomPrivateValueCalled = true;\n };\n\n const privacyFilter = privacyMask[key] as PrivacyFilter<any, any, any> | undefined;\n const isPrivate = privacyFilter\n ? privacyFilter(val, key, item, { setCustomPrivateValue, privateValueGen })\n : false;\n\n // prettier-ignore\n const privateValue = (\n // Don't redact anything if we're asked to show private data\n showPrivate ? val\n // Otherwise, if custom value was given, use that\n : setCustomPrivateValueCalled ? customPrivateValue\n // Otherwise, decide based on filter truthiness\n : isPrivate ? privateValueGen(val, key, item) : val\n );\n return privateValue;\n };\n\n const redactedObj = Object.entries(item).reduce((agg, [key, val]) => {\n const isNestedObj =\n typeof val === 'object' && val != null && !(val instanceof Date) && !Array.isArray(val);\n\n if (isNestedObj) {\n // Recursively process nested objects\n const subObj = applyPrivacyMask(val, {\n showPrivate,\n privacyMask: (privacyMask[key] ?? 
{}) as any,\n privateValueGen,\n });\n agg[key as keyof T] = subObj as any;\n } else {\n agg[key as keyof T] = resolvePrivateValue(key, val);\n }\n return agg;\n }, {} as T);\n\n return redactedObj;\n};\n\n/** Rename object properties in place */\nconst renameKeys = <T extends object>(item: T, keyNameMap: Record<string, string>) => {\n Object.entries(keyNameMap || {}).forEach(([oldPath, newPath]) => {\n if (oldPath === newPath) return;\n const val = get(item, oldPath);\n set(item, newPath as string, val);\n unset(item, oldPath);\n });\n return item;\n};\n\nconst sortObjectKeys = <T extends object>(obj: T) =>\n fromPairs(sortBy(Object.keys(obj)).map((key) => [key, obj[key]]));\n\n/**\n * Serialize dataset item to fixed-length hash.\n *\n * NOTE: Apify (around which this lib is designed) allows the key-value store key\n * to be max 256 char long.\n * https://docs.apify.com/sdk/js/reference/class/KeyValueStore#setValue\n */\nexport const itemCacheKey = (item: any, primaryKeys?: string[]) => {\n const thePrimaryKeys = primaryKeys\n ? sortBy(uniq(primaryKeys.map((s) => s?.trim()).filter(Boolean)))\n : null;\n\n const serializedItem = thePrimaryKeys\n ? thePrimaryKeys.map((k) => item?.[k]).join(':')\n : item && isPlainObject(item)\n ? 
JSON.stringify(sortObjectKeys(item)) // If possible sort the object's keys\n : JSON.stringify(item);\n\n const cacheId = cyrb53(serializedItem);\n return cacheId.toString();\n};\n\n/**\n * Hashing function used when calculating cache ID hash from entries.\n *\n * See https://stackoverflow.com/a/52171480/9788634.\n */\nconst cyrb53 = (str, seed = 0) => {\n let h1 = 0xdeadbeef ^ seed,\n h2 = 0x41c6ce57 ^ seed;\n for (let i = 0, ch; i < str.length; i++) {\n ch = str.charCodeAt(i);\n h1 = Math.imul(h1 ^ ch, 2654435761);\n h2 = Math.imul(h2 ^ ch, 1597334677);\n }\n h1 = Math.imul(h1 ^ (h1 >>> 16), 2246822507);\n h1 ^= Math.imul(h2 ^ (h2 >>> 13), 3266489909);\n h2 = Math.imul(h2 ^ (h2 >>> 16), 2246822507);\n h2 ^= Math.imul(h1 ^ (h1 >>> 13), 3266489909);\n\n return 4294967296 * (2097151 & h2) + (h1 >>> 0);\n};\n\nconst shortenToSize = async <T>(\n entries: T[],\n maxCount: number,\n options?: { io?: CrawleeOneIO; datasetId?: string; requestQueueId?: string; log: Log }\n) => {\n const { io, datasetId, requestQueueId, log } = options ?? {};\n const datasetName = datasetId ? 
`\"${datasetId}\"` : 'DEFAULT';\n\n const sizeMonitor = datasetSizeMonitor(maxCount, { datasetId, requestQueueId, io });\n\n // Ignore incoming entries if the dataset is already full\n const isDatasetFull = await sizeMonitor.isFull();\n if (isDatasetFull) {\n log?.warning(`Dataset (${datasetName}) is already full (${maxCount} entries), ${entries.length} entries will be discarded.`);\n return [];\n } // prettier-ignore\n\n // Show warning when only part of the incoming data made it into the dataset\n const slicedEntries = await sizeMonitor.shortenToSize(entries);\n if (slicedEntries.length !== entries.length) {\n log?.warning(`Dataset (${datasetName}) has become full (${maxCount} entries), ${entries.length} entries will be discarded.`);\n return [];\n } // prettier-ignore\n\n return slicedEntries;\n};\n\n/**\n * Apify's `Actor.pushData` with extra features:\n *\n * - Data can be sent elsewhere, not just to Apify. This is set by the `io` options. By default data is sent using Apify (cloud/local).\n * - Limit the max size of the Dataset. No entries are added when Dataset is at or above the limit.\n * - Redact \"private\" fields\n * - Add metadata to entries before they are pushed to dataset.\n * - Select and rename (nested) properties\n * - Transform and filter entries. Entries that did not pass the filter are not added to the dataset.\n * - Add/remove entries to/from KeyValueStore. 
Entries are saved to the store by hash generated from entry fields set by `cachePrimaryKeys`.\n */\nexport const pushData = async <\n Ctx extends CrawlingContext,\n T extends Record<any, any> = Record<any, any>\n>(\n oneOrManyItems: T | T[],\n ctx: Ctx,\n options: PushDataOptions<T>\n) => {\n const {\n io = apifyIO as CrawleeOneIO,\n maxCount,\n includeMetadata,\n showPrivate,\n privacyMask,\n remapKeys,\n pickKeys,\n transform,\n filter,\n datasetId,\n requestQueueId,\n cacheStoreId,\n cachePrimaryKeys,\n cacheActionOnResult,\n } = options;\n\n const manyItems = Array.isArray(oneOrManyItems) ? oneOrManyItems : [oneOrManyItems];\n const items =\n maxCount != null\n ? await shortenToSize(manyItems, maxCount, { io, datasetId, requestQueueId, log: ctx.log })\n : manyItems;\n\n ctx.log.debug(`Preparing to push ${items.length} entries to dataset`); // prettier-ignore\n const addMetadataToData = await createMetadataMapper(ctx, { io });\n\n const adjustedItems = await items.reduce(async (aggPromise, item) => {\n const agg = await aggPromise;\n\n const itemWithMetadata = includeMetadata ? addMetadataToData(item) : item;\n const maskedItem = applyPrivacyMask(itemWithMetadata, {\n showPrivate,\n privacyMask,\n privateValueGen: (val, key) =>\n `<Redacted property \"${key}\". To include the actual value, toggle ON the input option \"Include personal data\">`,\n });\n\n const renamedItem = remapKeys ? renameKeys(maskedItem, remapKeys) : maskedItem;\n const pickedItem = pickKeys ? pick(renamedItem, pickKeys) : renamedItem;\n const transformedItem = transform ? await transform(pickedItem) : pickedItem;\n const passedFilter = filter ? 
await filter(transformedItem) : true;\n\n if (passedFilter) agg.push(transformedItem);\n\n return agg;\n }, Promise.resolve([] as unknown[]));\n\n // Push entries to primary dataset\n ctx.log.info(`Pushing ${adjustedItems.length} entries to dataset`);\n const dataset = await io.openDataset(datasetId);\n await dataset.pushData(adjustedItems);\n ctx.log.info(`Done pushing ${adjustedItems.length} entries to dataset`);\n\n // Update entries in cache\n if (cacheStoreId && cacheActionOnResult) {\n ctx.log.info(`Update ${adjustedItems.length} entries in cache`);\n const store = await io.openKeyValueStore(cacheStoreId);\n await serialAsyncMap(adjustedItems, async (item: any) => {\n const cacheId = itemCacheKey(item, cachePrimaryKeys);\n\n if (['add', 'overwrite'].includes(cacheActionOnResult)) {\n await store.setValue(cacheId, item);\n } else if (cacheActionOnResult === 'remove') {\n await store.setValue(cacheId, null);\n }\n });\n ctx.log.info(`Done updating ${adjustedItems.length} entries in cache`);\n }\n\n return adjustedItems;\n};\n"]}
@@ -0,0 +1,38 @@
1
+ import type { CrawlingContext, Request as CrawleeRequest, RequestQueueOperationOptions } from 'crawlee';
2
+ import type { CrawleeOneIO } from '../integrations/types';
3
+ export interface PushRequestsOptions<T extends CrawleeRequest = CrawleeRequest> {
4
+ io?: CrawleeOneIO<any, any>; // Storage integration used to open the RequestQueue; `pushRequests` falls back to `apifyIO` when omitted
5
+ /**
6
+ * If set, only at most this many requests will be added to the RequestQueue.
7
+ *
8
+ * The count is determined from the RequestQueue that's used for the crawler run.
9
+ *
10
+ * This means that if `maxCount` is set to 50, but the
11
+ * associated RequestQueue already handled 40 requests, then only 10 new requests
12
+ * will be processed.
13
+ */
14
+ maxCount?: number;
15
+ /**
16
+ * Option to freely transform a request before pushing it to the RequestQueue. The result is awaited, so the function may be async.
17
+ *
18
+ * This serves mainly to allow users to transform the requests from actor input UI.
19
+ */
20
+ transform?: (req: T) => any;
21
+ /**
22
+ * Option to filter a request before pushing it to the RequestQueue. Runs on the transformed request; the request is kept only if the (awaited) result is truthy.
23
+ *
24
+ * This serves mainly to allow users to filter the requests from actor input UI.
25
+ */
26
+ filter?: (req: T) => any;
27
+ /** ID of the RequestQueue to which the requests should be pushed */
28
+ requestQueueId?: string;
29
+ queueOptions?: RequestQueueOperationOptions; // Forwarded unchanged as the second argument of the queue's `addRequests`
30
+ }
31
+ /**
32
+ * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:
33
+ *
34
+ * - Data can be sent elsewhere, not just to Apify. This is set by the `io` option. By default data is sent using Apify (cloud/local).
35
+ * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.
36
+ * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue. Resolves to the (transformed) requests that passed the filter.
37
+ */
38
+ export declare const pushRequests: <Ctx extends CrawlingContext<unknown, import("crawlee").Dictionary>, T extends CrawleeRequest<import("crawlee").Dictionary> = CrawleeRequest<import("crawlee").Dictionary>>(oneOrManyItems: T | T[], ctx: Ctx, options: PushRequestsOptions<T>) => Promise<unknown[]>;
@@ -0,0 +1,63 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Compiler-emitted helper: drives a generator function to completion and returns a promise for its result
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-promise yielded values in a promise
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.pushRequests = void 0;
13
+ const requestQueue_1 = require("./requestQueue");
14
+ const apify_1 = require("../integrations/apify");
15
/**
 * Trim `entries` so that they still fit into a RequestQueue capped at `maxCount`.
 *
 * - Resolves to an empty array when the size monitor reports the queue as already full.
 * - Otherwise resolves to whatever `sizeMonitor.shortenToSize(entries)` returns,
 *   i.e. the part of `entries` that still fits.
 *
 * A warning is logged through `options.log` (when provided) whenever some of
 * the entries have to be discarded. `options` is passed unchanged to
 * `requestQueueSizeMonitor`.
 */
const shortenToSize = (entries, maxCount, options) => __awaiter(void 0, void 0, void 0, function* () {
    const { requestQueueId, log } = options !== null && options !== void 0 ? options : {};
    const queueName = requestQueueId ? `"${requestQueueId}"` : 'DEFAULT';
    const sizeMonitor = (0, requestQueue_1.requestQueueSizeMonitor)(maxCount, options);
    // Ignore incoming entries if the queue is already full
    const isFull = yield sizeMonitor.isFull();
    if (isFull) {
        log === null || log === void 0 ? void 0 : log.warning(`RequestQueue (${queueName}) is already full (${maxCount} entries), ${entries.length} entries will be discarded.`);
        return [];
    } // prettier-ignore
    // Warn when only part of the incoming requests fits into the queue, but
    // still hand back the part that fits. Previously the whole batch was
    // dropped here and the warning reported the full batch size.
    const slicedEntries = yield sizeMonitor.shortenToSize(entries);
    const discardedCount = entries.length - slicedEntries.length;
    if (discardedCount !== 0) {
        log === null || log === void 0 ? void 0 : log.warning(`RequestQueue (${queueName}) has become full (${maxCount} entries), ${discardedCount} entries will be discarded.`);
    } // prettier-ignore
    return slicedEntries;
});
33
+ /**
34
+ * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:
35
+ *
36
+ * - Data can be sent elsewhere, not just to Apify. This is set by the `io` option. By default data is sent using Apify (cloud/local).
37
+ * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.
38
+ * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue. Resolves to the (transformed) requests that passed the filter.
39
+ */
40
+ const pushRequests = (oneOrManyItems, ctx, options) => __awaiter(void 0, void 0, void 0, function* () {
41
+ const { io = apify_1.apifyIO, maxCount, transform, filter, requestQueueId, queueOptions, } = options;
42
+ const manyItems = Array.isArray(oneOrManyItems) ? oneOrManyItems : [oneOrManyItems]; // accept a single request or an array
43
+ const items = maxCount != null // the size limit is applied before transform/filter run
44
+ ? yield shortenToSize(manyItems, maxCount, { io, requestQueueId, log: ctx.log })
45
+ : manyItems;
46
+ ctx.log.debug(`Preparing to push ${items.length} requests to queue`); // prettier-ignore
47
+ const adjustedItems = yield items.reduce((aggPromise, item) => __awaiter(void 0, void 0, void 0, function* () { // sequential: each step waits for the previous accumulator promise
48
+ const agg = yield aggPromise;
49
+ const transformedItem = transform ? yield transform(item) : item;
50
+ const passedFilter = filter ? yield filter(transformedItem) : true; // the filter sees the transformed item
51
+ if (passedFilter)
52
+ agg.push(transformedItem);
53
+ return agg;
54
+ }), Promise.resolve([]));
55
+ // Push requests to primary RequestQueue
56
+ ctx.log.info(`Pushing ${adjustedItems.length} requests to queue`);
57
+ const reqQueue = yield io.openRequestQueue(requestQueueId);
58
+ yield reqQueue.addRequests(adjustedItems, queueOptions);
59
+ ctx.log.info(`Done pushing ${adjustedItems.length} requests to queue`);
60
+ return adjustedItems;
61
+ });
62
+ exports.pushRequests = pushRequests;
63
+ //# sourceMappingURL=pushRequests.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pushRequests.js","sourceRoot":"","sources":["../../../../src/lib/io/pushRequests.ts"],"names":[],"mappings":";;;;;;;;;;;;AAOA,iDAAyD;AAEzD,iDAAgD;AAiChD,MAAM,aAAa,GAAG,CACpB,OAAY,EACZ,QAAgB,EAChB,OAAkE,EAClE,EAAE;IACF,MAAM,EAAE,cAAc,EAAE,GAAG,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAE9C,MAAM,SAAS,GAAG,cAAc,CAAC,CAAC,CAAC,IAAI,cAAc,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;IAErE,MAAM,WAAW,GAAG,IAAA,sCAAuB,EAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;IAE/D,uDAAuD;IACvD,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,MAAM,EAAE,CAAC;IAC1C,IAAI,MAAM,EAAE;QACV,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,iBAAiB,SAAS,sBAAsB,QAAQ,cAAc,OAAO,CAAC,MAAM,6BAA6B,CAAC,CAAC;QAChI,OAAO,EAAE,CAAC;KACX,CAAC,kBAAkB;IAEpB,8EAA8E;IAC9E,MAAM,aAAa,GAAG,MAAM,WAAW,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;IAC/D,IAAI,aAAa,CAAC,MAAM,KAAK,OAAO,CAAC,MAAM,EAAE;QAC3C,GAAG,aAAH,GAAG,uBAAH,GAAG,CAAE,OAAO,CAAC,iBAAiB,SAAS,sBAAsB,QAAQ,cAAc,OAAO,CAAC,MAAM,6BAA6B,CAAC,CAAC;QAChI,OAAO,EAAE,CAAC;KACX,CAAC,kBAAkB;IAEpB,OAAO,aAAa,CAAC;AACvB,CAAC,CAAA,CAAC;AAEF;;;;;;GAMG;AACI,MAAM,YAAY,GAAG,CAI1B,cAAuB,EACvB,GAAQ,EACR,OAA+B,EAC/B,EAAE;IACF,MAAM,EACJ,EAAE,GAAG,eAAuB,EAC5B,QAAQ,EACR,SAAS,EACT,MAAM,EACN,cAAc,EACd,YAAY,GACb,GAAG,OAAO,CAAC;IAEZ,MAAM,SAAS,GAAG,KAAK,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC;IACpF,MAAM,KAAK,GACT,QAAQ,IAAI,IAAI;QACd,CAAC,CAAC,MAAM,aAAa,CAAC,SAAS,EAAE,QAAQ,EAAE,EAAE,EAAE,EAAE,cAAc,EAAE,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,CAAC;QAChF,CAAC,CAAC,SAAS,CAAC;IAEhB,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,qBAAqB,KAAK,CAAC,MAAM,oBAAoB,CAAC,CAAC,CAAC,kBAAkB;IAExF,MAAM,aAAa,GAAG,MAAM,KAAK,CAAC,MAAM,CAAC,CAAO,UAAU,EAAE,IAAI,EAAE,EAAE;QAClE,MAAM,GAAG,GAAG,MAAM,UAAU,CAAC;QAE7B,MAAM,eAAe,GAAG,SAAS,CAAC,CAAC,CAAC,MAAM,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QACjE,MAAM,YAAY,GAAG,MAAM,CAAC,CAAC,CAAC,MAAM,MAAM,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QAEnE,IAAI,YAAY;YAAE,GAAG,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAE5C,OAAO,GAAG,CAAC;IACb,CAAC,CAAA,EAAE,OAAO,CAAC,OAAO,CAAC,EAAe,CAAC,CAAC,CAAC;IAErC,wCAAwC;IACxC,G
AAG,CAAC,GAAG,CAAC,IAAI,CAAC,WAAW,aAAa,CAAC,MAAM,oBAAoB,CAAC,CAAC;IAClE,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,gBAAgB,CAAC,cAAc,CAAC,CAAC;IAC3D,MAAM,QAAQ,CAAC,WAAW,CAAC,aAAsB,EAAE,YAAY,CAAC,CAAC;IACjE,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,gBAAgB,aAAa,CAAC,MAAM,oBAAoB,CAAC,CAAC;IAEvE,OAAO,aAAa,CAAC;AACvB,CAAC,CAAA,CAAC;AA3CW,QAAA,YAAY,gBA2CvB","sourcesContent":["import type {\n CrawlingContext,\n Log,\n Request as CrawleeRequest,\n RequestQueueOperationOptions,\n} from 'crawlee';\n\nimport { requestQueueSizeMonitor } from './requestQueue';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\n\nexport interface PushRequestsOptions<T extends CrawleeRequest = CrawleeRequest> {\n io?: CrawleeOneIO<any, any>;\n /**\n * If set, only at most this many requests will be added to the RequestQueue.\n *\n * The count is determined from the RequestQueue that's used for the crawler run.\n *\n * This means that if `maxCount` is set to 50, but the\n * associated RequestQueue already handled 40 requests, then only 10 new requests\n * will be processed.\n */\n maxCount?: number;\n /**\n * Option to freely transform a request before pushing it to the RequestQueue.\n *\n * This serves mainly to allow users to transform the requests from actor input UI.\n */\n transform?: (req: T) => any;\n /**\n * Option to filter a request before pushing it to the RequestQueue.\n *\n * This serves mainly to allow users to filter the requests from actor input UI.\n */\n filter?: (req: T) => any;\n /** ID of the RequestQueue to which the data should be pushed */\n requestQueueId?: string;\n\n // Pass-through options\n queueOptions?: RequestQueueOperationOptions;\n}\n\nconst shortenToSize = async <T>(\n entries: T[],\n maxCount: number,\n options?: { io?: CrawleeOneIO; requestQueueId?: string; log: Log }\n) => {\n const { requestQueueId, log } = options ?? {};\n\n const queueName = requestQueueId ? 
`\"${requestQueueId}\"` : 'DEFAULT';\n\n const sizeMonitor = requestQueueSizeMonitor(maxCount, options);\n\n // Ignore incoming entries if the queue is already full\n const isFull = await sizeMonitor.isFull();\n if (isFull) {\n log?.warning(`RequestQueue (${queueName}) is already full (${maxCount} entries), ${entries.length} entries will be discarded.`);\n return [];\n } // prettier-ignore\n\n // Show warning when only part of the incoming requests made it into the queue\n const slicedEntries = await sizeMonitor.shortenToSize(entries);\n if (slicedEntries.length !== entries.length) {\n log?.warning(`RequestQueue (${queueName}) has become full (${maxCount} entries), ${entries.length} entries will be discarded.`);\n return [];\n } // prettier-ignore\n\n return slicedEntries;\n};\n\n/**\n * Similar to `Actor.openRequestQueue().addRequests`, but with extra features:\n *\n * - Data can be sent elsewhere, not just to Apify. This is set by the `io` options. By default data is sent using Apify (cloud/local).\n * - Limit the max size of the RequestQueue. No requests are added when RequestQueue is at or above the limit.\n * - Transform and filter requests. Requests that did not pass the filter are not added to the RequestQueue.\n */\nexport const pushRequests = async <\n Ctx extends CrawlingContext,\n T extends CrawleeRequest = CrawleeRequest\n>(\n oneOrManyItems: T | T[],\n ctx: Ctx,\n options: PushRequestsOptions<T>\n) => {\n const {\n io = apifyIO as CrawleeOneIO,\n maxCount,\n transform,\n filter,\n requestQueueId,\n queueOptions,\n } = options;\n\n const manyItems = Array.isArray(oneOrManyItems) ? oneOrManyItems : [oneOrManyItems];\n const items =\n maxCount != null\n ? 
await shortenToSize(manyItems, maxCount, { io, requestQueueId, log: ctx.log })\n : manyItems;\n\n ctx.log.debug(`Preparing to push ${items.length} requests to queue`); // prettier-ignore\n\n const adjustedItems = await items.reduce(async (aggPromise, item) => {\n const agg = await aggPromise;\n\n const transformedItem = transform ? await transform(item) : item;\n const passedFilter = filter ? await filter(transformedItem) : true;\n\n if (passedFilter) agg.push(transformedItem);\n\n return agg;\n }, Promise.resolve([] as unknown[]));\n\n // Push requests to primary RequestQueue\n ctx.log.info(`Pushing ${adjustedItems.length} requests to queue`);\n const reqQueue = await io.openRequestQueue(requestQueueId);\n await reqQueue.addRequests(adjustedItems as any[], queueOptions);\n ctx.log.info(`Done pushing ${adjustedItems.length} requests to queue`);\n\n return adjustedItems;\n};\n"]}
@@ -0,0 +1,28 @@
1
+ import { type ValueMonitorOptions } from '../../utils/valueMonitor';
2
+ import type { CrawleeOneIO } from '../integrations/types';
3
+ export interface RequestQueueSizeMonitorOptions extends ValueMonitorOptions {
4
+ io?: CrawleeOneIO<any, any>; // Storage integration used to open the RequestQueue; `requestQueueSizeMonitor` falls back to `apifyIO` when omitted
5
+ /**
6
+ * ID of the RequestQueue that's monitored for size.
7
+ *
8
+ * If omitted, the default RequestQueue is used.
9
+ */
10
+ requestQueueId?: string;
11
+ }
12
+ /**
13
+ * Semi-automatic monitoring of RequestQueue size. This is used for limiting the total number of
14
+ * entries scraped per run / RequestQueue:
15
+ * - When RequestQueue reaches `maxSize`, then all remaining Requests are removed.
16
+ * - Pass an array of items to `shortenToSize` to shorten the array to the size
17
+ * that still fits the RequestQueue.
18
+ *
19
+ * By default uses Apify RequestQueue. The size is read from the queue's `handledCount()`.
20
+ */
21
+ export declare const requestQueueSizeMonitor: (maxSize: number, options?: RequestQueueSizeMonitorOptions) => {
22
+ shortenToSize: <T>(arr: T[]) => Promise<T[]>;
23
+ isFull: () => Promise<boolean>;
24
+ value: () => number | Promise<number> | null;
25
+ isStale: () => boolean;
26
+ refresh: () => Promise<number>;
27
+ onValue: (callback: import("../../utils/valueMonitor").ValueCallback<number>) => () => void;
28
+ };
@@ -0,0 +1,40 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { // Compiler-emitted helper: drives a generator function to completion and returns a promise for its result
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } // wrap non-promise yielded values in a promise
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ Object.defineProperty(exports, "__esModule", { value: true });
12
+ exports.requestQueueSizeMonitor = void 0;
13
+ const valueMonitor_1 = require("../../utils/valueMonitor");
14
+ const apify_1 = require("../integrations/apify");
15
+ /**
16
+ * Semi-automatic monitoring of RequestQueue size. This is used for limiting the total number of
17
+ * entries scraped per run / RequestQueue:
18
+ * - When RequestQueue reaches `maxSize`, then all remaining Requests are removed.
19
+ * - Pass an array of items to `shortenToSize` to shorten the array to the size
20
+ * that still fits the RequestQueue.
21
+ *
22
+ * By default uses Apify RequestQueue. The size is read from the queue's `handledCount()`.
23
+ */
24
+ const requestQueueSizeMonitor = (maxSize, options) => {
25
+ const { io = apify_1.apifyIO } = options !== null && options !== void 0 ? options : {};
26
+ const getSize = () => __awaiter(void 0, void 0, void 0, function* () {
27
+ var _a;
28
+ const reqQueue = yield io.openRequestQueue(options === null || options === void 0 ? void 0 : options.requestQueueId);
29
+ const count = (_a = (yield reqQueue.handledCount())) !== null && _a !== void 0 ? _a : 0; // a null/undefined handled count is treated as 0
30
+ return count;
31
+ });
32
+ // When we've reached the RequestQueue's max size, then remove all remaining Requests
33
+ const onMaxSizeReached = () => __awaiter(void 0, void 0, void 0, function* () {
34
+ const reqQueue = yield io.openRequestQueue(options === null || options === void 0 ? void 0 : options.requestQueueId);
35
+ yield reqQueue.drop(); // NOTE(review): this calls `drop()` on the queue itself — confirm that dropping the whole queue is intended
36
+ });
37
+ return (0, valueMonitor_1.createSizeMonitor)(maxSize, getSize, onMaxSizeReached, options); // `options` is forwarded as-is to `createSizeMonitor`
38
+ };
39
+ exports.requestQueueSizeMonitor = requestQueueSizeMonitor;
40
+ //# sourceMappingURL=requestQueue.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"requestQueue.js","sourceRoot":"","sources":["../../../../src/lib/io/requestQueue.ts"],"names":[],"mappings":";;;;;;;;;;;;AAAA,2DAAuF;AAEvF,iDAAgD;AAYhD;;;;;;;;GAQG;AACI,MAAM,uBAAuB,GAAG,CACrC,OAAe,EACf,OAAwC,EACxC,EAAE;IACF,MAAM,EAAE,EAAE,GAAG,eAAO,EAAE,GAAG,OAAO,aAAP,OAAO,cAAP,OAAO,GAAI,EAAE,CAAC;IAEvC,MAAM,OAAO,GAAG,GAAS,EAAE;;QACzB,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,gBAAgB,CAAC,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,cAAc,CAAC,CAAC;QACpE,MAAM,KAAK,GAAG,MAAA,CAAC,MAAM,QAAQ,CAAC,YAAY,EAAE,CAAC,mCAAI,CAAC,CAAC;QACnD,OAAO,KAAK,CAAC;IACf,CAAC,CAAA,CAAC;IAEF,qFAAqF;IACrF,MAAM,gBAAgB,GAAG,GAAS,EAAE;QAClC,MAAM,QAAQ,GAAG,MAAM,EAAE,CAAC,gBAAgB,CAAC,OAAO,aAAP,OAAO,uBAAP,OAAO,CAAE,cAAc,CAAC,CAAC;QACpE,MAAM,QAAQ,CAAC,IAAI,EAAE,CAAC;IACxB,CAAC,CAAA,CAAC;IAEF,OAAO,IAAA,gCAAiB,EAAC,OAAO,EAAE,OAAO,EAAE,gBAAgB,EAAE,OAAO,CAAC,CAAC;AACxE,CAAC,CAAC;AAnBW,QAAA,uBAAuB,2BAmBlC","sourcesContent":["import { createSizeMonitor, type ValueMonitorOptions } from '../../utils/valueMonitor';\nimport type { CrawleeOneIO } from '../integrations/types';\nimport { apifyIO } from '../integrations/apify';\n\nexport interface RequestQueueSizeMonitorOptions extends ValueMonitorOptions {\n io?: CrawleeOneIO<any, any>;\n /**\n * ID of the RequestQueue that's monitored for size.\n *\n * If omitted, the default RequestQueue is used.\n */\n requestQueueId?: string;\n}\n\n/**\n * Semi-automatic monitoring of RequestQueue size. This is used for limiting the total of\n * entries scraped per run / RequestQueue:\n * - When RequestQueue reaches `maxSize`, then all remaining Requests are removed.\n * - Pass an array of items to `shortenToSize` to shorten the array to the size\n * that still fits the RequestQueue.\n *\n * By default uses Apify RequestQueue.\n */\nexport const requestQueueSizeMonitor = (\n maxSize: number,\n options?: RequestQueueSizeMonitorOptions\n) => {\n const { io = apifyIO } = options ?? 
{};\n\n const getSize = async () => {\n const reqQueue = await io.openRequestQueue(options?.requestQueueId);\n const count = (await reqQueue.handledCount()) ?? 0;\n return count;\n };\n\n // When we've reached the RequestQueue's max size, then remove all remaining Requests\n const onMaxSizeReached = async () => {\n const reqQueue = await io.openRequestQueue(options?.requestQueueId);\n await reqQueue.drop();\n };\n\n return createSizeMonitor(maxSize, getSize, onMaxSizeReached, options);\n};\n"]}
@@ -0,0 +1,38 @@
1
+ import { type CrawlingContext, LogLevel as CrawleeLogLevel } from 'crawlee';
2
+ import type { ArrVal } from '../utils/types';
3
+ import type { CrawlerRouterWrapper } from './router';
4
+ export declare const LOG_LEVEL: readonly ["debug", "info", "warn", "error", "off"]; // Log level names used by `crawlee-one`
5
+ export type LogLevel = ArrVal<typeof LOG_LEVEL>; // presumably the union of the `LOG_LEVEL` entries — depends on `ArrVal`, which is declared elsewhere
6
+ /** Map log levels of `crawlee-one` to log levels of `crawlee` */
7
+ export declare const logLevelToCrawlee: Record<LogLevel, CrawleeLogLevel>;
8
+ /**
9
+ * Wrapper for Crawlee route handler that configures log level.
10
+ *
11
+ *
12
+ * Usage with Crawlee's `RouterHandler.addDefaultHandler`
13
+ * ```ts
14
+ * const wrappedHandler = logLevelHandlerWrapper('debug')(handler)
15
+ * await router.addDefaultHandler<Ctx>(wrappedHandler);
16
+ * ```
17
+ *
18
+ * Usage with Crawlee's `RouterHandler.addHandler` (takes the route label as its first argument)
19
+ * ```ts
20
+ * const wrappedHandler = logLevelHandlerWrapper('error')(handler)
21
+ * await router.addHandler<Ctx>('routeLabel', wrappedHandler);
22
+ * ```
23
+ *
24
+ * Usage with `createCrawleeOne`
25
+ * ```ts
26
+ * const actor = await createCrawleeOne<CheerioCrawlingContext>({
27
+ * validateInput,
28
+ * router: createCheerioRouter(),
29
+ * routes,
30
+ * routeHandlers: ({ input }) => createHandlers(input!),
31
+ * routerWrappers: ({ input }) => [
32
+ * logLevelHandlerWrapper<CheerioCrawlingContext<any, any>>(input?.logLevel ?? 'info'),
33
+ * ],
34
+ * createCrawler: ({ router, input }) => createCrawler({ router, input, crawlerConfig }),
35
+ * });
36
+ * ```
37
+ */
38
+ export declare const logLevelHandlerWrapper: <T extends CrawlingContext<unknown, import("crawlee").Dictionary>, RouterCtx extends Record<string, any> = Record<string, any>>(logLevel: LogLevel) => CrawlerRouterWrapper<T, RouterCtx>;