crawlee-one 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/README.md +81 -0
  2. package/dist/cjs/cli/cli.d.ts +1 -0
  3. package/dist/cjs/cli/cli.js +61 -0
  4. package/dist/cjs/cli/cli.js.map +1 -0
  5. package/dist/cjs/cli/index.d.ts +2 -0
  6. package/dist/cjs/cli/index.js +6 -0
  7. package/dist/cjs/cli/index.js.map +1 -0
  8. package/dist/cjs/index.d.ts +24 -0
  9. package/dist/cjs/index.js +43 -0
  10. package/dist/cjs/index.js.map +1 -0
  11. package/dist/cjs/lib/actions/dom.d.ts +102 -0
  12. package/dist/cjs/lib/actions/dom.js +743 -0
  13. package/dist/cjs/lib/actions/dom.js.map +1 -0
  14. package/dist/cjs/lib/actions/domUtils.d.ts +42 -0
  15. package/dist/cjs/lib/actions/domUtils.js +126 -0
  16. package/dist/cjs/lib/actions/domUtils.js.map +1 -0
  17. package/dist/cjs/lib/actions/page.d.ts +69 -0
  18. package/dist/cjs/lib/actions/page.js +205 -0
  19. package/dist/cjs/lib/actions/page.js.map +1 -0
  20. package/dist/cjs/lib/actions/scrapeListing.d.ts +78 -0
  21. package/dist/cjs/lib/actions/scrapeListing.js +242 -0
  22. package/dist/cjs/lib/actions/scrapeListing.js.map +1 -0
  23. package/dist/cjs/lib/actor/actor.d.ts +90 -0
  24. package/dist/cjs/lib/actor/actor.js +306 -0
  25. package/dist/cjs/lib/actor/actor.js.map +1 -0
  26. package/dist/cjs/lib/actor/types.d.ts +162 -0
  27. package/dist/cjs/lib/actor/types.js +3 -0
  28. package/dist/cjs/lib/actor/types.js.map +1 -0
  29. package/dist/cjs/lib/actor.d.ts +189 -0
  30. package/dist/cjs/lib/actor.js +225 -0
  31. package/dist/cjs/lib/actor.js.map +1 -0
  32. package/dist/cjs/lib/actorSpec.d.ts +20 -0
  33. package/dist/cjs/lib/actorSpec.js +3 -0
  34. package/dist/cjs/lib/actorSpec.js.map +1 -0
  35. package/dist/cjs/lib/config.d.ts +561 -0
  36. package/dist/cjs/lib/config.js +707 -0
  37. package/dist/cjs/lib/config.js.map +1 -0
  38. package/dist/cjs/lib/dataset/maxCount.d.ts +30 -0
  39. package/dist/cjs/lib/dataset/maxCount.js +55 -0
  40. package/dist/cjs/lib/dataset/maxCount.js.map +1 -0
  41. package/dist/cjs/lib/dataset/pushData.d.ts +123 -0
  42. package/dist/cjs/lib/dataset/pushData.js +182 -0
  43. package/dist/cjs/lib/dataset/pushData.js.map +1 -0
  44. package/dist/cjs/lib/dataset.d.ts +98 -0
  45. package/dist/cjs/lib/dataset.js +122 -0
  46. package/dist/cjs/lib/dataset.js.map +1 -0
  47. package/dist/cjs/lib/dom.d.ts +78 -0
  48. package/dist/cjs/lib/dom.js +243 -0
  49. package/dist/cjs/lib/dom.js.map +1 -0
  50. package/dist/cjs/lib/error/errorHandler.d.ts +112 -0
  51. package/dist/cjs/lib/error/errorHandler.js +164 -0
  52. package/dist/cjs/lib/error/errorHandler.js.map +1 -0
  53. package/dist/cjs/lib/error/sentry.d.ts +11 -0
  54. package/dist/cjs/lib/error/sentry.js +60 -0
  55. package/dist/cjs/lib/error/sentry.js.map +1 -0
  56. package/dist/cjs/lib/integrations/apify.d.ts +67 -0
  57. package/dist/cjs/lib/integrations/apify.js +106 -0
  58. package/dist/cjs/lib/integrations/apify.js.map +1 -0
  59. package/dist/cjs/lib/integrations/types.d.ts +274 -0
  60. package/dist/cjs/lib/integrations/types.js +3 -0
  61. package/dist/cjs/lib/integrations/types.js.map +1 -0
  62. package/dist/cjs/lib/io/dataset.d.ts +67 -0
  63. package/dist/cjs/lib/io/dataset.js +86 -0
  64. package/dist/cjs/lib/io/dataset.js.map +1 -0
  65. package/dist/cjs/lib/io/maxCount.d.ts +30 -0
  66. package/dist/cjs/lib/io/maxCount.js +55 -0
  67. package/dist/cjs/lib/io/maxCount.js.map +1 -0
  68. package/dist/cjs/lib/io/pushData.d.ts +124 -0
  69. package/dist/cjs/lib/io/pushData.js +193 -0
  70. package/dist/cjs/lib/io/pushData.js.map +1 -0
  71. package/dist/cjs/lib/io/pushRequests.d.ts +38 -0
  72. package/dist/cjs/lib/io/pushRequests.js +63 -0
  73. package/dist/cjs/lib/io/pushRequests.js.map +1 -0
  74. package/dist/cjs/lib/io/requestQueue.d.ts +28 -0
  75. package/dist/cjs/lib/io/requestQueue.js +40 -0
  76. package/dist/cjs/lib/io/requestQueue.js.map +1 -0
  77. package/dist/cjs/lib/log.d.ts +38 -0
  78. package/dist/cjs/lib/log.js +54 -0
  79. package/dist/cjs/lib/log.js.map +1 -0
  80. package/dist/cjs/lib/migrate/localMigrator.d.ts +10 -0
  81. package/dist/cjs/lib/migrate/localMigrator.js +57 -0
  82. package/dist/cjs/lib/migrate/localMigrator.js.map +1 -0
  83. package/dist/cjs/lib/migrate/localState.d.ts +7 -0
  84. package/dist/cjs/lib/migrate/localState.js +43 -0
  85. package/dist/cjs/lib/migrate/localState.js.map +1 -0
  86. package/dist/cjs/lib/migrate/types.d.ts +6 -0
  87. package/dist/cjs/lib/migrate/types.js +3 -0
  88. package/dist/cjs/lib/migrate/types.js.map +1 -0
  89. package/dist/cjs/lib/readme/readme.d.ts +65 -0
  90. package/dist/cjs/lib/readme/readme.js +534 -0
  91. package/dist/cjs/lib/readme/readme.js.map +1 -0
  92. package/dist/cjs/lib/readme/types.d.ts +260 -0
  93. package/dist/cjs/lib/readme/types.js +54 -0
  94. package/dist/cjs/lib/readme/types.js.map +1 -0
  95. package/dist/cjs/lib/router.d.ts +132 -0
  96. package/dist/cjs/lib/router.js +165 -0
  97. package/dist/cjs/lib/router.js.map +1 -0
  98. package/dist/cjs/lib/scraper/scrapeListing.d.ts +78 -0
  99. package/dist/cjs/lib/scraper/scrapeListing.js +242 -0
  100. package/dist/cjs/lib/scraper/scrapeListing.js.map +1 -0
  101. package/dist/cjs/lib/test/actor.d.ts +21 -0
  102. package/dist/cjs/lib/test/actor.js +56 -0
  103. package/dist/cjs/lib/test/actor.js.map +1 -0
  104. package/dist/cjs/lib/test/mockApifyClient.d.ts +32 -0
  105. package/dist/cjs/lib/test/mockApifyClient.js +176 -0
  106. package/dist/cjs/lib/test/mockApifyClient.js.map +1 -0
  107. package/dist/cjs/types.d.ts +31 -0
  108. package/dist/cjs/types.js +3 -0
  109. package/dist/cjs/types.js.map +1 -0
  110. package/dist/cjs/utils/async.d.ts +19 -0
  111. package/dist/cjs/utils/async.js +74 -0
  112. package/dist/cjs/utils/async.js.map +1 -0
  113. package/dist/cjs/utils/error.d.ts +1 -0
  114. package/dist/cjs/utils/error.js +10 -0
  115. package/dist/cjs/utils/error.js.map +1 -0
  116. package/dist/cjs/utils/format.d.ts +9 -0
  117. package/dist/cjs/utils/format.js +19 -0
  118. package/dist/cjs/utils/format.js.map +1 -0
  119. package/dist/cjs/utils/package.d.ts +15 -0
  120. package/dist/cjs/utils/package.js +25 -0
  121. package/dist/cjs/utils/package.js.map +1 -0
  122. package/dist/cjs/utils/types.d.ts +6 -0
  123. package/dist/cjs/utils/types.js +9 -0
  124. package/dist/cjs/utils/types.js.map +1 -0
  125. package/dist/cjs/utils/url.d.ts +9 -0
  126. package/dist/cjs/utils/url.js +32 -0
  127. package/dist/cjs/utils/url.js.map +1 -0
  128. package/dist/cjs/utils/valueMonitor.d.ts +31 -0
  129. package/dist/cjs/utils/valueMonitor.js +91 -0
  130. package/dist/cjs/utils/valueMonitor.js.map +1 -0
  131. package/package.json +85 -0
package/README.md ADDED
@@ -0,0 +1,81 @@
1
+ # Crawlee One
2
+
3
+ _The crawler framework you can't refuse._
4
+
5
+ Crawlee One is a framework built on top of Crawlee and Apify\* for writing robust and highly configurable web scrapers.
6
+
7
+ Crawlee One should be your choice if:
8
+
9
+ - You're developing a long-lasting integration.
10
+ - Or your scraper will be part of a data pipeline.
11
+ - Or you wish to make your scrapers available to others in your team / org, whether it's programmatically or via Apify UI.
12
+
13
+ Conversely, Crawlee One is NOT suitable for:
14
+
15
+ - People not familiar with web scraping or Apify
16
+ - One-off data extractions
17
+
18
+ [Read here](./docs/scraping-workflow-summary.md) for a recap of how Crawlee and Apify work.
19
+
20
+ ## Use cases
21
+
22
+ Web crawlers written with Crawlee One can be configured via their input to handle the following advanced use cases:
23
+
24
+ - [1. Import URLs to scrape from your database (or elsewhere)](./docs/playbook-01-import-urls.md)
25
+ - [2. Proxy: Avoid rate limiting and geo-blocking](./docs/playbook-02-proxy.md)
26
+ - [3. Simple transformations: Select and rename columns, set how many entries to scrape](./docs/playbook-03-results-mapping-simple.md)
27
+ - [4. Advanced transformations & aggregations](./docs/playbook-04-results-mapping-advanced.md)
28
+ - [5. Filtering results](./docs/playbook-05-results-filtering.md)
29
+ - [6. Deciding what URLs to scrape: Filtering and transforming requests](./docs/playbook-06-requests-mapping-filtering.md)
30
+ - [7. Caching: Extract only new or only previously-seen entries](./docs/playbook-07-caching.md)
31
+ - [8. Configure crawler settings and performance](./docs/playbook-08-settings-performance.md)
32
+ - [9. Create data pipelines from scrapers using metamorph](./docs/playbook-09-data-pipelines-metamorph.md)
33
+ - [10. Privacy compliance: Include or omit personal data](./docs/playbook-10-privacy-compliance.md)
34
+ - [11. Capture errors](./docs/playbook-11-errors.md)
35
+ - [12. Source control: Keep scraper configuration in sync](./docs/playbook-12-source-control.md)
36
+
37
+ ## Library contents
38
+
39
+ Crawlee One includes a set of utility functions for:
40
+
41
+ - Actor boilerplating
42
+ - Set crawler settings from Apify input
43
+ - Enrich data with metadata
44
+ - Configure logging level
45
+ - Routing
46
+ - Error handling
47
+ - Save errors to separate Apify dataset
48
+ - Send errors to Sentry
49
+ - Testing actors
50
+ - Manipulating DOM
51
+ - Actor migration (conceptually similar to database migration)
52
+ - CLI utility for updating actors via apify-client
53
+ - Apify's `actor.json` generation
54
+ - Privacy compliance
55
+ - Metamorphing
56
+
57
+ ## Actor Input Reference
58
+
59
+ [See here](./docs/reference-input.md) for the full list of all possible input options that a Crawlee One crawler can have.
60
+
61
+ Crawlee One allows you to configure the following via the input:
62
+
63
+ - [Programmatically-defined input](./docs/reference-input.md#programmatic-input-advanced)
64
+ - [Starting URLs](./docs/reference-input.md#starting-urls)
65
+ - [Proxy](./docs/reference-input.md#proxy)
66
+ - [Privacy & Data governance (GDPR)](./docs/reference-input.md#privacy--data-governance-gdpr)
67
+ - [Requests limit, transformation & filtering](./docs/reference-input.md#requests-limit-transformation--filtering-advanced)
68
+ - [Output size, transformation & filtering (T in ETL)](./docs/reference-input.md#output-size-transformation--filtering-t-in-etl-advanced)
69
+ - [Output Dataset & Caching (L in ETL)](./docs/reference-input.md#output-dataset--caching-l-in-etl-advanced)
70
+ - [Crawler configuration](./docs/reference-input.md#crawler-configuration-advanced)
71
+ - [Performance configuration](./docs/reference-input.md#performance-configuration-advanced)
72
+ - [Logging & Error handling](./docs/reference-input.md#logging--error-handling-advanced)
73
+ - [Integrations (Metamorphing)](./docs/reference-input.md#integrations-metamorphing-advanced)
74
+
75
+ ## Example project
76
+
77
+ // TODO
78
+
79
+ ---
80
+
81
+ \* Apify can be replaced with your own implementation, so the data can be sent elsewhere, not just to Apify. This is set by the `io` options.
@@ -0,0 +1 @@
1
+ export declare const cli: () => void;
@@ -0,0 +1,61 @@
1
+ "use strict";
2
+ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
3
+ function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
4
+ return new (P || (P = Promise))(function (resolve, reject) {
5
+ function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
6
+ function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
7
+ function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
8
+ step((generator = generator.apply(thisArg, _arguments || [])).next());
9
+ });
10
+ };
11
+ var __importDefault = (this && this.__importDefault) || function (mod) {
12
+ return (mod && mod.__esModule) ? mod : { "default": mod };
13
+ };
14
+ Object.defineProperty(exports, "__esModule", { value: true });
15
+ exports.cli = void 0;
16
+ const commander_1 = require("commander");
17
+ const path_1 = __importDefault(require("path"));
18
+ const package_1 = require("../utils/package");
19
+ const localMigrator_1 = require("../lib/migrate/localMigrator");
20
+ const pkgJson = (0, package_1.getPackageJsonInfo)(module, ['name', 'version']);
21
+ commander_1.program //
22
+ .name(pkgJson.name)
23
+ .description('CLI to run crawlee-one tools')
24
+ .version(pkgJson.version);
25
+ commander_1.program
26
+ .command('migrate')
27
+ .description('Run a migration script specified by the version number')
28
+ .requiredOption('-t --target <target-version>', 'migration version to execute, eg "v1"')
29
+ .requiredOption('-d --dir <path>', 'path to the migrations directory')
30
+ .option('--delimeter [delimeter]', 'delimeter between version and rest of file name, eg "v1_filename"')
31
+ .option('--ext --extension [ext-glob]', 'glob pattern for valid extensions for migration files, eg ".js" or ".{js,ts}"')
32
+ .addHelpText('after', `
33
+
34
+ Example call:
35
+ $ crawlee-one migrate -d ./path/to/migrations-dir -t v1`)
36
+ .action(({ dir, target, extension, delimeter }) => __awaiter(void 0, void 0, void 0, function* () {
37
+ const migrationsDir = path_1.default.resolve(process.cwd(), dir);
38
+ const { migrate } = (0, localMigrator_1.createLocalMigrator)({ migrationsDir, extension, delimeter });
39
+ yield migrate(target);
40
+ }));
41
+ commander_1.program
42
+ .command('unmigrate')
43
+ .description('Run an un-migration script specified by the version number')
44
+ .requiredOption('-t --target <target-version>', 'migration version to execute, eg "v1"')
45
+ .requiredOption('-d --dir <path>', 'path to the migrations directory')
46
+ .option('--delimeter [delimeter]', 'delimeter between version and rest of file name, eg "v1_filename"')
47
+ .option('--ext --extension [ext-glob]', 'glob pattern for valid extensions for migration files, eg ".js" or ".{js,ts}"')
48
+ .addHelpText('after', `
49
+
50
+ Example call:
51
+ $ crawlee-one unmigrate -d ./path/to/migrations-dir -t v1`)
52
+ .action(({ dir, target, extension, delimeter }) => __awaiter(void 0, void 0, void 0, function* () {
53
+ const migrationsDir = path_1.default.resolve(process.cwd(), dir);
54
+ const { unmigrate } = (0, localMigrator_1.createLocalMigrator)({ migrationsDir, extension, delimeter });
55
+ yield unmigrate(target);
56
+ }));
57
+ const cli = () => {
58
+ commander_1.program.parse();
59
+ };
60
+ exports.cli = cli;
61
+ //# sourceMappingURL=cli.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../../../src/cli/cli.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;AAAA,yCAAoC;AACpC,gDAAwB;AAExB,8CAAsD;AACtD,gEAAmE;AAEnE,MAAM,OAAO,GAAG,IAAA,4BAAkB,EAAC,MAAM,EAAE,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC,CAAC;AAEhE,mBAAO,CAAC,EAAE;KACP,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC;KAClB,WAAW,CAAC,8BAA8B,CAAC;KAC3C,OAAO,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;AAE5B,mBAAO;KACJ,OAAO,CAAC,SAAS,CAAC;KAClB,WAAW,CAAC,wDAAwD,CAAC;KACrE,cAAc,CAAC,8BAA8B,EAAE,uCAAuC,CAAC;KACvF,cAAc,CAAC,iBAAiB,EAAE,kCAAkC,CAAC;KACrE,MAAM,CACL,yBAAyB,EACzB,mEAAmE,CACpE;KACA,MAAM,CACL,8BAA8B,EAC9B,+EAA+E,CAChF;KACA,WAAW,CACV,OAAO,EACP;;;0DAGsD,CACvD;KACA,MAAM,CAAC,CAAO,EAAE,GAAG,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,EAAE,EAAE;IACtD,MAAM,aAAa,GAAG,cAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,GAAG,CAAC,CAAC;IACvD,MAAM,EAAE,OAAO,EAAE,GAAG,IAAA,mCAAmB,EAAC,EAAE,aAAa,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,CAAC;IACjF,MAAM,OAAO,CAAC,MAAM,CAAC,CAAC;AACxB,CAAC,CAAA,CAAC,CAAC;AAEL,mBAAO;KACJ,OAAO,CAAC,WAAW,CAAC;KACpB,WAAW,CAAC,4DAA4D,CAAC;KACzE,cAAc,CAAC,8BAA8B,EAAE,uCAAuC,CAAC;KACvF,cAAc,CAAC,iBAAiB,EAAE,kCAAkC,CAAC;KACrE,MAAM,CACL,yBAAyB,EACzB,mEAAmE,CACpE;KACA,MAAM,CACL,8BAA8B,EAC9B,+EAA+E,CAChF;KACA,WAAW,CACV,OAAO,EACP;;;4DAGwD,CACzD;KACA,MAAM,CAAC,CAAO,EAAE,GAAG,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,EAAE,EAAE;IACtD,MAAM,aAAa,GAAG,cAAI,CAAC,OAAO,CAAC,OAAO,CAAC,GAAG,EAAE,EAAE,GAAG,CAAC,CAAC;IACvD,MAAM,EAAE,SAAS,EAAE,GAAG,IAAA,mCAAmB,EAAC,EAAE,aAAa,EAAE,SAAS,EAAE,SAAS,EAAE,CAAC,CAAC;IACnF,MAAM,SAAS,CAAC,MAAM,CAAC,CAAC;AAC1B,CAAC,CAAA,CAAC,CAAC;AAEE,MAAM,GAAG,GAAG,GAAG,EAAE;IACtB,mBAAO,CAAC,KAAK,EAAE,CAAC;AAClB,CAAC,CAAC;AAFW,QAAA,GAAG,OAEd","sourcesContent":["import { program } from 'commander';\nimport path from 'path';\n\nimport { getPackageJsonInfo } from '../utils/package';\nimport { createLocalMigrator } from '../lib/migrate/localMigrator';\n\nconst pkgJson = getPackageJsonInfo(module, ['name', 'version']);\n\nprogram //\n .name(pkgJson.name)\n .description('CLI to run crawlee-one 
tools')\n .version(pkgJson.version);\n\nprogram\n .command('migrate')\n .description('Run a migration script specified by the version number')\n .requiredOption('-t --target <target-version>', 'migration version to execute, eg \"v1\"')\n .requiredOption('-d --dir <path>', 'path to the migrations directory')\n .option(\n '--delimeter [delimeter]',\n 'delimeter between version and rest of file name, eg \"v1_filename\"'\n )\n .option(\n '--ext --extension [ext-glob]',\n 'glob pattern for valid extensions for migration files, eg \".js\" or \".{js,ts}\"'\n )\n .addHelpText(\n 'after',\n `\n\nExample call:\n $ crawlee-one migrate -d ./path/to/migrations-dir -t v1`\n )\n .action(async ({ dir, target, extension, delimeter }) => {\n const migrationsDir = path.resolve(process.cwd(), dir);\n const { migrate } = createLocalMigrator({ migrationsDir, extension, delimeter });\n await migrate(target);\n });\n\nprogram\n .command('unmigrate')\n .description('Run an un-migration script specified by the version number')\n .requiredOption('-t --target <target-version>', 'migration version to execute, eg \"v1\"')\n .requiredOption('-d --dir <path>', 'path to the migrations directory')\n .option(\n '--delimeter [delimeter]',\n 'delimeter between version and rest of file name, eg \"v1_filename\"'\n )\n .option(\n '--ext --extension [ext-glob]',\n 'glob pattern for valid extensions for migration files, eg \".js\" or \".{js,ts}\"'\n )\n .addHelpText(\n 'after',\n `\n\nExample call:\n $ crawlee-one unmigrate -d ./path/to/migrations-dir -t v1`\n )\n .action(async ({ dir, target, extension, delimeter }) => {\n const migrationsDir = path.resolve(process.cwd(), dir);\n const { unmigrate } = createLocalMigrator({ migrationsDir, extension, delimeter });\n await unmigrate(target);\n });\n\nexport const cli = () => {\n program.parse();\n};\n"]}
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ export {};
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env node
2
+ "use strict";
3
+ Object.defineProperty(exports, "__esModule", { value: true });
4
+ const cli_1 = require("./cli");
5
+ (0, cli_1.cli)();
6
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/cli/index.ts"],"names":[],"mappings":";;;AACA,+BAA4B;AAE5B,IAAA,SAAG,GAAE,CAAC","sourcesContent":["#!/usr/bin/env node\nimport { cli } from './cli';\n\ncli();\n"]}
@@ -0,0 +1,24 @@
1
+ export { createAndRunCrawleeOne, createHttpCrawlerOptions } from './lib/actor/actor';
2
+ export * from './lib/actor/types';
3
+ export * from './lib/actorSpec';
4
+ export * from './lib/config';
5
+ export * from './lib/io/dataset';
6
+ export * from './lib/io/requestQueue';
7
+ export * from './lib/io/pushData';
8
+ export * from './lib/io/pushRequests';
9
+ export * from './lib/actions/dom';
10
+ export * from './lib/actions/domUtils';
11
+ export * from './lib/actions/page';
12
+ export * from './lib/actions/scrapeListing';
13
+ export * from './lib/error/errorHandler';
14
+ export * from './lib/error/sentry';
15
+ export * from './lib/migrate/localMigrator';
16
+ export * from './lib/migrate/localState';
17
+ export * from './lib/migrate/types';
18
+ export * from './lib/readme/readme';
19
+ export * from './lib/readme/types';
20
+ export * from './lib/router';
21
+ export * from './lib/log';
22
+ export * from './lib/test/actor';
23
+ export * from './lib/test/mockApifyClient';
24
+ export type { CrawlerUrl, CrawlerType } from './types';
@@ -0,0 +1,43 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
+ for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
+ };
16
+ Object.defineProperty(exports, "__esModule", { value: true });
17
+ exports.createHttpCrawlerOptions = exports.createAndRunCrawleeOne = void 0;
18
+ var actor_1 = require("./lib/actor/actor");
19
+ Object.defineProperty(exports, "createAndRunCrawleeOne", { enumerable: true, get: function () { return actor_1.createAndRunCrawleeOne; } });
20
+ Object.defineProperty(exports, "createHttpCrawlerOptions", { enumerable: true, get: function () { return actor_1.createHttpCrawlerOptions; } });
21
+ __exportStar(require("./lib/actor/types"), exports);
22
+ __exportStar(require("./lib/actorSpec"), exports);
23
+ __exportStar(require("./lib/config"), exports);
24
+ __exportStar(require("./lib/io/dataset"), exports);
25
+ __exportStar(require("./lib/io/requestQueue"), exports);
26
+ __exportStar(require("./lib/io/pushData"), exports);
27
+ __exportStar(require("./lib/io/pushRequests"), exports);
28
+ __exportStar(require("./lib/actions/dom"), exports);
29
+ __exportStar(require("./lib/actions/domUtils"), exports);
30
+ __exportStar(require("./lib/actions/page"), exports);
31
+ __exportStar(require("./lib/actions/scrapeListing"), exports);
32
+ __exportStar(require("./lib/error/errorHandler"), exports);
33
+ __exportStar(require("./lib/error/sentry"), exports);
34
+ __exportStar(require("./lib/migrate/localMigrator"), exports);
35
+ __exportStar(require("./lib/migrate/localState"), exports);
36
+ __exportStar(require("./lib/migrate/types"), exports);
37
+ __exportStar(require("./lib/readme/readme"), exports);
38
+ __exportStar(require("./lib/readme/types"), exports);
39
+ __exportStar(require("./lib/router"), exports);
40
+ __exportStar(require("./lib/log"), exports);
41
+ __exportStar(require("./lib/test/actor"), exports);
42
+ __exportStar(require("./lib/test/mockApifyClient"), exports);
43
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;AAAA,2CAAqF;AAA5E,+GAAA,sBAAsB,OAAA;AAAE,iHAAA,wBAAwB,OAAA;AACzD,oDAAkC;AAClC,kDAAgC;AAChC,+CAA6B;AAC7B,mDAAiC;AACjC,wDAAsC;AACtC,oDAAkC;AAClC,wDAAsC;AACtC,oDAAkC;AAClC,yDAAuC;AACvC,qDAAmC;AACnC,8DAA4C;AAC5C,2DAAyC;AACzC,qDAAmC;AACnC,8DAA4C;AAC5C,2DAAyC;AACzC,sDAAoC;AACpC,sDAAoC;AACpC,qDAAmC;AACnC,+CAA6B;AAC7B,4CAA0B;AAC1B,mDAAiC;AACjC,6DAA2C","sourcesContent":["export { createAndRunCrawleeOne, createHttpCrawlerOptions } from './lib/actor/actor';\nexport * from './lib/actor/types';\nexport * from './lib/actorSpec';\nexport * from './lib/config';\nexport * from './lib/io/dataset';\nexport * from './lib/io/requestQueue';\nexport * from './lib/io/pushData';\nexport * from './lib/io/pushRequests';\nexport * from './lib/actions/dom';\nexport * from './lib/actions/domUtils';\nexport * from './lib/actions/page';\nexport * from './lib/actions/scrapeListing';\nexport * from './lib/error/errorHandler';\nexport * from './lib/error/sentry';\nexport * from './lib/migrate/localMigrator';\nexport * from './lib/migrate/localState';\nexport * from './lib/migrate/types';\nexport * from './lib/readme/readme';\nexport * from './lib/readme/types';\nexport * from './lib/router';\nexport * from './lib/log';\nexport * from './lib/test/actor';\nexport * from './lib/test/mockApifyClient';\nexport type { CrawlerUrl, CrawlerType } from './types';\n"]}
@@ -0,0 +1,102 @@
1
+ import { AnyNode, Cheerio } from 'cheerio';
2
+ import type { ElementHandle, Locator, Page } from 'playwright';
3
+ import { StrAsNumOptions } from '../../utils/format';
4
+ import { FormatUrlOptions } from '../../utils/url';
5
+ import type { MaybeArray, MaybePromise } from '../../utils/types';
6
+ /**
7
+ * Common interface for working with DOM despite different environments.
8
+ *
9
+ * Consider these environments:
10
+ * 1) Browser (via Playwright & Chromium) - uses Browser API to work with DOM
11
+ * 2) Cheerio - uses own API to work with DOM
12
+ *
13
+ * This common interface makes the scraping code more portable between the two.
14
+ */
15
+ export interface DOMLib<El extends BaseEl, BaseEl> {
16
+ node: El | null;
17
+ /** Get element's text (trimmed) */
18
+ text: (options?: {
19
+ allowEmpty?: boolean;
20
+ }) => MaybePromise<string | null>;
21
+ /** Get element's text as uppercase (trimmed) */
22
+ textAsUpper: (options?: {
23
+ allowEmpty?: boolean;
24
+ }) => MaybePromise<string | null>;
25
+ /** Get element's text as lowercase (trimmed) */
26
+ textAsLower: (options?: {
27
+ allowEmpty?: boolean;
28
+ }) => MaybePromise<string | null>;
29
+ /** Get element's text and convert it to number */
30
+ textAsNumber: (options?: StrAsNumOptions) => MaybePromise<number | null>;
31
+ /** Get element's attribute */
32
+ attr: (attrName: string, options?: {
33
+ allowEmpty?: boolean;
34
+ }) => MaybePromise<string | null>;
35
+ /** Get element's attributes */
36
+ attrs: <T extends string>(attrNames: T[], options?: {
37
+ allowEmpty?: boolean;
38
+ }) => MaybePromise<Record<T, string | null>>;
39
+ /** Get element's property */
40
+ prop: <R = unknown>(
41
+ /** Single or nested prop path */
42
+ propName: MaybeArray<string>, options?: {
43
+ allowEmpty?: boolean;
44
+ }) => MaybePromise<R>;
45
+ /** Get element's properties */
46
+ props: <R extends any[]>(
47
+ /** List of single or nested prop paths */
48
+ propName: MaybeArray<string>[], options?: {
49
+ allowEmpty?: boolean;
50
+ }) => MaybePromise<R>;
51
+ /** Get element's href */
52
+ href: (options?: {
53
+ allowEmpty?: boolean;
54
+ } & FormatUrlOptions) => MaybePromise<string | null>;
55
+ /** Get element's src */
56
+ src: (options?: {
57
+ allowEmpty?: boolean;
58
+ } & FormatUrlOptions) => MaybePromise<string | null>;
59
+ /** Get element's nodeName */
60
+ nodeName: () => MaybePromise<string | null>;
61
+ /** Get URL of website associated with the DOM */
62
+ url: () => MaybePromise<string | null>;
63
+ /** Freely modify the underlying DOM node */
64
+ map: <TVal>(map: (node: El | null) => TVal) => MaybePromise<TVal>;
65
+ /** Get a single descendant matching the selector */
66
+ findOne: <TNewEl extends BaseEl = El>(selector: string) => MaybePromise<DOMLib<TNewEl, BaseEl> | null>;
67
+ /** Get all descendants matching the selector */
68
+ findMany: <TNewEl extends BaseEl = El>(selector: string) => MaybePromise<DOMLib<TNewEl, BaseEl>[]>;
69
+ /** Get a single ancestor (or itself) matching the selector */
70
+ closest: <TNewEl extends BaseEl = El>(selector: string) => MaybePromise<DOMLib<TNewEl, BaseEl> | null>;
71
+ /** Get element's parent */
72
+ parent: <TNewEl extends BaseEl = El>() => MaybePromise<DOMLib<TNewEl, BaseEl> | null>;
73
+ /** Get element's children */
74
+ children: <TNewEl extends BaseEl = El>() => MaybePromise<DOMLib<TNewEl, BaseEl>[]>;
75
+ /** Remove the element */
76
+ remove: () => MaybePromise<void>;
77
+ /** Get root element */
78
+ root: <TNewEl extends BaseEl = El>() => MaybePromise<DOMLib<TNewEl, BaseEl> | null>;
79
+ /**
80
+ * Given two elements, return closest ancestor element that encompasses them both,
81
+ * or `null` if none such found.
82
+ */
83
+ getCommonAncestor: <TNewEl extends BaseEl = El>(otherEl: El) => MaybePromise<DOMLib<TNewEl, BaseEl> | null>;
84
+ /**
85
+ * Given a selector, find all DOM elements that match the selector,
86
+ * and return closest ancestor element that encompasses them all,
87
+ * or `null` if none such found.
88
+ */
89
+ getCommonAncestorFromSelector: <TNewEl extends BaseEl = El>(selector: string) => MaybePromise<DOMLib<TNewEl, BaseEl> | null>;
90
+ }
91
+ export type BrowserDOMLib<T extends Element = Element> = DOMLib<T, Element>;
92
+ /** Implementation of DOMLib in browser (using Browser API) */
93
+ export declare const browserDOMLib: <El extends Element>(node: El) => BrowserDOMLib<El>;
94
+ export type CheerioDOMLib<El extends Cheerio<AnyNode> = Cheerio<AnyNode>> = DOMLib<El, Cheerio<AnyNode>>;
95
+ /** Implementation of DOMLib in Cheerio */
96
+ export declare const cheerioDOMLib: <El extends Cheerio<AnyNode>>(cheerioNode: El, srcUrl: string | null) => CheerioDOMLib<El>;
97
+ export type PlaywrightHandleDOMLib<El extends Locator | ElementHandle<Node> = Locator | ElementHandle<Node>> = DOMLib<El, Locator | ElementHandle<Node>>;
98
+ /** Implementation of DOMLib in Playwright using Handles */
99
+ export declare const playwrightHandleDOMLib: <El extends ElementHandle<Node> | Locator>(node: El, page: Page) => PlaywrightHandleDOMLib<El>;
100
+ export type PlaywrightLocatorDOMLib<El extends Locator = Locator> = DOMLib<El, Locator>;
101
+ /** Implementation of DOMLib in Playwright using Locators */
102
+ export declare const playwrightLocatorDOMLib: <El extends Locator>(node: El, page: Page) => PlaywrightLocatorDOMLib<El>;