@arcblock/crawler 1.0.6 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. package/README.md +1 -0
  2. package/lib/cjs/config.d.ts +22 -0
  3. package/{dist → lib/cjs}/config.js +9 -3
  4. package/lib/cjs/crawler.d.ts +26 -0
  5. package/{dist → lib/cjs}/crawler.js +44 -112
  6. package/lib/cjs/cron.d.ts +1 -0
  7. package/lib/cjs/cron.js +49 -0
  8. package/lib/cjs/index.d.ts +9 -0
  9. package/lib/cjs/index.js +78 -0
  10. package/{esm → lib/cjs}/puppeteer.d.ts +2 -2
  11. package/{dist → lib/cjs}/puppeteer.js +43 -54
  12. package/lib/cjs/services/snapshot.d.ts +12 -0
  13. package/lib/cjs/services/snapshot.js +84 -0
  14. package/lib/cjs/site.d.ts +2 -0
  15. package/lib/cjs/site.js +76 -0
  16. package/lib/cjs/store/index.d.ts +3 -0
  17. package/{dist/db → lib/cjs/store}/index.js +21 -5
  18. package/{dist/db → lib/cjs/store}/job.d.ts +4 -3
  19. package/lib/cjs/store/job.js +110 -0
  20. package/{dist/db → lib/cjs/store}/snapshot.d.ts +5 -6
  21. package/lib/cjs/store/snapshot.js +68 -0
  22. package/lib/cjs/utils.d.ts +32 -0
  23. package/{dist → lib/cjs}/utils.js +67 -78
  24. package/lib/esm/config.d.ts +22 -0
  25. package/{esm → lib/esm}/config.js +9 -3
  26. package/lib/esm/crawler.d.ts +26 -0
  27. package/{esm → lib/esm}/crawler.js +35 -100
  28. package/lib/esm/cron.d.ts +1 -0
  29. package/lib/esm/cron.js +43 -0
  30. package/lib/esm/index.d.ts +9 -0
  31. package/{esm → lib/esm}/index.js +19 -10
  32. package/{dist → lib/esm}/puppeteer.d.ts +2 -2
  33. package/{esm → lib/esm}/puppeteer.js +21 -32
  34. package/lib/esm/services/snapshot.d.ts +12 -0
  35. package/lib/esm/services/snapshot.js +75 -0
  36. package/lib/esm/site.d.ts +2 -0
  37. package/lib/esm/site.js +69 -0
  38. package/lib/esm/store/index.d.ts +3 -0
  39. package/{esm/db → lib/esm/store}/index.js +22 -6
  40. package/{esm/db → lib/esm/store}/job.d.ts +4 -3
  41. package/lib/esm/store/job.js +73 -0
  42. package/{esm/db → lib/esm/store}/snapshot.d.ts +5 -6
  43. package/lib/esm/store/snapshot.js +64 -0
  44. package/lib/esm/utils.d.ts +32 -0
  45. package/{esm → lib/esm}/utils.js +64 -71
  46. package/package.json +20 -32
  47. package/third.d.ts +0 -0
  48. package/dist/blocklet.d.ts +0 -6
  49. package/dist/blocklet.js +0 -199
  50. package/dist/cache.d.ts +0 -10
  51. package/dist/cache.js +0 -119
  52. package/dist/config.d.ts +0 -10
  53. package/dist/crawler.d.ts +0 -28
  54. package/dist/db/index.d.ts +0 -1
  55. package/dist/db/job.js +0 -54
  56. package/dist/db/snapshot.js +0 -52
  57. package/dist/index.d.ts +0 -6
  58. package/dist/index.js +0 -45
  59. package/dist/middleware.d.ts +0 -4
  60. package/dist/middleware.js +0 -44
  61. package/dist/utils.d.ts +0 -17
  62. package/esm/blocklet.d.ts +0 -6
  63. package/esm/blocklet.js +0 -190
  64. package/esm/cache.d.ts +0 -10
  65. package/esm/cache.js +0 -114
  66. package/esm/config.d.ts +0 -10
  67. package/esm/crawler.d.ts +0 -28
  68. package/esm/db/index.d.ts +0 -1
  69. package/esm/db/job.js +0 -50
  70. package/esm/db/snapshot.js +0 -48
  71. package/esm/index.d.ts +0 -6
  72. package/esm/middleware.d.ts +0 -4
  73. package/esm/middleware.js +0 -41
  74. package/esm/utils.d.ts +0 -17
package/esm/db/snapshot.js DELETED
@@ -1,48 +0,0 @@
1
- import { DataTypes, Model } from '@sequelize/core';
2
- class Snapshot extends Model {
3
- }
4
- export { Snapshot };
5
- export function initSnapshotModel(sequelize) {
6
- Snapshot.init({
7
- jobId: {
8
- type: DataTypes.STRING,
9
- primaryKey: true,
10
- allowNull: false,
11
- },
12
- url: {
13
- type: DataTypes.STRING,
14
- allowNull: false,
15
- index: true,
16
- },
17
- status: {
18
- type: DataTypes.ENUM('success', 'failed'),
19
- allowNull: false,
20
- },
21
- html: {
22
- type: DataTypes.TEXT,
23
- allowNull: true,
24
- },
25
- screenshot: {
26
- type: DataTypes.STRING,
27
- allowNull: true,
28
- },
29
- error: {
30
- type: DataTypes.STRING,
31
- allowNull: true,
32
- },
33
- lastModified: {
34
- type: DataTypes.STRING,
35
- allowNull: true,
36
- },
37
- options: {
38
- type: DataTypes.JSON,
39
- allowNull: true,
40
- },
41
- }, {
42
- sequelize,
43
- modelName: 'snapshot',
44
- tableName: 'snap',
45
- timestamps: true,
46
- });
47
- return Snapshot;
48
- }
package/esm/index.d.ts DELETED
@@ -1,6 +0,0 @@
1
- import { config } from './config';
2
- export * from './blocklet';
3
- export * from './crawler';
4
- export * from './middleware';
5
- export { Snapshot } from './db/snapshot';
6
- export declare function initCrawler(_config: Partial<typeof config>): Promise<void>;
package/esm/middleware.d.ts DELETED
@@ -1,4 +0,0 @@
1
- export declare function initSEOMiddleware({ autoReturnHtml, allowCrawler, }: {
2
- autoReturnHtml?: Boolean;
3
- allowCrawler?: Boolean | Function;
4
- }): (req: any, res: any, next: Function) => Promise<any>;
package/esm/middleware.js DELETED
@@ -1,41 +0,0 @@
1
- var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
2
- function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
3
- return new (P || (P = Promise))(function (resolve, reject) {
4
- function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
5
- function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
6
- function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
7
- step((generator = generator.apply(thisArg, _arguments || [])).next());
8
- });
9
- };
10
- import { useCache } from './cache';
11
- import { getFullUrl, isAcceptCrawler, isBotUserAgent, isSelfCrawler } from './utils';
12
- export function initSEOMiddleware({ autoReturnHtml = true, allowCrawler = true, }) {
13
- return (req, res, next) => __awaiter(this, void 0, void 0, function* () {
14
- const isBot = isBotUserAgent(req);
15
- const isSelf = isSelfCrawler(req);
16
- if (!isBot || isSelf) {
17
- return next();
18
- }
19
- const fullUrl = getFullUrl(req);
20
- const canCrawl = yield isAcceptCrawler(fullUrl);
21
- const allowCrawlerResult = typeof allowCrawler === 'function' ? allowCrawler(req) : allowCrawler;
22
- // can not crawl, skip
23
- if (!canCrawl || !allowCrawlerResult) {
24
- return next();
25
- }
26
- const cacheData = yield useCache.get(fullUrl);
27
- // add cached html to req
28
- req.cachedHtml = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.content) || cacheData || null;
29
- // add cached lastModified to req, ISO string to GMT string
30
- req.cachedLastmod = (cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified) ? new Date(cacheData === null || cacheData === void 0 ? void 0 : cacheData.lastModified).toUTCString() : null;
31
- if (req.cachedLastmod) {
32
- res.setHeader('Last-Modified', req.cachedLastmod);
33
- }
34
- if (autoReturnHtml && req.cachedHtml) {
35
- res.send(req.cachedHtml);
36
- return;
37
- }
38
- // missing cache
39
- next();
40
- });
41
- }
package/esm/utils.d.ts DELETED
@@ -1,17 +0,0 @@
1
- import { Page } from '@blocklet/puppeteer';
2
- export declare const api: import("axios").AxiosInstance;
3
- export declare const sleep: (ms: number) => Promise<unknown>;
4
- export declare const CRAWLER_FLAG = "x-crawler";
5
- export declare const isSelfCrawler: (req: any) => boolean;
6
- export declare const getDefaultRobotsUrl: (url: string) => string;
7
- export declare function getRobots(url: string): Promise<import("robots-parser").Robot | null>;
8
- export declare const getDefaultSitemapUrl: (url: string) => string;
9
- export declare const isAcceptCrawler: (url: string) => Promise<boolean | undefined>;
10
- export declare const getSitemapList: (url: string) => Promise<import("sitemap").SitemapItem[]>;
11
- export declare const isBotUserAgent: (req: any) => boolean;
12
- export declare const getComponentInfo: () => {};
13
- export declare const getFullUrl: (req: any) => string;
14
- export declare const getRelativePath: (url: string) => string;
15
- export declare const formatUrl: (url: string) => string;
16
- export declare function md5(content: string | Uint8Array): string;
17
- export declare function findMaxScrollHeight(page: Page): Promise<number>;