unprint 0.11.13 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,6 +13,17 @@ unprint.options({
13
13
  headers: {
14
14
  'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
15
15
  },
16
+ limits: { // request throttling
17
+ default: {
18
+ concurrency: 10,
19
+ interval: 10, // ms
20
+ },
21
+ [hostname]: {
22
+ enable: true, // enabled by default
23
+ concurrency: 1,
24
+ interval: 1000,
25
+ },
26
+ },
16
27
  })
17
28
  ```
18
29
 
@@ -195,3 +206,13 @@ Returns
195
206
  res, // (object) alias for 'response'
196
207
  }
197
208
  ```
209
+
210
+ ### Feedback events
211
+ Usage:
212
+ * `unprint.on('trigger', callbackFn)`
213
+ * `unprint.off('trigger', callbackFn)`
214
+
215
+ Triggers:
216
+ * `requestInit`: An HTTP request is about to be made
217
+ * `requestSuccess`: The HTTP request completed with an OK status code
218
+ * `requestError`: The HTTP request completed with an error status code
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.11.13",
3
+ "version": "0.13.0",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {
package/src/app.js CHANGED
@@ -1,9 +1,11 @@
1
1
  'use strict';
2
2
 
3
3
  const { JSDOM, VirtualConsole } = require('jsdom');
4
+ const EventEmitter = require('events');
4
5
  const http = require('http');
5
6
  const https = require('https');
6
7
  const axios = require('axios').default;
8
+ const Bottleneck = require('bottleneck');
7
9
  const moment = require('moment-timezone');
8
10
  const merge = require('deepmerge');
9
11
 
@@ -11,17 +13,33 @@ const settings = {
11
13
  throwErrors: false,
12
14
  logErrors: true,
13
15
  requestTimeout: 30000,
16
+ limits: {
17
+ default: {
18
+ interval: 10,
19
+ concurrency: 10,
20
+ },
21
+ },
14
22
  };
15
23
 
16
24
  const virtualConsole = new VirtualConsole();
17
25
  const { window: globalWindow } = new JSDOM('', { virtualConsole });
18
26
 
27
+ let globalOptions = {
28
+ ...settings,
29
+ };
30
+
31
+ const events = new EventEmitter();
32
+
33
/**
 * Deep-merge the given options into the module-wide options.
 *
 * @param {object} newOptions - option overrides to merge into the current global options
 */
function configure(newOptions) {
  // deepmerge returns a new object; the previous globalOptions object is not mutated
  const nextOptions = merge(globalOptions, newOptions);

  globalOptions = nextOptions;
}
36
+
19
37
  function handleError(error, code) {
20
- if (settings.logErrors) {
38
+ if (globalOptions.logErrors) {
21
39
  console.error(`unprint encountered an error (${code}): ${error.message}`);
22
40
  }
23
41
 
24
- if (settings.throwErrors) {
42
+ if (globalOptions.throwErrors) {
25
43
  throw Object.assign(error, { code });
26
44
  }
27
45
 
@@ -31,12 +49,6 @@ function handleError(error, code) {
31
49
  virtualConsole.on('error', (message) => handleError(message, 'JSDOM'));
32
50
  virtualConsole.on('jsdomError', (message) => handleError(message, 'JSDOM'));
33
51
 
34
- let globalOptions = {};
35
-
36
- function configure(newOptions) {
37
- globalOptions = newOptions;
38
- }
39
-
40
52
  function trim(string) {
41
53
  if (typeof string === 'string') {
42
54
  return string.trim().replace(/\s+/g, ' ');
@@ -773,13 +785,27 @@ function isDomObject(element) {
773
785
 
774
786
  function initQueryFns(fns, context) {
775
787
  if (context) {
776
- return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, (...args) => fn(context, ...args)]));
788
+ return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, (...args) => {
789
+ events.emit('query', {
790
+ key,
791
+ args,
792
+ origin: context.options.origin,
793
+ });
794
+
795
+ return fn(context, ...args);
796
+ }]));
777
797
  }
778
798
 
779
799
  // context is passed directly to query method
780
800
  return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, (...args) => {
781
801
  // first argument is already an unprint context. this seems like a convoluted approach, but there is little reason not to allow it
782
802
  if (args[0]?.isUnprint) {
803
+ events.emit('query', {
804
+ key,
805
+ args,
806
+ origin: context.options.origin,
807
+ });
808
+
783
809
  return fn(...args);
784
810
  }
785
811
 
@@ -787,6 +813,12 @@ function initQueryFns(fns, context) {
787
813
  if (isDomObject(args[0])) {
788
814
  const element = args[0];
789
815
 
816
+ events.emit('query', {
817
+ key,
818
+ args,
819
+ origin: context.options.origin,
820
+ });
821
+
790
822
  return fn({
791
823
  element,
792
824
  html: element.outerHTML || element.body?.outerHTML,
@@ -860,6 +892,44 @@ function initAll(context, selector, options = {}) {
860
892
  .map((element) => init(element, null, options));
861
893
  }
862
894
 
895
+ const limiters = {
896
+ default: new Bottleneck(),
897
+ };
898
+
899
/**
 * Resolve a single throttling setting for a request.
 *
 * Precedence, highest first:
 *   1. a value set directly on the options (`options[prop]`)
 *   2. the hostname-specific entry in `options.limits`, unless that entry has `enable: false`
 *   3. the `default` entry in `options.limits`
 *
 * @param {string} prop - name of the setting to resolve, e.g. 'interval' or 'concurrency'
 * @param {object} options - request options, optionally carrying a `limits` object
 * @param {string} hostname - hostname used to look up a host-specific limit
 * @returns {*} the resolved value, or undefined when no level defines it
 */
function getLimiterValue(prop, options, hostname) {
  if (options[prop] !== undefined) {
    return options[prop];
  }

  // limits may be missing or null; optional chaining falls through instead of throwing
  const hostLimits = options.limits?.[hostname];

  if (hostLimits?.enable !== false && hostLimits?.[prop] !== undefined) {
    return hostLimits[prop];
  }

  return options.limits?.default?.[prop];
}
910
+
911
/**
 * Find or lazily create the Bottleneck limiter matching the throttling
 * settings resolved for a URL.
 *
 * @param {string} url - absolute request URL; its hostname selects host-specific limits
 * @param {object} options - merged request options (reads `timeout` plus whatever getLimiterValue reads)
 * @returns {{ limiter: object, interval: *, concurrency: * }} the limiter and the resolved settings
 * @throws {TypeError} when `url` cannot be parsed by the URL constructor
 */
function getLimiter(url, options) {
  const { hostname } = new URL(url);

  const interval = getLimiterValue('interval', options, hostname);
  const concurrency = getLimiterValue('concurrency', options, hostname);

  // NOTE(review): limiters are cached per interval/concurrency pair, not per hostname,
  // so every host resolving to the same pair shares one limiter - confirm this is intended
  if (!limiters[interval]?.[concurrency]) {
    limiters[interval] = limiters[interval] || {};

    limiters[interval][concurrency] = new Bottleneck({
      minTime: interval,
      maxConcurrent: concurrency,
      // time out 10 seconds after axios should have; only the timeout of the request
      // that creates the limiter is used, later requests reuse the cached limiter
      timeout: options.timeout + 10000,
    });
  }

  // return both resolved settings under their own names; callers destructure
  // { limiter, interval, concurrency }
  return {
    limiter: limiters[interval][concurrency],
    interval,
    concurrency,
  };
}
932
+
863
933
  async function request(url, body, customOptions = {}, method = 'GET') {
864
934
  const options = merge.all([{
865
935
  timeout: 1000,
@@ -867,7 +937,19 @@ async function request(url, body, customOptions = {}, method = 'GET') {
867
937
  url,
868
938
  }, globalOptions, customOptions]);
869
939
 
870
- const res = await axios({
940
+ const { limiter, interval, concurrency } = getLimiter(url, options);
941
+
942
+ const feedbackBase = {
943
+ url,
944
+ method,
945
+ interval,
946
+ concurrency,
947
+ options,
948
+ };
949
+
950
+ events.emit('requestInit', feedbackBase);
951
+
952
+ const res = await limiter.schedule(async () => axios({
871
953
  url,
872
954
  method,
873
955
  data: body,
@@ -877,11 +959,17 @@ async function request(url, body, customOptions = {}, method = 'GET') {
877
959
  signal: options.abortSignal,
878
960
  httpAgent: options.httpAgent || new http.Agent({ ...options.agent }),
879
961
  httpsAgent: options.httpsAgent || new https.Agent({ ...options.agent }),
880
- });
962
+ }));
881
963
 
882
964
  if (!(res.status >= 200 && res.status < 300)) {
883
965
  handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${res.data}`), 'HTTP_NOT_OK');
884
966
 
967
+ events.emit('requestError', {
968
+ ...feedbackBase,
969
+ status: res.status,
970
+ statusText: res.statusText,
971
+ });
972
+
885
973
  return {
886
974
  ok: false,
887
975
  status: res.status,
@@ -899,6 +987,12 @@ async function request(url, body, customOptions = {}, method = 'GET') {
899
987
  res,
900
988
  };
901
989
 
990
+ events.emit('requestSuccess', {
991
+ ...feedbackBase,
992
+ status: res.status,
993
+ statusText: res.statusText,
994
+ });
995
+
902
996
  if (res.headers['content-type'].includes('application/json') && typeof res.data === 'object') {
903
997
  return {
904
998
  ...base,
@@ -933,8 +1027,19 @@ async function post(url, body, options) {
933
1027
  return request(url, body, options, 'POST');
934
1028
  }
935
1029
 
1030
/**
 * Subscribe a callback to an unprint feedback event.
 *
 * @param {string} trigger - event name, e.g. 'requestInit', 'requestSuccess', 'requestError' or 'query'
 * @param {Function} fn - callback invoked with the event payload
 */
function on(trigger, fn) {
  // addListener is the EventEmitter alias of on()
  events.addListener(trigger, fn);
}
1033
+
1034
/**
 * Unsubscribe a callback from an unprint feedback event.
 *
 * @param {string} trigger - event name the callback was registered for
 * @param {Function} fn - the same function reference that was registered
 */
function off(trigger, fn) {
  // removeListener is the EventEmitter alias of off()
  events.removeListener(trigger, fn);
}
1037
+
936
1038
  module.exports = {
937
1039
  configure,
1040
+ on,
1041
+ off,
1042
+ events,
938
1043
  get,
939
1044
  post,
940
1045
  request,
package/tests/init.js CHANGED
@@ -10,7 +10,20 @@ const data = require('./data.json');
10
10
  const port = process.env.PORT || 3101;
11
11
 
12
12
  async function initTest() {
13
- unprint.options({ headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' } });
13
+ unprint.options({
14
+ headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' },
15
+ limits: {
16
+ default: {
17
+ concurrency: 1,
18
+ interval: 100,
19
+ },
20
+ },
21
+ });
22
+
23
+ unprint.on('requestInit', (initData) => console.log('init', initData));
24
+ unprint.on('requestError', (errorData) => console.error('error', errorData));
25
+ unprint.on('requestSuccess', (successData) => console.log('success', successData));
26
+ unprint.on('query', (queryData) => console.log('query', queryData));
14
27
 
15
28
  const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
16
29
  // const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);