unprint 0.11.12 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,6 +13,17 @@ unprint.options({
13
13
  headers: {
14
14
  'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'
15
15
  },
16
+ limits: { // request throttling
17
+ default: {
18
+ concurrency: 10,
19
+ interval: 10, // ms
20
+ },
21
+ [hostname]: {
22
+ enable: true, // enabled by default
23
+ concurrency: 1,
24
+ interval: 1000,
25
+ },
26
+ },
16
27
  })
17
28
  ```
18
29
 
@@ -195,3 +206,13 @@ Returns
195
206
  res, // (object) alias for 'response'
196
207
  }
197
208
  ```
209
+
210
+ ### Feedback events
211
+ Usage:
212
+ * `unprint.on('trigger', callbackFn)`
213
+ * `unprint.off('trigger', callbackFn)`
214
+
215
+ Triggers:
216
+ * `requestInit`: An HTTP request is about to be made
217
+ * `requestSuccess`: The HTTP request completed with an OK status code
218
+ * `requestError`: The HTTP request completed with an error status code
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "unprint",
3
- "version": "0.11.12",
3
+ "version": "0.13.0",
4
4
  "description": "Simplify common web scraping tasks while staying in control of the data.",
5
5
  "main": "src/app.js",
6
6
  "scripts": {
package/src/app.js CHANGED
@@ -1,9 +1,11 @@
1
1
  'use strict';
2
2
 
3
3
  const { JSDOM, VirtualConsole } = require('jsdom');
4
+ const EventEmitter = require('events');
4
5
  const http = require('http');
5
6
  const https = require('https');
6
7
  const axios = require('axios').default;
8
+ const Bottleneck = require('bottleneck');
7
9
  const moment = require('moment-timezone');
8
10
  const merge = require('deepmerge');
9
11
 
@@ -11,17 +13,33 @@ const settings = {
11
13
  throwErrors: false,
12
14
  logErrors: true,
13
15
  requestTimeout: 30000,
16
+ limits: {
17
+ default: {
18
+ interval: 10,
19
+ concurrency: 10,
20
+ },
21
+ },
14
22
  };
15
23
 
16
24
  const virtualConsole = new VirtualConsole();
17
25
  const { window: globalWindow } = new JSDOM('', { virtualConsole });
18
26
 
27
+ let globalOptions = {
28
+ ...settings,
29
+ };
30
+
31
+ const events = new EventEmitter();
32
+
33
+ function configure(newOptions) {
34
+ globalOptions = merge(globalOptions, newOptions);
35
+ }
36
+
19
37
  function handleError(error, code) {
20
- if (settings.logErrors) {
38
+ if (globalOptions.logErrors) {
21
39
  console.error(`unprint encountered an error (${code}): ${error.message}`);
22
40
  }
23
41
 
24
- if (settings.throwErrors) {
42
+ if (globalOptions.throwErrors) {
25
43
  throw Object.assign(error, { code });
26
44
  }
27
45
 
@@ -31,12 +49,6 @@ function handleError(error, code) {
31
49
  virtualConsole.on('error', (message) => handleError(message, 'JSDOM'));
32
50
  virtualConsole.on('jsdomError', (message) => handleError(message, 'JSDOM'));
33
51
 
34
- let globalOptions = {};
35
-
36
- function configure(newOptions) {
37
- globalOptions = newOptions;
38
- }
39
-
40
52
  function trim(string) {
41
53
  if (typeof string === 'string') {
42
54
  return string.trim().replace(/\s+/g, ' ');
@@ -217,11 +229,23 @@ function extractNumber(rawNumberString, customOptions) {
217
229
  : rawNumberString.replace(',', '');
218
230
 
219
231
  if (numberString && options.match) {
220
- return Number(numberString.match(options.match)?.[options.matchIndex]) || null;
232
+ const number = Number(numberString.match(options.match)?.[options.matchIndex]);
233
+
234
+ if (Number.isNaN(number)) {
235
+ return null;
236
+ }
237
+
238
+ return number;
221
239
  }
222
240
 
223
241
  if (numberString) {
224
- return Number(numberString) || null;
242
+ const number = Number(numberString);
243
+
244
+ if (Number.isNaN(number)) {
245
+ return null;
246
+ }
247
+
248
+ return number;
225
249
  }
226
250
 
227
251
  return null;
@@ -761,13 +785,27 @@ function isDomObject(element) {
761
785
 
762
786
  function initQueryFns(fns, context) {
763
787
  if (context) {
764
- return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, (...args) => fn(context, ...args)]));
788
+ return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, (...args) => {
789
+ events.emit('query', {
790
+ key,
791
+ args,
792
+ origin: context.options.origin,
793
+ });
794
+
795
+ return fn(context, ...args);
796
+ }]));
765
797
  }
766
798
 
767
799
  // context is passed directly to query method
768
800
  return Object.fromEntries(Object.entries(fns).map(([key, fn]) => [key, (...args) => {
769
801
  // first argument is already an unprint context. this seems like a convoluted approach, but there is little reason not to allow it
770
802
  if (args[0]?.isUnprint) {
803
+ events.emit('query', {
804
+ key,
805
+ args,
806
+ origin: context.options.origin,
807
+ });
808
+
771
809
  return fn(...args);
772
810
  }
773
811
 
@@ -775,6 +813,12 @@ function initQueryFns(fns, context) {
775
813
  if (isDomObject(args[0])) {
776
814
  const element = args[0];
777
815
 
816
+ events.emit('query', {
817
+ key,
818
+ args,
819
+ origin: context.options.origin,
820
+ });
821
+
778
822
  return fn({
779
823
  element,
780
824
  html: element.outerHTML || element.body?.outerHTML,
@@ -848,6 +892,44 @@ function initAll(context, selector, options = {}) {
848
892
  .map((element) => init(element, null, options));
849
893
  }
850
894
 
895
+ const limiters = {
896
+ default: new Bottleneck(),
897
+ };
898
+
899
+ function getLimiterValue(prop, options, hostname) {
900
+ if (options[prop] !== undefined) {
901
+ return options[prop];
902
+ }
903
+
904
+ if (options.limits[hostname]?.enable !== false && options.limits[hostname]?.[prop] !== undefined) {
905
+ return options.limits[hostname][prop];
906
+ }
907
+
908
+ return options.limits.default[prop];
909
+ }
910
+
911
+ function getLimiter(url, options) {
912
+ const { hostname } = new URL(url);
913
+
914
+ const interval = getLimiterValue('interval', options, hostname);
915
+ const concurrency = getLimiterValue('concurrency', options, hostname);
916
+
917
+ if (!limiters[interval]?.[concurrency]) {
918
+ limiters[interval] = limiters[interval] || {};
919
+
920
+ limiters[interval][concurrency] = new Bottleneck({
921
+ minTime: interval,
922
+ maxConcurrent: concurrency,
923
+ timeout: options.timeout + 10000, // time out 10 seconds after axios should have
924
+ });
925
+ }
926
+
927
+ return {
928
+ limiter: limiters[interval][concurrency],
929
+ interval: concurrency,
930
+ };
931
+ }
932
+
851
933
  async function request(url, body, customOptions = {}, method = 'GET') {
852
934
  const options = merge.all([{
853
935
  timeout: 1000,
@@ -855,7 +937,19 @@ async function request(url, body, customOptions = {}, method = 'GET') {
855
937
  url,
856
938
  }, globalOptions, customOptions]);
857
939
 
858
- const res = await axios({
940
+ const { limiter, interval, concurrency } = getLimiter(url, options);
941
+
942
+ const feedbackBase = {
943
+ url,
944
+ method,
945
+ interval,
946
+ concurrency,
947
+ options,
948
+ };
949
+
950
+ events.emit('requestInit', feedbackBase);
951
+
952
+ const res = await limiter.schedule(async () => axios({
859
953
  url,
860
954
  method,
861
955
  data: body,
@@ -865,11 +959,17 @@ async function request(url, body, customOptions = {}, method = 'GET') {
865
959
  signal: options.abortSignal,
866
960
  httpAgent: options.httpAgent || new http.Agent({ ...options.agent }),
867
961
  httpsAgent: options.httpsAgent || new https.Agent({ ...options.agent }),
868
- });
962
+ }));
869
963
 
870
964
  if (!(res.status >= 200 && res.status < 300)) {
871
965
  handleError(new Error(`HTTP response from ${url} not OK (${res.status} ${res.statusText}): ${res.data}`), 'HTTP_NOT_OK');
872
966
 
967
+ events.emit('requestError', {
968
+ ...feedbackBase,
969
+ status: res.status,
970
+ statusText: res.statusText,
971
+ });
972
+
873
973
  return {
874
974
  ok: false,
875
975
  status: res.status,
@@ -887,6 +987,12 @@ async function request(url, body, customOptions = {}, method = 'GET') {
887
987
  res,
888
988
  };
889
989
 
990
+ events.emit('requestSuccess', {
991
+ ...feedbackBase,
992
+ status: res.status,
993
+ statusText: res.statusText,
994
+ });
995
+
890
996
  if (res.headers['content-type'].includes('application/json') && typeof res.data === 'object') {
891
997
  return {
892
998
  ...base,
@@ -921,8 +1027,19 @@ async function post(url, body, options) {
921
1027
  return request(url, body, options, 'POST');
922
1028
  }
923
1029
 
1030
+ function on(trigger, fn) {
1031
+ events.on(trigger, fn);
1032
+ }
1033
+
1034
+ function off(trigger, fn) {
1035
+ events.off(trigger, fn);
1036
+ }
1037
+
924
1038
  module.exports = {
925
1039
  configure,
1040
+ on,
1041
+ off,
1042
+ events,
926
1043
  get,
927
1044
  post,
928
1045
  request,
package/tests/init.js CHANGED
@@ -10,7 +10,20 @@ const data = require('./data.json');
10
10
  const port = process.env.PORT || 3101;
11
11
 
12
12
  async function initTest() {
13
- unprint.options({ headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' } });
13
+ unprint.options({
14
+ headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36' },
15
+ limits: {
16
+ default: {
17
+ concurrency: 1,
18
+ interval: 100,
19
+ },
20
+ },
21
+ });
22
+
23
+ unprint.on('requestInit', (initData) => console.log('init', initData));
24
+ unprint.on('requestError', (errorData) => console.error('error', errorData));
25
+ unprint.on('requestSuccess', (successData) => console.log('success', successData));
26
+ unprint.on('query', (queryData) => console.log('query', queryData));
14
27
 
15
28
  const res = await unprint.get(`http://127.0.0.1:${port}/html`, { select: 'body' });
16
29
  // const jsonRes = await unprint.get(`http://127.0.0.1:${port}/json`);