scrapex 0.5.2 → 1.0.0-alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +392 -145
  3. package/dist/enhancer-Q6CSc1gA.mjs +220 -0
  4. package/dist/enhancer-Q6CSc1gA.mjs.map +1 -0
  5. package/dist/enhancer-oM4BhYYS.cjs +268 -0
  6. package/dist/enhancer-oM4BhYYS.cjs.map +1 -0
  7. package/dist/index.cjs +852 -0
  8. package/dist/index.cjs.map +1 -0
  9. package/dist/index.d.cts +264 -0
  10. package/dist/index.d.cts.map +1 -0
  11. package/dist/index.d.mts +264 -0
  12. package/dist/index.d.mts.map +1 -0
  13. package/dist/index.mjs +798 -0
  14. package/dist/index.mjs.map +1 -0
  15. package/dist/llm/index.cjs +316 -0
  16. package/dist/llm/index.cjs.map +1 -0
  17. package/dist/llm/index.d.cts +211 -0
  18. package/dist/llm/index.d.cts.map +1 -0
  19. package/dist/llm/index.d.mts +211 -0
  20. package/dist/llm/index.d.mts.map +1 -0
  21. package/dist/llm/index.mjs +310 -0
  22. package/dist/llm/index.mjs.map +1 -0
  23. package/dist/parsers/index.cjs +200 -0
  24. package/dist/parsers/index.cjs.map +1 -0
  25. package/dist/parsers/index.d.cts +133 -0
  26. package/dist/parsers/index.d.cts.map +1 -0
  27. package/dist/parsers/index.d.mts +133 -0
  28. package/dist/parsers/index.d.mts.map +1 -0
  29. package/dist/parsers/index.mjs +192 -0
  30. package/dist/parsers/index.mjs.map +1 -0
  31. package/dist/types-CNQZVW36.d.mts +150 -0
  32. package/dist/types-CNQZVW36.d.mts.map +1 -0
  33. package/dist/types-D0HYR95H.d.cts +150 -0
  34. package/dist/types-D0HYR95H.d.cts.map +1 -0
  35. package/package.json +80 -100
  36. package/dist/index.d.ts +0 -45
  37. package/dist/index.js +0 -8
  38. package/dist/scrapex.cjs.development.js +0 -1128
  39. package/dist/scrapex.cjs.development.js.map +0 -1
  40. package/dist/scrapex.cjs.production.min.js +0 -2
  41. package/dist/scrapex.cjs.production.min.js.map +0 -1
  42. package/dist/scrapex.esm.js +0 -1120
  43. package/dist/scrapex.esm.js.map +0 -1
@@ -1,1120 +0,0 @@
1
- import { Readability } from '@mozilla/readability';
2
- import cheerio from 'cheerio';
3
- import { createWindow } from 'domino';
4
- import { JSDOM } from 'jsdom';
5
- import get from 'lodash.get';
6
- import uniq from 'lodash.uniq';
7
- import metascraper from 'metascraper';
8
- import { getMetadata } from 'page-metadata-parser';
9
- import robotsParser from 'robots-parser';
10
- import sanitize from 'sanitize-html';
11
- import { isUri } from 'valid-url';
12
- import HttpAgent from 'agentkeepalive';
13
- import got from 'got';
14
-
15
/**
 * Advances `gen` by one step and routes the outcome.
 *
 * Calls `gen[key](arg)` (key is "next" or "throw" in this file). If that call
 * throws, `reject` receives the error. If the generator reports completion,
 * `resolve` receives the final value. Otherwise the yielded value is treated
 * as a promise whose settlement is forwarded to `_next` / `_throw`.
 */
function asyncGeneratorStep(gen, resolve, reject, _next, _throw, key, arg) {
  var step;
  var payload;

  try {
    step = gen[key](arg);
    payload = step.value;
  } catch (failure) {
    reject(failure);
    return;
  }

  if (!step.done) {
    // Still running: wait for the yielded value, then resume the generator.
    Promise.resolve(payload).then(_next, _throw);
    return;
  }

  resolve(payload);
}
30
-
31
/**
 * Wraps a generator-returning function so that calling it yields a Promise.
 *
 * The returned function preserves the caller's `this` and arguments, starts
 * the generator, and drives it with `asyncGeneratorStep` until it completes
 * (promise resolves) or throws (promise rejects).
 */
function _asyncToGenerator(fn) {
  return function () {
    var receiver = this;
    var callArgs = arguments;

    return new Promise(function (resolve, reject) {
      var gen = fn.apply(receiver, callArgs);

      // Builds a resume callback bound to one generator method.
      var resumeWith = function (key) {
        return function (payload) {
          asyncGeneratorStep(gen, resolve, reject, onNext, onThrow, key, payload);
        };
      };

      var onNext = resumeWith("next");
      var onThrow = resumeWith("throw");

      // Kick off the first step with no sent value.
      onNext(undefined);
    });
  };
}
50
-
51
/**
 * Shallow-merges own enumerable properties of each later argument into the
 * first argument and returns that first argument.
 *
 * On first call the function replaces itself with `Object.assign` when it is
 * available, or with an equivalent loop-based fallback otherwise, and then
 * delegates to the replacement.
 */
function _extends() {
  if (Object.assign) {
    _extends = Object.assign;
  } else {
    _extends = function (target) {
      var owns = Object.prototype.hasOwnProperty;

      for (var idx = 1; idx < arguments.length; idx++) {
        var from = arguments[idx];

        for (var prop in from) {
          if (owns.call(from, prop)) {
            target[prop] = from[prop];
          }
        }
      }

      return target;
    };
  }

  return _extends.apply(this, arguments);
}
68
-
69
/**
 * Runs `fn` against a fresh CommonJS-style `{ exports: {} }` record and
 * returns whatever `exports` holds afterwards. The incoming `module`
 * argument is ignored and overwritten.
 */
function createCommonjsModule(fn, module) {
  module = { exports: {} };
  fn(module, module.exports);
  return module.exports;
}
72
-
73
- var runtime_1 = createCommonjsModule(function (module) {
74
- /**
75
- * Copyright (c) 2014-present, Facebook, Inc.
76
- *
77
- * This source code is licensed under the MIT license found in the
78
- * LICENSE file in the root directory of this source tree.
79
- */
80
-
81
- var runtime = (function (exports) {
82
-
83
- var Op = Object.prototype;
84
- var hasOwn = Op.hasOwnProperty;
85
- var undefined$1; // More compressible than void 0.
86
- var $Symbol = typeof Symbol === "function" ? Symbol : {};
87
- var iteratorSymbol = $Symbol.iterator || "@@iterator";
88
- var asyncIteratorSymbol = $Symbol.asyncIterator || "@@asyncIterator";
89
- var toStringTagSymbol = $Symbol.toStringTag || "@@toStringTag";
90
-
91
- function define(obj, key, value) {
92
- Object.defineProperty(obj, key, {
93
- value: value,
94
- enumerable: true,
95
- configurable: true,
96
- writable: true
97
- });
98
- return obj[key];
99
- }
100
- try {
101
- // IE 8 has a broken Object.defineProperty that only works on DOM objects.
102
- define({}, "");
103
- } catch (err) {
104
- define = function(obj, key, value) {
105
- return obj[key] = value;
106
- };
107
- }
108
-
109
- function wrap(innerFn, outerFn, self, tryLocsList) {
110
- // If outerFn provided and outerFn.prototype is a Generator, then outerFn.prototype instanceof Generator.
111
- var protoGenerator = outerFn && outerFn.prototype instanceof Generator ? outerFn : Generator;
112
- var generator = Object.create(protoGenerator.prototype);
113
- var context = new Context(tryLocsList || []);
114
-
115
- // The ._invoke method unifies the implementations of the .next,
116
- // .throw, and .return methods.
117
- generator._invoke = makeInvokeMethod(innerFn, self, context);
118
-
119
- return generator;
120
- }
121
- exports.wrap = wrap;
122
-
123
// Try/catch helper that keeps exception handling out of the callers.
// It invokes `fn` with `obj` as the receiver and a single argument `arg`,
// and reports the outcome as a completion record:
//   { type: "normal", arg: <return value> }  when the call returns, or
//   { type: "throw",  arg: <thrown value> }  when the call throws.
// Taking an existing method plus exactly one argument avoids creating a
// closure per call and avoids touching the `arguments` object; the only
// allocation is the completion record itself.
function tryCatch(fn, obj, arg) {
  var completion;

  try {
    completion = { type: "normal", arg: fn.call(obj, arg) };
  } catch (err) {
    completion = { type: "throw", arg: err };
  }

  return completion;
}
140
-
141
- var GenStateSuspendedStart = "suspendedStart";
142
- var GenStateSuspendedYield = "suspendedYield";
143
- var GenStateExecuting = "executing";
144
- var GenStateCompleted = "completed";
145
-
146
- // Returning this object from the innerFn has the same effect as
147
- // breaking out of the dispatch switch statement.
148
- var ContinueSentinel = {};
149
-
150
- // Dummy constructor functions that we use as the .constructor and
151
- // .constructor.prototype properties for functions that return Generator
152
- // objects. For full spec compliance, you may wish to configure your
153
- // minifier not to mangle the names of these two functions.
154
- function Generator() {}
155
- function GeneratorFunction() {}
156
- function GeneratorFunctionPrototype() {}
157
-
158
- // This is a polyfill for %IteratorPrototype% for environments that
159
- // don't natively support it.
160
- var IteratorPrototype = {};
161
- define(IteratorPrototype, iteratorSymbol, function () {
162
- return this;
163
- });
164
-
165
- var getProto = Object.getPrototypeOf;
166
- var NativeIteratorPrototype = getProto && getProto(getProto(values([])));
167
- if (NativeIteratorPrototype &&
168
- NativeIteratorPrototype !== Op &&
169
- hasOwn.call(NativeIteratorPrototype, iteratorSymbol)) {
170
- // This environment has a native %IteratorPrototype%; use it instead
171
- // of the polyfill.
172
- IteratorPrototype = NativeIteratorPrototype;
173
- }
174
-
175
- var Gp = GeneratorFunctionPrototype.prototype =
176
- Generator.prototype = Object.create(IteratorPrototype);
177
- GeneratorFunction.prototype = GeneratorFunctionPrototype;
178
- define(Gp, "constructor", GeneratorFunctionPrototype);
179
- define(GeneratorFunctionPrototype, "constructor", GeneratorFunction);
180
- GeneratorFunction.displayName = define(
181
- GeneratorFunctionPrototype,
182
- toStringTagSymbol,
183
- "GeneratorFunction"
184
- );
185
-
186
- // Helper for defining the .next, .throw, and .return methods of the
187
- // Iterator interface in terms of a single ._invoke method.
188
- function defineIteratorMethods(prototype) {
189
- ["next", "throw", "return"].forEach(function(method) {
190
- define(prototype, method, function(arg) {
191
- return this._invoke(method, arg);
192
- });
193
- });
194
- }
195
-
196
- exports.isGeneratorFunction = function(genFun) {
197
- var ctor = typeof genFun === "function" && genFun.constructor;
198
- return ctor
199
- ? ctor === GeneratorFunction ||
200
- // For the native GeneratorFunction constructor, the best we can
201
- // do is to check its .name property.
202
- (ctor.displayName || ctor.name) === "GeneratorFunction"
203
- : false;
204
- };
205
-
206
- exports.mark = function(genFun) {
207
- if (Object.setPrototypeOf) {
208
- Object.setPrototypeOf(genFun, GeneratorFunctionPrototype);
209
- } else {
210
- genFun.__proto__ = GeneratorFunctionPrototype;
211
- define(genFun, toStringTagSymbol, "GeneratorFunction");
212
- }
213
- genFun.prototype = Object.create(Gp);
214
- return genFun;
215
- };
216
-
217
- // Within the body of any async function, `await x` is transformed to
218
- // `yield regeneratorRuntime.awrap(x)`, so that the runtime can test
219
- // `hasOwn.call(value, "__await")` to determine if the yielded value is
220
- // meant to be awaited.
221
- exports.awrap = function(arg) {
222
- return { __await: arg };
223
- };
224
-
225
- function AsyncIterator(generator, PromiseImpl) {
226
- function invoke(method, arg, resolve, reject) {
227
- var record = tryCatch(generator[method], generator, arg);
228
- if (record.type === "throw") {
229
- reject(record.arg);
230
- } else {
231
- var result = record.arg;
232
- var value = result.value;
233
- if (value &&
234
- typeof value === "object" &&
235
- hasOwn.call(value, "__await")) {
236
- return PromiseImpl.resolve(value.__await).then(function(value) {
237
- invoke("next", value, resolve, reject);
238
- }, function(err) {
239
- invoke("throw", err, resolve, reject);
240
- });
241
- }
242
-
243
- return PromiseImpl.resolve(value).then(function(unwrapped) {
244
- // When a yielded Promise is resolved, its final value becomes
245
- // the .value of the Promise<{value,done}> result for the
246
- // current iteration.
247
- result.value = unwrapped;
248
- resolve(result);
249
- }, function(error) {
250
- // If a rejected Promise was yielded, throw the rejection back
251
- // into the async generator function so it can be handled there.
252
- return invoke("throw", error, resolve, reject);
253
- });
254
- }
255
- }
256
-
257
- var previousPromise;
258
-
259
- function enqueue(method, arg) {
260
- function callInvokeWithMethodAndArg() {
261
- return new PromiseImpl(function(resolve, reject) {
262
- invoke(method, arg, resolve, reject);
263
- });
264
- }
265
-
266
- return previousPromise =
267
- // If enqueue has been called before, then we want to wait until
268
- // all previous Promises have been resolved before calling invoke,
269
- // so that results are always delivered in the correct order. If
270
- // enqueue has not been called before, then it is important to
271
- // call invoke immediately, without waiting on a callback to fire,
272
- // so that the async generator function has the opportunity to do
273
- // any necessary setup in a predictable way. This predictability
274
- // is why the Promise constructor synchronously invokes its
275
- // executor callback, and why async functions synchronously
276
- // execute code before the first await. Since we implement simple
277
- // async functions in terms of async generators, it is especially
278
- // important to get this right, even though it requires care.
279
- previousPromise ? previousPromise.then(
280
- callInvokeWithMethodAndArg,
281
- // Avoid propagating failures to Promises returned by later
282
- // invocations of the iterator.
283
- callInvokeWithMethodAndArg
284
- ) : callInvokeWithMethodAndArg();
285
- }
286
-
287
- // Define the unified helper method that is used to implement .next,
288
- // .throw, and .return (see defineIteratorMethods).
289
- this._invoke = enqueue;
290
- }
291
-
292
- defineIteratorMethods(AsyncIterator.prototype);
293
- define(AsyncIterator.prototype, asyncIteratorSymbol, function () {
294
- return this;
295
- });
296
- exports.AsyncIterator = AsyncIterator;
297
-
298
- // Note that simple async functions are implemented on top of
299
- // AsyncIterator objects; they just return a Promise for the value of
300
- // the final result produced by the iterator.
301
- exports.async = function(innerFn, outerFn, self, tryLocsList, PromiseImpl) {
302
- if (PromiseImpl === void 0) PromiseImpl = Promise;
303
-
304
- var iter = new AsyncIterator(
305
- wrap(innerFn, outerFn, self, tryLocsList),
306
- PromiseImpl
307
- );
308
-
309
- return exports.isGeneratorFunction(outerFn)
310
- ? iter // If outerFn is a generator, return the full iterator.
311
- : iter.next().then(function(result) {
312
- return result.done ? result.value : iter.next();
313
- });
314
- };
315
-
316
- function makeInvokeMethod(innerFn, self, context) {
317
- var state = GenStateSuspendedStart;
318
-
319
- return function invoke(method, arg) {
320
- if (state === GenStateExecuting) {
321
- throw new Error("Generator is already running");
322
- }
323
-
324
- if (state === GenStateCompleted) {
325
- if (method === "throw") {
326
- throw arg;
327
- }
328
-
329
- // Be forgiving, per 25.3.3.3.3 of the spec:
330
- // https://people.mozilla.org/~jorendorff/es6-draft.html#sec-generatorresume
331
- return doneResult();
332
- }
333
-
334
- context.method = method;
335
- context.arg = arg;
336
-
337
- while (true) {
338
- var delegate = context.delegate;
339
- if (delegate) {
340
- var delegateResult = maybeInvokeDelegate(delegate, context);
341
- if (delegateResult) {
342
- if (delegateResult === ContinueSentinel) continue;
343
- return delegateResult;
344
- }
345
- }
346
-
347
- if (context.method === "next") {
348
- // Setting context._sent for legacy support of Babel's
349
- // function.sent implementation.
350
- context.sent = context._sent = context.arg;
351
-
352
- } else if (context.method === "throw") {
353
- if (state === GenStateSuspendedStart) {
354
- state = GenStateCompleted;
355
- throw context.arg;
356
- }
357
-
358
- context.dispatchException(context.arg);
359
-
360
- } else if (context.method === "return") {
361
- context.abrupt("return", context.arg);
362
- }
363
-
364
- state = GenStateExecuting;
365
-
366
- var record = tryCatch(innerFn, self, context);
367
- if (record.type === "normal") {
368
- // If an exception is thrown from innerFn, we leave state ===
369
- // GenStateExecuting and loop back for another invocation.
370
- state = context.done
371
- ? GenStateCompleted
372
- : GenStateSuspendedYield;
373
-
374
- if (record.arg === ContinueSentinel) {
375
- continue;
376
- }
377
-
378
- return {
379
- value: record.arg,
380
- done: context.done
381
- };
382
-
383
- } else if (record.type === "throw") {
384
- state = GenStateCompleted;
385
- // Dispatch the exception by looping back around to the
386
- // context.dispatchException(context.arg) call above.
387
- context.method = "throw";
388
- context.arg = record.arg;
389
- }
390
- }
391
- };
392
- }
393
-
394
- // Call delegate.iterator[context.method](context.arg) and handle the
395
- // result, either by returning a { value, done } result from the
396
- // delegate iterator, or by modifying context.method and context.arg,
397
- // setting context.delegate to null, and returning the ContinueSentinel.
398
- function maybeInvokeDelegate(delegate, context) {
399
- var method = delegate.iterator[context.method];
400
- if (method === undefined$1) {
401
- // A .throw or .return when the delegate iterator has no .throw
402
- // method always terminates the yield* loop.
403
- context.delegate = null;
404
-
405
- if (context.method === "throw") {
406
- // Note: ["return"] must be used for ES3 parsing compatibility.
407
- if (delegate.iterator["return"]) {
408
- // If the delegate iterator has a return method, give it a
409
- // chance to clean up.
410
- context.method = "return";
411
- context.arg = undefined$1;
412
- maybeInvokeDelegate(delegate, context);
413
-
414
- if (context.method === "throw") {
415
- // If maybeInvokeDelegate(context) changed context.method from
416
- // "return" to "throw", let that override the TypeError below.
417
- return ContinueSentinel;
418
- }
419
- }
420
-
421
- context.method = "throw";
422
- context.arg = new TypeError(
423
- "The iterator does not provide a 'throw' method");
424
- }
425
-
426
- return ContinueSentinel;
427
- }
428
-
429
- var record = tryCatch(method, delegate.iterator, context.arg);
430
-
431
- if (record.type === "throw") {
432
- context.method = "throw";
433
- context.arg = record.arg;
434
- context.delegate = null;
435
- return ContinueSentinel;
436
- }
437
-
438
- var info = record.arg;
439
-
440
- if (! info) {
441
- context.method = "throw";
442
- context.arg = new TypeError("iterator result is not an object");
443
- context.delegate = null;
444
- return ContinueSentinel;
445
- }
446
-
447
- if (info.done) {
448
- // Assign the result of the finished delegate to the temporary
449
- // variable specified by delegate.resultName (see delegateYield).
450
- context[delegate.resultName] = info.value;
451
-
452
- // Resume execution at the desired location (see delegateYield).
453
- context.next = delegate.nextLoc;
454
-
455
- // If context.method was "throw" but the delegate handled the
456
- // exception, let the outer generator proceed normally. If
457
- // context.method was "next", forget context.arg since it has been
458
- // "consumed" by the delegate iterator. If context.method was
459
- // "return", allow the original .return call to continue in the
460
- // outer generator.
461
- if (context.method !== "return") {
462
- context.method = "next";
463
- context.arg = undefined$1;
464
- }
465
-
466
- } else {
467
- // Re-yield the result returned by the delegate method.
468
- return info;
469
- }
470
-
471
- // The delegate iterator is finished, so forget it and continue with
472
- // the outer generator.
473
- context.delegate = null;
474
- return ContinueSentinel;
475
- }
476
-
477
- // Define Generator.prototype.{next,throw,return} in terms of the
478
- // unified ._invoke helper method.
479
- defineIteratorMethods(Gp);
480
-
481
- define(Gp, toStringTagSymbol, "Generator");
482
-
483
- // A Generator should always return itself as the iterator object when the
484
- // @@iterator function is called on it. Some browsers' implementations of the
485
- // iterator prototype chain incorrectly implement this, causing the Generator
486
- // object to not be returned from this call. This ensures that doesn't happen.
487
- // See https://github.com/facebook/regenerator/issues/274 for more details.
488
- define(Gp, iteratorSymbol, function() {
489
- return this;
490
- });
491
-
492
- define(Gp, "toString", function() {
493
- return "[object Generator]";
494
- });
495
-
496
- function pushTryEntry(locs) {
497
- var entry = { tryLoc: locs[0] };
498
-
499
- if (1 in locs) {
500
- entry.catchLoc = locs[1];
501
- }
502
-
503
- if (2 in locs) {
504
- entry.finallyLoc = locs[2];
505
- entry.afterLoc = locs[3];
506
- }
507
-
508
- this.tryEntries.push(entry);
509
- }
510
-
511
// Resets a try-entry's completion record to the "normal" state, reusing the
// existing record object when there is one and dropping any stored `arg`.
function resetTryEntry(entry) {
  var completion = entry.completion ? entry.completion : {};
  completion.type = "normal";
  delete completion.arg;
  entry.completion = completion;
}
517
-
518
- function Context(tryLocsList) {
519
- // The root entry object (effectively a try statement without a catch
520
- // or a finally block) gives us a place to store values thrown from
521
- // locations where there is no enclosing try statement.
522
- this.tryEntries = [{ tryLoc: "root" }];
523
- tryLocsList.forEach(pushTryEntry, this);
524
- this.reset(true);
525
- }
526
-
527
- exports.keys = function(object) {
528
- var keys = [];
529
- for (var key in object) {
530
- keys.push(key);
531
- }
532
- keys.reverse();
533
-
534
- // Rather than returning an object with a next method, we keep
535
- // things simple and return the next function itself.
536
- return function next() {
537
- while (keys.length) {
538
- var key = keys.pop();
539
- if (key in object) {
540
- next.value = key;
541
- next.done = false;
542
- return next;
543
- }
544
- }
545
-
546
- // To avoid creating an additional object, we just hang the .value
547
- // and .done properties off the next function object itself. This
548
- // also ensures that the minifier will not anonymize the function.
549
- next.done = true;
550
- return next;
551
- };
552
- };
553
-
554
- function values(iterable) {
555
- if (iterable) {
556
- var iteratorMethod = iterable[iteratorSymbol];
557
- if (iteratorMethod) {
558
- return iteratorMethod.call(iterable);
559
- }
560
-
561
- if (typeof iterable.next === "function") {
562
- return iterable;
563
- }
564
-
565
- if (!isNaN(iterable.length)) {
566
- var i = -1, next = function next() {
567
- while (++i < iterable.length) {
568
- if (hasOwn.call(iterable, i)) {
569
- next.value = iterable[i];
570
- next.done = false;
571
- return next;
572
- }
573
- }
574
-
575
- next.value = undefined$1;
576
- next.done = true;
577
-
578
- return next;
579
- };
580
-
581
- return next.next = next;
582
- }
583
- }
584
-
585
- // Return an iterator with no values.
586
- return { next: doneResult };
587
- }
588
- exports.values = values;
589
-
590
- function doneResult() {
591
- return { value: undefined$1, done: true };
592
- }
593
-
594
- Context.prototype = {
595
- constructor: Context,
596
-
597
- reset: function(skipTempReset) {
598
- this.prev = 0;
599
- this.next = 0;
600
- // Resetting context._sent for legacy support of Babel's
601
- // function.sent implementation.
602
- this.sent = this._sent = undefined$1;
603
- this.done = false;
604
- this.delegate = null;
605
-
606
- this.method = "next";
607
- this.arg = undefined$1;
608
-
609
- this.tryEntries.forEach(resetTryEntry);
610
-
611
- if (!skipTempReset) {
612
- for (var name in this) {
613
- // Not sure about the optimal order of these conditions:
614
- if (name.charAt(0) === "t" &&
615
- hasOwn.call(this, name) &&
616
- !isNaN(+name.slice(1))) {
617
- this[name] = undefined$1;
618
- }
619
- }
620
- }
621
- },
622
-
623
- stop: function() {
624
- this.done = true;
625
-
626
- var rootEntry = this.tryEntries[0];
627
- var rootRecord = rootEntry.completion;
628
- if (rootRecord.type === "throw") {
629
- throw rootRecord.arg;
630
- }
631
-
632
- return this.rval;
633
- },
634
-
635
- dispatchException: function(exception) {
636
- if (this.done) {
637
- throw exception;
638
- }
639
-
640
- var context = this;
641
- function handle(loc, caught) {
642
- record.type = "throw";
643
- record.arg = exception;
644
- context.next = loc;
645
-
646
- if (caught) {
647
- // If the dispatched exception was caught by a catch block,
648
- // then let that catch block handle the exception normally.
649
- context.method = "next";
650
- context.arg = undefined$1;
651
- }
652
-
653
- return !! caught;
654
- }
655
-
656
- for (var i = this.tryEntries.length - 1; i >= 0; --i) {
657
- var entry = this.tryEntries[i];
658
- var record = entry.completion;
659
-
660
- if (entry.tryLoc === "root") {
661
- // Exception thrown outside of any try block that could handle
662
- // it, so set the completion value of the entire function to
663
- // throw the exception.
664
- return handle("end");
665
- }
666
-
667
- if (entry.tryLoc <= this.prev) {
668
- var hasCatch = hasOwn.call(entry, "catchLoc");
669
- var hasFinally = hasOwn.call(entry, "finallyLoc");
670
-
671
- if (hasCatch && hasFinally) {
672
- if (this.prev < entry.catchLoc) {
673
- return handle(entry.catchLoc, true);
674
- } else if (this.prev < entry.finallyLoc) {
675
- return handle(entry.finallyLoc);
676
- }
677
-
678
- } else if (hasCatch) {
679
- if (this.prev < entry.catchLoc) {
680
- return handle(entry.catchLoc, true);
681
- }
682
-
683
- } else if (hasFinally) {
684
- if (this.prev < entry.finallyLoc) {
685
- return handle(entry.finallyLoc);
686
- }
687
-
688
- } else {
689
- throw new Error("try statement without catch or finally");
690
- }
691
- }
692
- }
693
- },
694
-
695
- abrupt: function(type, arg) {
696
- for (var i = this.tryEntries.length - 1; i >= 0; --i) {
697
- var entry = this.tryEntries[i];
698
- if (entry.tryLoc <= this.prev &&
699
- hasOwn.call(entry, "finallyLoc") &&
700
- this.prev < entry.finallyLoc) {
701
- var finallyEntry = entry;
702
- break;
703
- }
704
- }
705
-
706
- if (finallyEntry &&
707
- (type === "break" ||
708
- type === "continue") &&
709
- finallyEntry.tryLoc <= arg &&
710
- arg <= finallyEntry.finallyLoc) {
711
- // Ignore the finally entry if control is not jumping to a
712
- // location outside the try/catch block.
713
- finallyEntry = null;
714
- }
715
-
716
- var record = finallyEntry ? finallyEntry.completion : {};
717
- record.type = type;
718
- record.arg = arg;
719
-
720
- if (finallyEntry) {
721
- this.method = "next";
722
- this.next = finallyEntry.finallyLoc;
723
- return ContinueSentinel;
724
- }
725
-
726
- return this.complete(record);
727
- },
728
-
729
- complete: function(record, afterLoc) {
730
- if (record.type === "throw") {
731
- throw record.arg;
732
- }
733
-
734
- if (record.type === "break" ||
735
- record.type === "continue") {
736
- this.next = record.arg;
737
- } else if (record.type === "return") {
738
- this.rval = this.arg = record.arg;
739
- this.method = "return";
740
- this.next = "end";
741
- } else if (record.type === "normal" && afterLoc) {
742
- this.next = afterLoc;
743
- }
744
-
745
- return ContinueSentinel;
746
- },
747
-
748
- finish: function(finallyLoc) {
749
- for (var i = this.tryEntries.length - 1; i >= 0; --i) {
750
- var entry = this.tryEntries[i];
751
- if (entry.finallyLoc === finallyLoc) {
752
- this.complete(entry.completion, entry.afterLoc);
753
- resetTryEntry(entry);
754
- return ContinueSentinel;
755
- }
756
- }
757
- },
758
-
759
- "catch": function(tryLoc) {
760
- for (var i = this.tryEntries.length - 1; i >= 0; --i) {
761
- var entry = this.tryEntries[i];
762
- if (entry.tryLoc === tryLoc) {
763
- var record = entry.completion;
764
- if (record.type === "throw") {
765
- var thrown = record.arg;
766
- resetTryEntry(entry);
767
- }
768
- return thrown;
769
- }
770
- }
771
-
772
- // The context.catch method must only be called with a location
773
- // argument that corresponds to a known catch block.
774
- throw new Error("illegal catch attempt");
775
- },
776
-
777
- delegateYield: function(iterable, resultName, nextLoc) {
778
- this.delegate = {
779
- iterator: values(iterable),
780
- resultName: resultName,
781
- nextLoc: nextLoc
782
- };
783
-
784
- if (this.method === "next") {
785
- // Deliberately forget the last sent value so that we don't
786
- // accidentally pass it on to the delegate.
787
- this.arg = undefined$1;
788
- }
789
-
790
- return ContinueSentinel;
791
- }
792
- };
793
-
794
- // Regardless of whether this script is executing as a CommonJS module
795
- // or not, return the runtime object so that we can declare the variable
796
- // regeneratorRuntime in the outer scope, which allows this module to be
797
- // injected easily by `bin/regenerator --include-runtime script.js`.
798
- return exports;
799
-
800
- }(
801
- // If this script is executing as a CommonJS module, use module.exports
802
- // as the regeneratorRuntime namespace. Otherwise create a new empty
803
- // object. Either way, the resulting object will be used to initialize
804
- // the regeneratorRuntime variable at the top of this file.
805
- module.exports
806
- ));
807
-
808
- try {
809
- regeneratorRuntime = runtime;
810
- } catch (accidentalStrictMode) {
811
- // This module should not be running in strict mode, so the above
812
- // assignment should always work unless something is misconfigured. Just
813
- // in case runtime.js accidentally runs in strict mode, in modern engines
814
- // we can explicitly access globalThis. In older engines we can escape
815
- // strict mode using a global Function call. This could conceivably fail
816
- // if a Content Security Policy forbids using Function, but in that case
817
- // the proper solution is to fix the accidental strict mode problem. If
818
- // you've misconfigured your bundler to force strict mode and applied a
819
- // CSP to forbid Function, and you're not willing to fix either of those
820
- // problems, please detail your unique predicament in a GitHub issue.
821
- if (typeof globalThis === "object") {
822
- globalThis.regeneratorRuntime = runtime;
823
- } else {
824
- Function("r", "regeneratorRuntime = r")(runtime);
825
- }
826
- }
827
- });
828
-
829
// HTTPS counterpart of the imported keep-alive HTTP agent class.
var HttpsAgent = HttpAgent.HttpsAgent;
// User-agent string passed to the robots.txt check in this module (a desktop Chrome 80 identity).
var userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36';
831
// Twitter card fields looked up by extractTwitterMeta.
var TWITTER_META_TAGS = ['site', 'creator', 'description', 'title', 'image'];

/**
 * Collects Twitter card metadata from a loaded document.
 *
 * For every name in TWITTER_META_TAGS, reads the `content` attribute of
 * `<meta name="twitter:<tag>">`, falling back to
 * `<meta property="twitter:<tag>">` when the first lookup is falsy.
 *
 * @param {Function} $ - selector function whose results expose `.attr(name)`
 *   (a cheerio instance in this module).
 * @returns {Object} map of tag name to content; a tag with no match keeps
 *   whatever the fallback lookup returned (typically `undefined`).
 */
function extractTwitterMeta($) {
  var tags = {};

  // forEach, not map: the callback exists only for its side effect on `tags`.
  TWITTER_META_TAGS.forEach(function (tag) {
    var byName = $("meta[name='twitter:" + tag + "']").attr('content');
    tags[tag] = byName || $("meta[property='twitter:" + tag + "']").attr('content');
  });

  return tags;
}
840
-
841
/**
 * Picks the `src`, `height`, `width` and `title` attributes off an element's
 * `attribs` map. Attributes that are absent come back as `undefined`.
 */
function getEmbedAttrs(el) {
  var attrs = el.attribs;
  var picked = {};

  picked.src = attrs['src'];
  picked.height = attrs['height'];
  picked.width = attrs['width'];
  picked.title = attrs['title'];

  return picked;
}
849
-
850
/**
 * Gathers the embed attributes of every `iframe`, `video` and `embed`
 * element matched by `$`, in the order `$(...).each` visits them.
 */
function extractEmbeds($) {
  var found = [];
  var collect = function (_index, node) {
    found.push(getEmbedAttrs(node));
  };

  $('iframe, video, embed').each(collect);
  return found;
}
857
-
858
/**
 * Returns the text of every `<code>` element nested in a `<pre>`, in the
 * order `$(...).each` visits them.
 */
function extractCodeSnippets($) {
  var snippets = [];

  $('pre code').each(function (_index, node) {
    var text = $(node).text();
    snippets.push(text);
  });

  return snippets;
}
866
-
867
/**
 * Resolves to whether the site's robots.txt allows fetching `prefixUrl`
 * for the module-level `userAgent`.
 *
 * The robots.txt location is computed as `/robots.txt` on the origin of
 * `prefixUrl`, downloaded with `got` (HTTP error statuses do not throw;
 * timeout is 10 * 1000), and handed to `robotsParser` together with its body.
 *
 * @param {string} prefixUrl - URL whose crawl permission is being checked.
 * @returns {Promise<*>} the result of `robots.isAllowed(prefixUrl, userAgent)`.
 */
function robotsAllowed(_x) {
  return _robotsAllowed.apply(this, arguments);
}

function _robotsAllowed() {
  _robotsAllowed = _asyncToGenerator( /*#__PURE__*/runtime_1.mark(function _callee3(prefixUrl) {
    var robotsUrl, site, robots;
    return runtime_1.wrap(function _callee3$(_context3) {
      while (1) {
        switch (_context3.prev = _context3.next) {
          case 0:
            robotsUrl = new URL('/robots.txt', prefixUrl);
            _context3.next = 3;
            // Fetch the root-level robots.txt that the parser is told about.
            // Using `prefixUrl` as got's prefix would append "robots.txt" to
            // the page path (e.g. /blog/post/robots.txt) whenever the URL
            // being checked is not a bare origin.
            return got(robotsUrl.href, {
              throwHttpErrors: false,
              agent: {
                http: new HttpAgent(),
                https: new HttpsAgent()
              },
              timeout: 10 * 1000
            });

          case 3:
            site = _context3.sent;
            robots = robotsParser(robotsUrl, site.body);
            return _context3.abrupt("return", robots.isAllowed(prefixUrl, userAgent));

          case 6:
          case "end":
            return _context3.stop();
        }
      }
    }, _callee3);
  }));
  return _robotsAllowed.apply(this, arguments);
}
904
-
905
// Rule names that are always loaded; each is resolved as the package
// "metascraper-<name>" when metadata parsing builds its rule list.
var defaultRules = ['author', 'clearbit', 'date', 'description', 'image', 'lang', 'logo-favicon', 'publisher', 'readability', 'title', 'url'];

// Default sanitizer configuration: the set of HTML tags that are kept.
// Each tag is listed once ('main' was previously duplicated).
var defaultSanitizeOptions = {
  allowedTags: [
    // sectioning and headings
    'article', 'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup', 'main', 'section',
    // block content
    'blockquote', 'dd', 'div', 'dl', 'dt', 'figcaption', 'figure', 'hr', 'li', 'ol', 'p', 'pre', 'ul',
    // inline text
    'a', 'abbr', 'b', 'br', 'cite', 'code', 'data', 'dfn', 'em', 'i', 'mark', 'q', 's', 'samp', 'small', 'span', 'strong', 'sub', 'sup', 'time', 'u', 'var', 'wbr',
    // tables
    'caption', 'col', 'colgroup', 'table', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr',
    // media
    'img'
  ]
};

// Defaults merged under caller-supplied options.
var defaultOptions = {
  // Extra metascraper rule names appended to defaultRules.
  metascraperRules: [],
  // NOTE(review): unit is not visible in this chunk (presumably seconds) — confirm against the consumer.
  timeout: 60,
  sanitizeOptions: defaultSanitizeOptions
};
914
-
915
- function parseMetadata(_x2, _x3, _x4) {
916
- return _parseMetadata.apply(this, arguments);
917
- }
918
-
919
- function _parseMetadata() {
920
- _parseMetadata = _asyncToGenerator( /*#__PURE__*/runtime_1.mark(function _callee4(url, html, options) {
921
- var parsedUrl, _ref4, metascraperRules, sanitizeOptions, rules, scraper, $, metadata, doc, data, jsdom, article, content, links, tags, embeds, code, title, text;
922
-
923
- return runtime_1.wrap(function _callee4$(_context4) {
924
- while (1) {
925
- switch (_context4.prev = _context4.next) {
926
- case 0:
927
- parsedUrl = new URL(url);
928
- _ref4 = options ? _extends({}, defaultOptions, options) : defaultOptions, metascraperRules = _ref4.metascraperRules, sanitizeOptions = _ref4.sanitizeOptions;
929
- rules = [].concat(defaultRules, metascraperRules).map(function (rule) {
930
- return require("metascraper-" + rule)();
931
- });
932
- scraper = metascraper(rules);
933
- $ = cheerio.load(html);
934
- _context4.next = 7;
935
- return scraper({
936
- html: html,
937
- url: url
938
- });
939
-
940
- case 7:
941
- metadata = _context4.sent;
942
- // console.log([...rules], metadata)
943
- doc = createWindow(html).document;
944
- data = getMetadata(doc, url);
945
- jsdom = new JSDOM(html, {
946
- url: url
947
- });
948
- article = new Readability(jsdom.window.document).parse();
949
- content = sanitize((article == null ? void 0 : article.content) || '', sanitizeOptions).replace(/(\r\n|\n|\r)/gm, '').trim();
950
- links = [];
951
- cheerio.load(content)('a').each(function (_, link) {
952
- links.push({
953
- href: $(link).attr('href'),
954
- text: $(link).text()
955
- });
956
- });
957
- tags = [];
958
- $("a[href*='/t/'],a[href*='/tag/'], a[href*='/tags/'], a[href*='/topic/'],a[href*='/tagged/'], a[href*='?keyword=']").each(function (_, link) {
959
- tags.push($(link).text());
960
- });
961
- embeds = extractEmbeds($);
962
- code = extractCodeSnippets($);
963
- title = get(metadata, 'title') || (article == null ? void 0 : article.title); // console.dir({ defaultRules, rules });
964
-
965
- text = sanitize(content || '', {
966
- allowedTags: [],
967
- allowedAttributes: {}
968
- });
969
- return _context4.abrupt("return", _extends({
970
- html: html,
971
- content: content
972
- }, metadata, {
973
- author: get(metadata, 'author') || (article == null ? void 0 : article.byline),
974
- favicon: get(data, 'icon'),
975
- publisher: get(metadata, 'publisher') || (article == null ? void 0 : article.siteName) || data.provider,
976
- description: get(metadata, 'description') || (article == null ? void 0 : article.excerpt) || data.description,
977
- lang: get(metadata, 'lang') || get(data, 'lang'),
978
- url: get(data, 'url') || get(metadata, 'url'),
979
- text: text,
980
- embeds: embeds,
981
- code: code,
982
- tags: uniq(tags),
983
- source: parsedUrl.hostname,
984
- twitter: extractTwitterMeta($),
985
- title: title,
986
- links: links,
987
- keywords: get(data, 'keywords', [])
988
- }));
989
-
990
- case 22:
991
- case "end":
992
- return _context4.stop();
993
- }
994
- }
995
- }, _callee4);
996
- }));
997
- return _parseMetadata.apply(this, arguments);
998
- }
999
-
1000
- var scrape = /*#__PURE__*/function () {
1001
- var _ref = /*#__PURE__*/_asyncToGenerator( /*#__PURE__*/runtime_1.mark(function _callee(url, options) {
1002
- var _ref2, timeout, valid, isAllowed, _yield$got, html;
1003
-
1004
- return runtime_1.wrap(function _callee$(_context) {
1005
- while (1) {
1006
- switch (_context.prev = _context.next) {
1007
- case 0:
1008
- _ref2 = options ? _extends({}, defaultOptions, options) : defaultOptions, timeout = _ref2.timeout;
1009
- valid = isUri(url);
1010
-
1011
- if (valid) {
1012
- _context.next = 4;
1013
- break;
1014
- }
1015
-
1016
- throw new Error('Invalid URL');
1017
-
1018
- case 4:
1019
- if (!valid) {
1020
- _context.next = 19;
1021
- break;
1022
- }
1023
-
1024
- _context.next = 7;
1025
- return robotsAllowed(url);
1026
-
1027
- case 7:
1028
- isAllowed = _context.sent;
1029
-
1030
- if (!isAllowed) {
1031
- _context.next = 18;
1032
- break;
1033
- }
1034
-
1035
- _context.next = 11;
1036
- return got(url, {
1037
- headers: {
1038
- 'User-Agent': userAgent
1039
- },
1040
- agent: {
1041
- http: new HttpAgent(),
1042
- https: new HttpsAgent()
1043
- },
1044
- timeout: timeout * 1000
1045
- });
1046
-
1047
- case 11:
1048
- _yield$got = _context.sent;
1049
- html = _yield$got.body;
1050
- _context.next = 15;
1051
- return parseMetadata(url, html, options);
1052
-
1053
- case 15:
1054
- return _context.abrupt("return", _context.sent);
1055
-
1056
- case 18:
1057
- throw new Error('Robots.txt disallowed');
1058
-
1059
- case 19:
1060
- return _context.abrupt("return", null);
1061
-
1062
- case 20:
1063
- case "end":
1064
- return _context.stop();
1065
- }
1066
- }
1067
- }, _callee);
1068
- }));
1069
-
1070
- return function scrape(_x5, _x6) {
1071
- return _ref.apply(this, arguments);
1072
- };
1073
- }();
1074
- var scrapeHtml = /*#__PURE__*/function () {
1075
- var _ref3 = /*#__PURE__*/_asyncToGenerator( /*#__PURE__*/runtime_1.mark(function _callee2(url, html, options) {
1076
- var valid, metadata;
1077
- return runtime_1.wrap(function _callee2$(_context2) {
1078
- while (1) {
1079
- switch (_context2.prev = _context2.next) {
1080
- case 0:
1081
- valid = isUri(url);
1082
-
1083
- if (valid) {
1084
- _context2.next = 3;
1085
- break;
1086
- }
1087
-
1088
- throw new Error('Invalid URL');
1089
-
1090
- case 3:
1091
- if (!valid) {
1092
- _context2.next = 8;
1093
- break;
1094
- }
1095
-
1096
- _context2.next = 6;
1097
- return parseMetadata(url, html, options ? _extends({}, defaultOptions, options) : defaultOptions);
1098
-
1099
- case 6:
1100
- metadata = _context2.sent;
1101
- return _context2.abrupt("return", metadata);
1102
-
1103
- case 8:
1104
- return _context2.abrupt("return", null);
1105
-
1106
- case 9:
1107
- case "end":
1108
- return _context2.stop();
1109
- }
1110
- }
1111
- }, _callee2);
1112
- }));
1113
-
1114
- return function scrapeHtml(_x7, _x8, _x9) {
1115
- return _ref3.apply(this, arguments);
1116
- };
1117
- }();
1118
-
1119
- export { getEmbedAttrs, scrape, scrapeHtml };
1120
- //# sourceMappingURL=scrapex.esm.js.map