webscout 5.5__py3-none-any.whl → 5.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic; review the changes below before upgrading.

Files changed (46)
  1. webscout/Agents/Onlinesearcher.py +3 -3
  2. webscout/Agents/__init__.py +0 -1
  3. webscout/Agents/functioncall.py +3 -3
  4. webscout/Provider/Bing.py +243 -0
  5. webscout/Provider/Chatify.py +1 -1
  6. webscout/Provider/Cloudflare.py +1 -1
  7. webscout/Provider/DARKAI.py +1 -1
  8. webscout/Provider/DiscordRocks.py +109 -246
  9. webscout/Provider/Farfalle.py +1 -1
  10. webscout/Provider/Free2GPT.py +234 -0
  11. webscout/{Agents/ai.py → Provider/GPTWeb.py} +40 -33
  12. webscout/Provider/Llama3.py +65 -62
  13. webscout/Provider/OLLAMA.py +1 -1
  14. webscout/Provider/PizzaGPT.py +1 -1
  15. webscout/Provider/RUBIKSAI.py +13 -3
  16. webscout/Provider/TTI/Nexra.py +120 -0
  17. webscout/Provider/TTI/__init__.py +3 -1
  18. webscout/Provider/TTI/blackboximage.py +153 -0
  19. webscout/Provider/TTI/deepinfra.py +2 -2
  20. webscout/Provider/TeachAnything.py +1 -1
  21. webscout/Provider/Youchat.py +1 -1
  22. webscout/Provider/__init__.py +11 -6
  23. webscout/Provider/{NetFly.py → aigames.py} +76 -79
  24. webscout/Provider/cleeai.py +1 -1
  25. webscout/Provider/elmo.py +1 -1
  26. webscout/Provider/felo_search.py +1 -1
  27. webscout/Provider/genspark.py +1 -1
  28. webscout/Provider/julius.py +7 -1
  29. webscout/Provider/lepton.py +1 -1
  30. webscout/Provider/meta.py +2 -2
  31. webscout/Provider/turboseek.py +1 -1
  32. webscout/Provider/upstage.py +230 -0
  33. webscout/Provider/x0gpt.py +1 -1
  34. webscout/Provider/xdash.py +1 -1
  35. webscout/Provider/yep.py +2 -2
  36. webscout/__init__.py +1 -0
  37. webscout/requestsHTMLfix.py +775 -0
  38. webscout/version.py +1 -1
  39. webscout/webai.py +1 -1
  40. {webscout-5.5.dist-info → webscout-5.7.dist-info}/METADATA +5 -29
  41. {webscout-5.5.dist-info → webscout-5.7.dist-info}/RECORD +45 -40
  42. webscout/Provider/ThinkAnyAI.py +0 -219
  43. {webscout-5.5.dist-info → webscout-5.7.dist-info}/LICENSE.md +0 -0
  44. {webscout-5.5.dist-info → webscout-5.7.dist-info}/WHEEL +0 -0
  45. {webscout-5.5.dist-info → webscout-5.7.dist-info}/entry_points.txt +0 -0
  46. {webscout-5.5.dist-info → webscout-5.7.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,775 @@
1
+ import sys
2
+ import asyncio
3
+ from urllib.parse import urlparse, urlunparse, urljoin
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from concurrent.futures._base import TimeoutError
6
+ from functools import partial
7
+ from typing import Set, Union, List, MutableMapping, Optional
8
+
9
+ import pyppeteer
10
+ import requests
11
+ from pyquery import PyQuery
12
+
13
+ from fake_useragent import UserAgent
14
+ from lxml_html_clean import Cleaner
15
+ import lxml
16
+ from lxml import etree
17
+ from lxml.html import HtmlElement
18
+ from lxml.html import tostring as lxml_html_tostring
19
+ from lxml.html.soupparser import fromstring as soup_parse
20
+ from parse import search as parse_search
21
+ from parse import findall, Result
22
+ from w3lib.encoding import html_to_unicode
23
+
24
DEFAULT_ENCODING = 'utf-8'
DEFAULT_URL = 'https://example.org/'
DEFAULT_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8'
DEFAULT_NEXT_SYMBOL = ['next', 'more', 'older']

# Shared sanitizer: strips <script> and <style> content when ``clean=True``
# is requested in find()/xpath().
cleaner = Cleaner()
cleaner.javascript = True
cleaner.style = True

# Lazily-instantiated fake_useragent.UserAgent; see user_agent().
useragent = None

# Typing aliases used throughout the parser classes.
_Find = Union[List['Element'], 'Element']
_XPath = Union[List[str], List['Element'], str, 'Element']
_Result = Union[List['Result'], 'Result']
_HTML = Union[str, bytes]
_BaseHTML = str
_UserAgent = str
_DefaultEncoding = str
_URL = str
_RawHTML = bytes
_Encoding = str
_LXML = HtmlElement
_Text = str
_Search = Result
_Containing = Union[str, List[str]]
_Links = Set[str]
_Attrs = MutableMapping
_Next = Union['HTML', List[str]]
_NextSymbol = List[str]

# Sanity check: refuse to import on unsupported interpreters.
try:
    assert sys.version_info.major == 3
    assert sys.version_info.minor > 5
except AssertionError:
    raise RuntimeError('Requests-HTML requires Python 3.6+!')
61
+
62
+
63
class MaxRetries(Exception):
    """Raised when render()/arender() exhausts its retry budget
    without obtaining page content."""

    def __init__(self, message):
        # Keep the message accessible to callers as an attribute.
        self.message = message
67
+
68
+
69
class BaseParser:
    """A basic HTML/Element Parser, for Humans.

    :param element: The element from which to base the parsing upon.
    :param default_encoding: Which encoding to default to.
    :param html: HTML from which to base the parsing upon (optional).
    :param url: The URL from which the HTML originated, used for ``absolute_links``.
    """

    def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None:
        self.element = element
        self.url = url
        self.skip_anchors = True
        self.default_encoding = default_encoding
        self._encoding = None
        # Markup is held internally as bytes; unicode input is encoded up front.
        self._html = html.encode(DEFAULT_ENCODING) if isinstance(html, str) else html
        self._lxml = None
        self._pq = None

    @property
    def raw_html(self) -> _RawHTML:
        """Bytes representation of the HTML content.
        (`learn more <http://www.diveintopython3.net/strings.html>`_).
        """
        if self._html:
            return self._html
        # No stored markup: serialise the lxml element instead.
        return etree.tostring(self.element, encoding='unicode').strip().encode(self.encoding)

    @raw_html.setter
    def raw_html(self, html: bytes) -> None:
        """Property setter for self.html."""
        self._html = html

    @property
    def html(self) -> _BaseHTML:
        """Unicode representation of the HTML content
        (`learn more <http://www.diveintopython3.net/strings.html>`_).
        """
        if self._html:
            return self.raw_html.decode(self.encoding, errors='replace')
        return etree.tostring(self.element, encoding='unicode').strip()

    @html.setter
    def html(self, html: str) -> None:
        self._html = html.encode(self.encoding)

    @property
    def encoding(self) -> _Encoding:
        """The encoding string to be used, extracted from the HTML and
        :class:`HTMLResponse <HTMLResponse>` headers.
        """
        if self._encoding:
            return self._encoding

        # Scan meta tags for a charset declaration.
        if self._html:
            self._encoding = html_to_unicode(self.default_encoding, self._html)[0]
            # Fall back to the caller-supplied encoding if decoding fails.
            try:
                self.raw_html.decode(self.encoding, errors='replace')
            except UnicodeDecodeError:
                self._encoding = self.default_encoding

        return self._encoding if self._encoding else self.default_encoding

    @encoding.setter
    def encoding(self, enc: str) -> None:
        """Property setter for self.encoding."""
        self._encoding = enc

    @property
    def pq(self) -> PyQuery:
        """`PyQuery <https://pythonhosted.org/pyquery/>`_ representation
        of the :class:`Element <Element>` or :class:`HTML <HTML>`.
        """
        if self._pq is None:
            self._pq = PyQuery(self.lxml)
        return self._pq

    @property
    def lxml(self) -> HtmlElement:
        """`lxml <http://lxml.de>`_ representation of the
        :class:`Element <Element>` or :class:`HTML <HTML>`.
        """
        if self._lxml is None:
            # Prefer the forgiving soup parser; fall back to raw lxml.
            try:
                self._lxml = soup_parse(self.html, features='html.parser')
            except ValueError:
                self._lxml = lxml.html.fromstring(self.raw_html)
        return self._lxml

    @property
    def text(self) -> _Text:
        """The text content of the
        :class:`Element <Element>` or :class:`HTML <HTML>`.
        """
        return self.pq.text()

    @property
    def full_text(self) -> _Text:
        """The full text content (including links) of the
        :class:`Element <Element>` or :class:`HTML <HTML>`.
        """
        return self.lxml.text_content()

    def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find:
        """Given a CSS Selector, returns a list of
        :class:`Element <Element>` objects or a single one.

        :param selector: CSS Selector to use.
        :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
        :param containing: If specified, only return elements that contain the provided text.
        :param first: Whether or not to return just the first result.
        :param _encoding: The encoding format.

        If ``first`` is ``True``, only returns the first
        :class:`Element <Element>` found.
        """
        # Accept a bare string for ``containing``.
        if isinstance(containing, str):
            containing = [containing]

        encoding = _encoding or self.encoding
        elements = [
            Element(element=found, url=self.url, default_encoding=encoding)
            for found in self.pq(selector)
        ]

        if containing:
            needles = [c.lower() for c in containing]
            matched = []
            for element in elements:
                haystack = element.full_text.lower()
                if any(needle in haystack for needle in needles):
                    matched.append(element)
            elements = matched
            elements.reverse()

        # Sanitize the found HTML in place.
        if clean:
            sanitized = []
            for element in elements:
                element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
                sanitized.append(element)
            elements = sanitized

        return _get_first_or_list(elements, first)

    def xpath(self, selector: str, *, clean: bool = False, first: bool = False, _encoding: str = None) -> _XPath:
        """Given an XPath selector, returns a list of
        :class:`Element <Element>` objects or a single one.

        :param selector: XPath Selector to use.
        :param clean: Whether or not to sanitize the found HTML of ``<script>`` and ``<style>`` tags.
        :param first: Whether or not to return just the first result.
        :param _encoding: The encoding format.

        If a sub-selector is specified (e.g. ``//a/@href``), a simple
        list of results is returned.

        If ``first`` is ``True``, only returns the first
        :class:`Element <Element>` found.
        """
        selected = self.lxml.xpath(selector)

        # Attribute/text results come back as unicode results, not elements.
        elements = [
            str(selection)
            if isinstance(selection, etree._ElementUnicodeResult)
            else Element(element=selection, url=self.url, default_encoding=_encoding or self.encoding)
            for selection in selected
        ]

        # Sanitize the found HTML in place.
        if clean:
            sanitized = []
            for element in elements:
                element.raw_html = lxml_html_tostring(cleaner.clean_html(element.lxml))
                sanitized.append(element)
            elements = sanitized

        return _get_first_or_list(elements, first)

    def search(self, template: str) -> Result:
        """Search the :class:`Element <Element>` for the given Parse template.

        :param template: The Parse template to use.
        """
        return parse_search(template, self.html)

    def search_all(self, template: str) -> _Result:
        """Search the :class:`Element <Element>` (multiple times) for the given parse
        template.

        :param template: The Parse template to use.
        """
        return [r for r in findall(template, self.html)]

    @property
    def links(self) -> _Links:
        """All found links on page, in as-is form."""

        def gen():
            for link in self.find('a'):
                try:
                    href = link.attrs['href'].strip()
                except KeyError:
                    # Anchor without an href attribute.
                    continue
                if not href:
                    continue
                # Skip in-page anchors and non-navigational schemes.
                if href.startswith('#') and self.skip_anchors:
                    continue
                if href.startswith(('javascript:', 'mailto:')):
                    continue
                yield href

        return set(gen())

    def _make_absolute(self, link):
        """Makes a given link absolute."""

        # Parse the link with stdlib.
        parsed = urlparse(link)._asdict()

        # Relative link: join it with base_url.
        if not parsed['netloc']:
            return urljoin(self.base_url, link)

        # Absolute but schemeless: borrow the scheme from base_url.
        if not parsed['scheme']:
            parsed['scheme'] = urlparse(self.base_url).scheme
            # Reconstruct the URL to incorporate the new scheme.
            parsed = (v for v in parsed.values())
            return urlunparse(parsed)

        # Absolute and complete with scheme; nothing to be done here.
        return link

    @property
    def absolute_links(self) -> _Links:
        """All found links on page, in absolute form
        (`learn more <https://www.navegabem.com/absolute-or-relative-links.html>`_).
        """

        def gen():
            for link in self.links:
                yield self._make_absolute(link)

        return set(gen())

    @property
    def base_url(self) -> _URL:
        """The base URL for the page. Supports the ``<base>`` tag
        (`learn more <https://www.w3schools.com/tags/tag_base.asp>`_)."""

        # Honour an explicit <base href="..."> when present.
        base = self.find('base', first=True)
        if base:
            result = base.attrs.get('href', '').strip()
            if result:
                return result

        # Otherwise derive the base from the page URL: drop everything
        # after the last '/' in the path.
        parsed = urlparse(self.url)._asdict()
        parsed['path'] = '/'.join(parsed['path'].split('/')[:-1]) + '/'
        parsed = (v for v in parsed.values())
        return urlunparse(parsed)
363
+
364
+
365
class Element(BaseParser):
    """An element of HTML.

    :param element: The element from which to base the parsing upon.
    :param url: The URL from which the HTML originated, used for ``absolute_links``.
    :param default_encoding: Which encoding to default to.
    """

    # NOTE(review): BaseParser defines no __slots__, so instances still get a
    # __dict__ from the base class; this list is kept for parity with upstream.
    __slots__ = [
        'element', 'url', 'skip_anchors', 'default_encoding', '_encoding',
        '_html', '_lxml', '_pq', '_attrs', 'session'
    ]

    def __init__(self, *, element, url: _URL, default_encoding: _DefaultEncoding = None) -> None:
        super(Element, self).__init__(element=element, url=url, default_encoding=default_encoding)
        self.element = element
        self.tag = element.tag
        self.lineno = element.sourceline
        self._attrs = None

    def __repr__(self) -> str:
        attrs = ['{}={}'.format(attr, repr(self.attrs[attr])) for attr in self.attrs]
        return "<Element {} {}>".format(repr(self.element.tag), ' '.join(attrs))

    @property
    def attrs(self) -> _Attrs:
        """Returns a dictionary of the attributes of the :class:`Element <Element>`
        (`learn more <https://www.w3schools.com/tags/ref_attributes.asp>`_).
        """
        if self._attrs is None:
            self._attrs = {k: v for k, v in self.element.items()}

            # Split class and rel up, as there are usually many of them:
            for attr in ['class', 'rel']:
                if attr in self._attrs:
                    self._attrs[attr] = tuple(self._attrs[attr].split())

        return self._attrs
403
+
404
+
405
class HTML(BaseParser):
    """An HTML document, ready for parsing.

    :param url: The URL from which the HTML originated, used for ``absolute_links``.
    :param html: HTML from which to base the parsing upon (optional).
    :param default_encoding: Which encoding to default to.
    """

    def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING, async_: bool = False) -> None:

        # PyQuery prefers bytes; normalise unicode input up front.
        if isinstance(html, str):
            html = html.encode(DEFAULT_ENCODING)

        super(HTML, self).__init__(
            # Fall back to wrapping the fragment when no <html> root exists.
            element=PyQuery(html)('html') or PyQuery(f'<html>{html}</html>')('html'),
            html=html,
            url=url,
            default_encoding=default_encoding
        )
        self.session = session or async_ and AsyncHTMLSession() or HTMLSession()
        self.page = None
        self.next_symbol = DEFAULT_NEXT_SYMBOL

    def __repr__(self) -> str:
        return f"<HTML url={self.url!r}>"

    def next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next:
        """Attempts to find the next page, if there is one. If ``fetch``
        is ``True`` (default), returns :class:`HTML <HTML>` object of
        next page. If ``fetch`` is ``False``, simply returns the next URL.
        """

        def get_next():
            candidates = self.find('a', containing=next_symbol)

            for candidate in candidates:
                if candidate.attrs.get('href'):
                    # Support 'next' rel (e.g. reddit).
                    if 'next' in candidate.attrs.get('rel', []):
                        return candidate.attrs['href']

                    # Support 'next' in classnames.
                    for _class in candidate.attrs.get('class', []):
                        if 'next' in _class:
                            return candidate.attrs['href']

                    if 'page' in candidate.attrs['href']:
                        return candidate.attrs['href']

            try:
                # Resort to the last candidate.
                return candidates[-1].attrs['href']
            except IndexError:
                return None

        next_url = get_next()
        if not next_url:
            return None

        url = self._make_absolute(next_url)
        if fetch:
            return self.session.get(url)
        return url

    def __iter__(self):
        next = self
        while True:
            yield next
            try:
                next = next.next(fetch=True, next_symbol=self.next_symbol).html
            except AttributeError:
                # next() returned None: no further pages.
                break

    def __next__(self):
        return self.next(fetch=True, next_symbol=self.next_symbol).html

    def __aiter__(self):
        return self

    async def __anext__(self):
        while True:
            url = self.next(fetch=False, next_symbol=self.next_symbol)
            if not url:
                break
            response = await self.session.get(url)
            return response.html

    def add_next_symbol(self, next_symbol):
        self.next_symbol.append(next_symbol)

    async def _async_render(self, *, url: str, script: str = None, scrolldown, sleep: int, wait: float, reload, content: Optional[str], timeout: Union[float, int], keep_page: bool):
        """ Handle page creation and js rendering. Internal use for render/arender methods. """
        try:
            page = await self.browser.newPage()

            # Wait before rendering the page, to prevent timeouts.
            await asyncio.sleep(wait)

            # Load the given page (GET request, obviously.)
            if reload:
                await page.goto(url, options={'timeout': int(timeout * 1000)})
            else:
                await page.goto(f'data:text/html,{self.html}', options={'timeout': int(timeout * 1000)})

            result = None
            if script:
                result = await page.evaluate(script)

            if scrolldown:
                for _ in range(scrolldown):
                    await page._keyboard.down('PageDown')
                    await asyncio.sleep(sleep)
            else:
                await asyncio.sleep(sleep)

            if scrolldown:
                await page._keyboard.up('PageDown')

            # Return the content of the page, JavaScript evaluated.
            content = await page.content()
            if not keep_page:
                await page.close()
                page = None
            return content, result, page
        except TimeoutError:
            await page.close()
            page = None
            return None

    def render(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False):
        """Reloads the response in Chromium, and replaces HTML content
        with an updated version, with JavaScript executed.

        :param retries: The number of times to retry loading the page in Chromium.
        :param script: JavaScript to execute upon page load (optional).
        :param wait: The number of seconds to wait before loading the page, preventing timeouts (optional).
        :param scrolldown: Integer, if provided, of how many times to page down.
        :param sleep: Integer, if provided, of how many long to sleep after initial render.
        :param reload: If ``False``, content will not be loaded from the browser, but will be provided from memory.
        :param keep_page: If ``True`` will allow you to interact with the browser page through ``r.html.page``.

        If ``scrolldown`` is specified, the page will scrolldown the specified
        number of times, after sleeping the specified amount of time
        (e.g. ``scrolldown=10, sleep=1``).

        If just ``sleep`` is provided, the rendering will wait *n* seconds, before
        returning.

        If ``script`` is specified, it will execute the provided JavaScript at
        runtime, and this method returns its return value (if any).

        Warning: the first time you run this method, it will download
        Chromium into your home directory (``~/.pyppeteer``).
        """

        # Automatically create an event loop and browser.
        self.browser = self.session.browser
        content = None

        # Automatically set reload to False, if the example URL is being used.
        if self.url == DEFAULT_URL:
            reload = False

        for _ in range(retries):
            if not content:
                try:
                    # A failed render returns None; unpacking it raises
                    # TypeError, which triggers the next retry.
                    content, result, page = self.session.loop.run_until_complete(self._async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page))
                except TypeError:
                    pass
            else:
                break

        if not content:
            raise MaxRetries("Unable to render the page. Try increasing timeout")

        # Swap this object's state for the freshly rendered document.
        html = HTML(url=self.url, html=content.encode(DEFAULT_ENCODING), default_encoding=DEFAULT_ENCODING)
        self.__dict__.update(html.__dict__)
        self.page = page
        return result

    async def arender(self, retries: int = 8, script: str = None, wait: float = 0.2, scrolldown=False, sleep: int = 0, reload: bool = True, timeout: Union[float, int] = 8.0, keep_page: bool = False):
        """ Async version of render. Takes same parameters. """

        self.browser = await self.session.browser
        content = None

        # Automatically set reload to False, if the example URL is being used.
        if self.url == DEFAULT_URL:
            reload = False

        for _ in range(retries):
            if not content:
                try:
                    content, result, page = await self._async_render(url=self.url, script=script, sleep=sleep, wait=wait, content=self.html, reload=reload, scrolldown=scrolldown, timeout=timeout, keep_page=keep_page)
                except TypeError:
                    pass
            else:
                break

        if not content:
            raise MaxRetries("Unable to render the page. Try increasing timeout")

        html = HTML(url=self.url, html=content.encode(DEFAULT_ENCODING), default_encoding=DEFAULT_ENCODING)
        self.__dict__.update(html.__dict__)
        self.page = page
        return result
641
class HTMLResponse(requests.Response):
    """An HTML-enabled :class:`requests.Response <requests.Response>` object.
    Effectively the same, but with an intelligent ``.html`` property added.
    """

    def __init__(self, session: Union['HTMLSession', 'AsyncHTMLSession']) -> None:
        super(HTMLResponse, self).__init__()
        self._html = None  # type: HTML
        self.session = session

    @property
    def html(self) -> HTML:
        # Parse lazily and cache on first access.
        if not self._html:
            self._html = HTML(session=self.session, url=self.url, html=self.content, default_encoding=self.encoding)
        return self._html

    @classmethod
    def _from_response(cls, response, session: Union['HTMLSession', 'AsyncHTMLSession']):
        """Clone a plain requests.Response into an HTMLResponse."""
        html_r = cls(session=session)
        html_r.__dict__.update(response.__dict__)
        return html_r
663
+
664
+
665
def user_agent(style=None) -> _UserAgent:
    """Returns an apparently legit user-agent, if not requested one of a specific
    style. Defaults to a Chrome-style User-Agent.
    """
    global useragent
    if style:
        # Instantiate fake_useragent lazily, and only once.
        if not useragent:
            useragent = UserAgent()
        return useragent[style]
    return DEFAULT_USER_AGENT
674
+
675
+
676
+ def _get_first_or_list(l, first=False):
677
+ if first:
678
+ try:
679
+ return l[0]
680
+ except IndexError:
681
+ return None
682
+ else:
683
+ return l
684
+
685
+
686
class BaseSession(requests.Session):
    """ A consumable session, for cookie persistence and connection pooling,
    amongst other things.

    :param mock_browser: When ``True``, send a browser-like User-Agent header.
    :param verify: Passed through to requests; also disables HTTPS errors in
        the pyppeteer browser when ``False``.
    :param browser_args: Extra command-line args for the Chromium launch;
        defaults to ``['--no-sandbox']``.
    """

    def __init__(self, mock_browser : bool = True, verify : bool = True,
                 browser_args : list = None):
        super().__init__()

        # FIX: the previous signature used a mutable list literal as the
        # default, which is shared across every instance; use a None
        # sentinel and build a fresh list per session instead.
        if browser_args is None:
            browser_args = ['--no-sandbox']

        # Mock a web browser's user agent.
        if mock_browser:
            self.headers['User-Agent'] = user_agent()

        # Wrap every response in an HTMLResponse.
        self.hooks['response'].append(self.response_hook)
        self.verify = verify

        self.__browser_args = browser_args

    def response_hook(self, response, **kwargs) -> HTMLResponse:
        """ Change response enconding and replace it by a HTMLResponse. """
        if not response.encoding:
            response.encoding = DEFAULT_ENCODING
        return HTMLResponse._from_response(response, self)

    @property
    async def browser(self):
        """Lazily launch (and cache) a headless Chromium via pyppeteer."""
        if not hasattr(self, "_browser"):
            self._browser = await pyppeteer.launch(ignoreHTTPSErrors=not(self.verify), headless=True, args=self.__browser_args)

        return self._browser
717
+
718
+
719
class HTMLSession(BaseSession):
    """Synchronous session; drives the browser through its own event loop."""

    def __init__(self, **kwargs):
        super(HTMLSession, self).__init__(**kwargs)

    @property
    def browser(self):
        """Synchronous wrapper around BaseSession.browser."""
        if not hasattr(self, "_browser"):
            self.loop = asyncio.get_event_loop()
            # run_until_complete below would deadlock inside a live loop.
            if self.loop.is_running():
                raise RuntimeError("Cannot use HTMLSession within an existing event loop. Use AsyncHTMLSession instead.")
            self._browser = self.loop.run_until_complete(super().browser)
        return self._browser

    def close(self):
        """ If a browser was created close it first. """
        if hasattr(self, "_browser"):
            self.loop.run_until_complete(self._browser.close())
        super().close()
738
+
739
+
740
class AsyncHTMLSession(BaseSession):
    """ An async consumable session. """

    def __init__(self, loop=None, workers=None,
                 mock_browser: bool = True, *args, **kwargs):
        """ Set or create an event loop and a thread pool.

        :param loop: Asyncio loop to use.
        :param workers: Amount of threads to use for executing async calls.
            If not pass it will default to the number of processors on the
            machine, multiplied by 5. """
        # NOTE(review): ``mock_browser`` is accepted but not forwarded to
        # BaseSession (upstream behavior preserved) — confirm before relying
        # on mock_browser=False here.
        super().__init__(*args, **kwargs)

        self.loop = loop or asyncio.get_event_loop()
        self.thread_pool = ThreadPoolExecutor(max_workers=workers)

    def request(self, *args, **kwargs):
        """ Partial original request func and run it in a thread. """
        func = partial(super().request, *args, **kwargs)
        return self.loop.run_in_executor(self.thread_pool, func)

    async def close(self):
        """ If a browser was created close it first. """
        if hasattr(self, "_browser"):
            await self._browser.close()
        super().close()

    def run(self, *coros):
        """ Pass in all the coroutines you want to run, it will wrap each one
        in a task, run it and wait for the result. Return a list with all
        results, this is returned in the same order coros are passed in. """
        tasks = [
            asyncio.ensure_future(coro()) for coro in coros
        ]
        self.loop.run_until_complete(asyncio.wait(tasks))
        # FIX: iterate the original task list, not the unordered ``done``
        # set returned by asyncio.wait — otherwise results came back in
        # completion order, contradicting the documented contract above.
        return [task.result() for task in tasks]