scrapling-0.2.9.tar.gz → scrapling-0.2.91.tar.gz

Files changed (54)
  1. {scrapling-0.2.9/scrapling.egg-info → scrapling-0.2.91}/PKG-INFO +2 -3
  2. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/__init__.py +1 -1
  3. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/_types.py +2 -0
  4. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/camo.py +44 -18
  5. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/pw.py +47 -18
  6. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/custom.py +1 -5
  7. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/fetchers.py +7 -7
  8. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/parser.py +1 -1
  9. {scrapling-0.2.9 → scrapling-0.2.91/scrapling.egg-info}/PKG-INFO +2 -3
  10. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/requires.txt +1 -1
  11. {scrapling-0.2.9 → scrapling-0.2.91}/setup.cfg +1 -1
  12. {scrapling-0.2.9 → scrapling-0.2.91}/setup.py +2 -3
  13. {scrapling-0.2.9 → scrapling-0.2.91}/LICENSE +0 -0
  14. {scrapling-0.2.9 → scrapling-0.2.91}/MANIFEST.in +0 -0
  15. {scrapling-0.2.9 → scrapling-0.2.91}/README.md +0 -0
  16. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/__init__.py +0 -0
  17. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/custom_types.py +0 -0
  18. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/mixins.py +0 -0
  19. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/storage_adaptors.py +0 -0
  20. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/translator.py +0 -0
  21. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/utils.py +0 -0
  22. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/defaults.py +0 -0
  23. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/__init__.py +0 -0
  24. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/constants.py +0 -0
  25. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/static.py +0 -0
  26. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/__init__.py +0 -0
  27. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  28. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  29. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  30. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  31. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  32. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  33. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  34. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  35. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/navigation.py +0 -0
  36. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling/py.typed +0 -0
  37. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/SOURCES.txt +0 -0
  38. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/dependency_links.txt +0 -0
  39. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/not-zip-safe +0 -0
  40. {scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/top_level.txt +0 -0
  41. {scrapling-0.2.9 → scrapling-0.2.91}/tests/__init__.py +0 -0
  42. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/__init__.py +0 -0
  43. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/async/__init__.py +0 -0
  44. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/async/test_camoufox.py +0 -0
  45. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/async/test_httpx.py +0 -0
  46. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/async/test_playwright.py +0 -0
  47. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/sync/__init__.py +0 -0
  48. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/sync/test_camoufox.py +0 -0
  49. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/sync/test_httpx.py +0 -0
  50. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/sync/test_playwright.py +0 -0
  51. {scrapling-0.2.9 → scrapling-0.2.91}/tests/fetchers/test_utils.py +0 -0
  52. {scrapling-0.2.9 → scrapling-0.2.91}/tests/parser/__init__.py +0 -0
  53. {scrapling-0.2.9 → scrapling-0.2.91}/tests/parser/test_automatch.py +0 -0
  54. {scrapling-0.2.9 → scrapling-0.2.91}/tests/parser/test_general.py +0 -0
{scrapling-0.2.9/scrapling.egg-info → scrapling-0.2.91}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.9
+Version: 0.2.91
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -38,7 +37,7 @@ Requires-Dist: cssselect>=1.2
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
-Requires-Dist: httpx[brotli,zstd]
+Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
 Requires-Dist: camoufox[geoip]>=0.4.9

{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/__init__.py

@@ -5,7 +5,7 @@ from scrapling.fetchers import (AsyncFetcher, CustomFetcher, Fetcher,
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.9"
+__version__ = "0.2.91"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 

{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/core/_types.py

@@ -5,6 +5,8 @@ Type definitions for type checking purposes.
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
                     List, Literal, Optional, Pattern, Tuple, Type, Union)
 
+SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
 try:
     from typing import Protocol
 except ImportError:
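
The new `SelectorWaitStates` alias matches the four states Playwright's `wait_for_selector` accepts. Typing the parameter as a `Literal` instead of a bare `str` lets a type checker reject invalid states before a browser ever launches; a minimal standalone sketch of the idea (the `wait_for` helper below is hypothetical, not scrapling code):

    from typing import Literal

    SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

    def wait_for(selector: str, state: SelectorWaitStates = "attached") -> None:
        # Stand-in for page.wait_for_selector(selector, state=state)
        print(f"waiting until {selector!r} is {state}")

    wait_for("#content", "visible")   # OK
    wait_for("#content", "visibel")   # mypy/pyright error: not a valid literal value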

{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/camo.py

@@ -3,7 +3,7 @@ from camoufox.async_api import AsyncCamoufox
 from camoufox.sync_api import Camoufox
 
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         async_intercept_route,
@@ -18,7 +18,7 @@ class CamoufoxEngine:
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
             geoip: Optional[bool] = False,
             adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ class CamoufoxEngine:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with Camoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -100,13 +108,15 @@ class CamoufoxEngine:
             page = browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 page.route("**/*", intercept_route)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
 
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -123,21 +133,24 @@ class CamoufoxEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=page.content(),
-                body=page.content().encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=res.all_headers(),
-                request_headers=res.request.all_headers(),
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -151,6 +164,14 @@ class CamoufoxEngine:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with AsyncCamoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -167,13 +188,15 @@ class CamoufoxEngine:
             page = await browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 await page.route("**/*", async_intercept_route)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
 
-            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -190,21 +213,24 @@ class CamoufoxEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=await page.content(),
-                body=(await page.content()).encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await res.all_headers(),
-                request_headers=await res.request.all_headers(),
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
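
Both engine methods now track the last top-level document response seen on the page instead of relying only on the object returned by `page.goto()`, which can go stale once the page navigates again (a JS redirect, a `page_action` click, or a `wait_selector` that resolves after navigation). A minimal sketch of the same pattern with plain sync Playwright, assuming a Chromium install and an illustrative URL:

    from playwright.sync_api import sync_playwright

    final_response = None

    def handle_response(finished_response):
        # Keep only top-level document responses, mirroring the engines' filter
        global final_response
        if finished_response.request.resource_type == "document":
            final_response = finished_response

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.on("response", handle_response)
        first_response = page.goto("https://example.com")  # illustrative URL
        page.wait_for_load_state("domcontentloaded")
        # If the page navigated again after goto(), final_response points at the
        # document actually displayed; fall back to goto()'s response otherwise.
        chosen = final_response or first_response
        print(chosen.url, chosen.status, len(chosen.body()))
        browser.close()

This is presumably also why `body` is now built from `final_response.body()` rather than re-encoding `page.content()`: the raw bytes of the final document survive instead of being round-tripped through the serialized DOM.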

{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/pw.py

@@ -1,6 +1,7 @@
 import json
 
-from scrapling.core._types import Callable, Dict, Optional, Union
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ class PlaywrightEngine:
             page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
-            wait_selector_state: Optional[str] = 'attached',
+            wait_selector_state: SelectorWaitStates = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
             hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
 
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -212,6 +222,8 @@ class PlaywrightEngine:
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@ class PlaywrightEngine:
             for script in self.__stealth_scripts():
                 page.add_init_script(path=script)
 
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -240,21 +252,24 @@ class PlaywrightEngine:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=page.content(),
-                body=page.content().encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=res.all_headers(),
-                request_headers=res.request.all_headers(),
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -266,12 +281,21 @@ class PlaywrightEngine:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.async_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.async_api import async_playwright
         else:
             from rebrowser_playwright.async_api import async_playwright
 
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with async_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -285,6 +309,8 @@ class PlaywrightEngine:
             page = await context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +322,7 @@ class PlaywrightEngine:
             for script in self.__stealth_scripts():
                 await page.add_init_script(path=script)
 
-            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -313,21 +339,24 @@ class PlaywrightEngine:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=await page.content(),
-                body=(await page.content()).encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await res.all_headers(),
-                request_headers=await res.request.all_headers(),
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
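
The async path makes the same change; the only wrinkle is that `Response.body()` and `all_headers()` are coroutines on Playwright's async API and must be awaited. A compact sketch under the same assumptions (Chromium installed, illustrative URL):

    import asyncio
    from playwright.async_api import async_playwright

    async def main():
        final_response = None

        async def handle_response(finished_response):
            # Async handlers are scheduled as tasks by Playwright's event emitter
            nonlocal final_response
            if finished_response.request.resource_type == "document":
                final_response = finished_response

        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            page.on("response", handle_response)
            first_response = await page.goto("https://example.com")  # illustrative URL
            await page.wait_for_load_state("domcontentloaded")
            chosen = final_response or first_response
            body = await chosen.body()  # must be awaited, unlike the sync API
            print(chosen.url, chosen.status, len(body))
            await browser.close()

    asyncio.run(main())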

{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/engines/toolbelt/custom.py

@@ -84,8 +84,6 @@ class ResponseEncoding:
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    _is_response_result_logged = False  # Class-level flag, initialized to False
-
     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
                  encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ class Response(Adaptor):
         # For back-ward compatibility
         self.adaptor = self
         # For easier debugging while working from a Python shell
-        if not Response._is_response_result_logged:
-            log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-            Response._is_response_result_logged = True
+        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
 
     # def __repr__(self):
     #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
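
With the class-level flag gone, every `Response` logs its `Fetched (...)` line rather than only the first one per process. Anyone who preferred the quieter behavior can lower the library's log level instead; a one-line sketch, assuming scrapling's `log` object is a standard logger registered under the "scrapling" name:

    import logging

    # Assumption: scrapling.core.utils.log is logging.getLogger("scrapling")
    logging.getLogger("scrapling").setLevel(logging.WARNING)  # hide per-fetch INFO lines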

{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/fetchers.py

@@ -1,5 +1,5 @@
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
                                check_if_engine_usable)
 from scrapling.engines.toolbelt import BaseFetcher, Response
@@ -176,8 +176,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -234,8 +234,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-            os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
+            wait_selector_state: SelectorWaitStates = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True, geoip: Optional[bool] = False,
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -308,7 +308,7 @@ class PlayWrightFetcher(BaseFetcher):
     def fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
@@ -368,7 +368,7 @@ class PlayWrightFetcher(BaseFetcher):
     async def async_fetch(
             self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+            page_action: Optional[Callable] = None, wait_selector: Optional[str] = None, wait_selector_state: SelectorWaitStates = 'attached',
             hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
             proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
             stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
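
At the fetcher level these are annotation-only changes, but they surface immediately in editors and type checkers. A hedged usage sketch, assuming the ready-made instances that scrapling's `defaults` module documents (URL illustrative):

    from scrapling.defaults import StealthyFetcher

    # wait_selector_state must now be one of: "attached", "detached", "hidden", "visible"
    page = StealthyFetcher.fetch(
        "https://example.com",          # illustrative URL
        wait_selector="h1",
        wait_selector_state="visible",  # any other string is rejected by the type checker
    )
    print(page.status)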

{scrapling-0.2.9 → scrapling-0.2.91}/scrapling/parser.py

@@ -155,7 +155,7 @@ class Adaptor(SelectorsGeneration):
         else:
             if issubclass(type(element), html.HtmlMixin):
 
-                return self.__class__(
+                return Adaptor(
                     root=element,
                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
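
`Response` subclasses `Adaptor` with a stricter constructor (it requires `status`, `reason`, and friends, per the custom.py hunk above), so building child elements through `self.__class__` breaks whenever `self` is a `Response`. Pinning the call to `Adaptor` avoids that. A toy reproduction of the failure mode, using hypothetical minimal classes rather than scrapling's real ones:

    class Adaptor:
        def __init__(self, root):
            self.root = root

        def child(self):
            # 0.2.9 behavior: self.__class__ re-invokes the *subclass* constructor
            return self.__class__(root="child-node")

    class Response(Adaptor):
        def __init__(self, root, status):  # extra required argument, like scrapling's Response
            super().__init__(root)
            self.status = status

    try:
        Response(root="document", status=200).child()
    except TypeError as err:
        print(err)  # __init__() missing 1 required positional argument: 'status'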

{scrapling-0.2.9 → scrapling-0.2.91/scrapling.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.9
+Version: 0.2.91
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -38,7 +37,7 @@ Requires-Dist: cssselect>=1.2
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
-Requires-Dist: httpx[brotli,zstd]
+Requires-Dist: httpx[brotli,socks,zstd]
 Requires-Dist: playwright>=1.49.1
 Requires-Dist: rebrowser-playwright>=1.49.1
 Requires-Dist: camoufox[geoip]>=0.4.9

{scrapling-0.2.9 → scrapling-0.2.91}/scrapling.egg-info/requires.txt

@@ -4,7 +4,7 @@ cssselect>=1.2
 w3lib
 orjson>=3
 tldextract
-httpx[brotli,zstd]
+httpx[brotli,socks,zstd]
 playwright>=1.49.1
 rebrowser-playwright>=1.49.1
 camoufox[geoip]>=0.4.9
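
The new `socks` extra pulls in httpx's SOCKS backend (the `socksio` package), which is presumably what lets proxy URLs with a `socks5://` scheme work in the httpx-based fetcher. A minimal httpx-level sketch with an illustrative proxy address:

    import httpx

    # Requires httpx[socks]; newer httpx accepts proxy=, older versions use proxies=
    with httpx.Client(proxy="socks5://127.0.0.1:9050") as client:  # illustrative address
        response = client.get("https://example.com")
        print(response.status_code)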

{scrapling-0.2.9 → scrapling-0.2.91}/setup.cfg

@@ -1,6 +1,6 @@
 [metadata]
 name = scrapling
-version = 0.2.9
+version = 0.2.91
 author = Karim Shoair
 author_email = karim.shoair@pm.me
 description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.

{scrapling-0.2.9 → scrapling-0.2.91}/setup.py

@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="scrapling",
-    version="0.2.9",
+    version="0.2.91",
     description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
     simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
     impressive speed improvements over many popular scraping tools.""",
@@ -37,7 +37,6 @@ setup(
         "Topic :: Software Development :: Libraries :: Python Modules",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3 :: Only",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
@@ -54,7 +53,7 @@ setup(
         "w3lib",
         "orjson>=3",
         "tldextract",
-        'httpx[brotli,zstd]',
+        'httpx[brotli,zstd, socks]',
         'playwright>=1.49.1',
         'rebrowser-playwright>=1.49.1',
         'camoufox[geoip]>=0.4.9'