yad2-scraper 0.3.0__tar.gz → 0.4.0__tar.gz

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: yad2-scraper
-Version: 0.3.0
+Version: 0.4.0
 Summary: Scrape Yad2 in Python.
 License: LICENSE
 Author: dav ost
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "yad2-scraper"
-version = "0.3.0"
+version = "0.4.0"
 description = "Scrape Yad2 in Python."
 authors = ["dav ost <davidost2003@gmail.com>"]
 license = "LICENSE"
@@ -19,7 +19,7 @@ ALLOW_REQUEST_REDIRECTS = True
 VERIFY_REQUEST_SSL = True

 ANTIBOT_CONTENT_IDENTIFIER = b"Are you for real" # robot-captcha
-YAD2_CONTENT_IDENTIFIER = b"https://www.yad2.co.il/"
+PAGE_CONTENT_IDENTIFIER = b"https://www.yad2.co.il/"

 FIRST_PAGE_NUMBER = 1
 NOT_MENTIONED_PRICE_RANGE = 0, 0
@@ -0,0 +1,32 @@
+import httpx
+from typing import List, Union
+
+
+class ResponseError(Exception):
+    def __init__(self, msg: str, request: httpx.Request, response: httpx.Response):
+        super().__init__(msg)
+        self.request = request
+        self.response = response
+
+
+class AntiBotDetectedError(ResponseError):
+    pass
+
+
+class UnexpectedContentError(ResponseError):
+    pass
+
+
+class MaxAttemptsExceededError(Exception):
+    def __init__(self, msg: str, max_attempts: int, errors: List[BaseException] = None):
+        super().__init__(msg)
+        self.max_attempts = max_attempts
+        self.errors = errors
+
+
+class MaxRequestAttemptsExceededError(MaxAttemptsExceededError):
+    def __init__(self, method: str, url: str, max_attempts: int, errors: List[Union[httpx.HTTPError, ResponseError]]):
+        msg = f"All {max_attempts} attempts for {method} request to '{url}' have failed"
+        super().__init__(msg, max_attempts, errors)
+        self.method = method
+        self.url = url
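For orientation (not part of the package), a minimal sketch of how calling code might consume the new ResponseError subclasses; the exception names and attributes come from the file above, while the helper function, scraper object, and URL handling are assumptions:

from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError

def describe_failure(scraper, url: str) -> None:
    # Hypothetical helper; 'scraper' is assumed to be a Yad2Scraper instance.
    try:
        scraper.get(url)
    except AntiBotDetectedError as error:
        # ResponseError subclasses keep the original httpx request/response objects.
        print(f"Anti-bot page at {error.request.url} (HTTP {error.response.status_code})")
    except UnexpectedContentError as error:
        print(f"Response from {error.request.url} did not look like a Yad2 page")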
@@ -1,25 +1,25 @@
 import logging
 import httpx
 import time
-import random
-from typing import Optional, Dict, Any, Tuple, Union, Type, TypeVar
+from fake_useragent import FakeUserAgent
+from typing import Optional, Dict, Any, Callable, Union, Type, TypeVar

 from yad2_scraper.category import Yad2Category
 from yad2_scraper.query import QueryFilters
-from yad2_scraper.utils import get_random_user_agent
-from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError, MaxRequestRetriesExceededError
+from yad2_scraper.exceptions import AntiBotDetectedError, UnexpectedContentError, MaxRequestAttemptsExceededError
 from yad2_scraper.constants import (
     DEFAULT_REQUEST_HEADERS,
     ALLOW_REQUEST_REDIRECTS,
     VERIFY_REQUEST_SSL,
     ANTIBOT_CONTENT_IDENTIFIER,
-    YAD2_CONTENT_IDENTIFIER
+    PAGE_CONTENT_IDENTIFIER
 )

 Category = TypeVar("Category", bound=Yad2Category)
-DelayRange = Tuple[float, float]
+WaitStrategy = Callable[[int], Optional[float]]
 QueryParamTypes = Union[QueryFilters, Dict[str, Any]]

+fua = FakeUserAgent()
 logger = logging.getLogger(__name__)

@@ -28,9 +28,9 @@ class Yad2Scraper:
         self,
         client: Optional[httpx.Client] = None,
         request_defaults: Optional[Dict[str, Any]] = None,
-        randomize_user_agent: bool = False,
-        random_delay_range: Optional[DelayRange] = None,
-        max_retries: int = 0
+        randomize_user_agent: bool = True,
+        wait_strategy: Optional[WaitStrategy] = None,
+        max_request_attempts: int = 1
     ):
         self.client = client or httpx.Client(
             headers=DEFAULT_REQUEST_HEADERS,
@@ -39,16 +39,25 @@ class Yad2Scraper:
         )
         self.request_defaults = request_defaults or {}
         self.randomize_user_agent = randomize_user_agent
-        self.random_delay_range = random_delay_range
-        self.max_retries = max_retries
+        self.wait_strategy = wait_strategy
+        self.max_request_attempts = max_request_attempts

         logger.debug(f"Scraper initialized with client: {self.client}")

+    def set_user_agent(self, user_agent: str) -> None:
+        self.client.headers["User-Agent"] = user_agent
+        logger.debug(f"User-Agent client header set to: '{user_agent}'")
+
+    def set_no_script(self, no_script: bool) -> None:
+        value = "1" if no_script else "0"
+        self.client.cookies.set("noscript", value)
+        logger.debug(f"NoScript (noscript) client cookie set to: '{value}'")
+
     def fetch_category(
         self,
         url: str,
-        params: Optional[QueryParamTypes] = None,
-        category_type: Type[Category] = Yad2Category
+        category_type: Type[Category] = Yad2Category,
+        params: Optional[QueryParamTypes] = None
     ) -> Category:
         logger.debug(f"Fetching category from URL: '{url}'")
         response = self.get(url, params)
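Note that fetch_category now takes category_type before params, so positional callers must swap the last two arguments. A hedged sketch of the new call order; the import path, URL, and filter values are placeholders:

from yad2_scraper import Yad2Scraper  # assumed import path
from yad2_scraper.category import Yad2Category

scraper = Yad2Scraper()
# 0.3.0 order: fetch_category(url, params, category_type)
# 0.4.0 order: category_type is now the second positional parameter.
category = scraper.fetch_category(
    "https://www.yad2.co.il/vehicles/cars",  # placeholder URL
    Yad2Category,
    params={"page": 1},
)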
@@ -59,72 +68,48 @@ class Yad2Scraper:
         return self.request("GET", url, params=params)

     def request(self, method: str, url: str, params: Optional[QueryParamTypes] = None) -> httpx.Response:
+        if not isinstance(self.max_request_attempts, int):
+            raise TypeError(f"max_request_attempts must be of type 'int', but got {type(self.max_request_attempts)}")
+
+        if self.max_request_attempts <= 0:
+            raise ValueError(f"max_request_attempts must be a positive integer, but got {self.max_request_attempts}")
+
         request_options = self._prepare_request_options(params=params)
+        error_list = []

-        try:
-            return self._send_request(method, url, request_options)
-        except Exception as error:
-            return self._handle_request_error(method, url, request_options, error)
+        for attempt in range(1, self.max_request_attempts + 1):
+            try:
+                return self._send_request(method, url, request_options, attempt)
+            except Exception as error:
+                logger.error(f"{method} request to '{url}' failed {self._format_attempt_info(attempt)}: {error}")
+                error_list.append(error)

-    def set_user_agent(self, user_agent: str) -> None:
-        self.client.headers["User-Agent"] = user_agent
-        logger.debug(f"User-Agent client header set to: '{user_agent}'")
+        if self.max_request_attempts == 1:
+            raise error_list[0] # only one error exists, raise it

-    def set_no_script(self, no_script: bool) -> None:
-        value = "1" if no_script else "0"
-        self.client.cookies.set("noscript", value)
-        logger.debug(f"noscript client cookie set to: '{value}'")
+        max_attempts_error = MaxRequestAttemptsExceededError(method, url, self.max_request_attempts, error_list)
+        logger.error(str(max_attempts_error))
+        raise max_attempts_error from error_list[-1] # multiple errors exist, raise from the last one

     def close(self) -> None:
         logger.debug("Closing scraper client")
         self.client.close()
         logger.info("Scraper client closed")

-    def _send_request(self, method: str, url: str, request_options: Dict[str, Any]) -> httpx.Response:
+    def _send_request(self, method: str, url: str, request_options: Dict[str, Any], attempt: int) -> httpx.Response:
         if self.randomize_user_agent:
             self._set_random_user_agent(request_options)

-        if self.random_delay_range:
-            self._apply_request_delay()
+        if self.wait_strategy:
+            self._apply_wait_strategy(attempt)

-        logger.info(f"Making {method} request to URL: '{url}'")
+        logger.info(f"Sending {method} request to URL: '{url}' {self._format_attempt_info(attempt)}")
         response = self.client.request(method, url, **request_options)
-        logger.debug(f"Received response with status code: {response.status_code}")
+        logger.debug(f"Received response {response.status_code} from '{url}' {self._format_attempt_info(attempt)}")
         self._validate_response(response)

         return response

-    def _handle_request_error(
-        self,
-        method: str,
-        url: str,
-        request_options: Dict[str, Any],
-        error: Exception
-    ) -> httpx.Response:
-        logger.error(f"{method} request to '{url}' failed: {error}")
-
-        if self.max_retries == 0:
-            raise error
-
-        return self._retry_request(method, url, request_options)
-
-    def _retry_request(self, method: str, url: str, request_options: Dict[str, Any]) -> httpx.Response:
-        logger.info(f"Retrying {method} request to '{url}' (max retries: {self.max_retries})")
-
-        errors = []
-
-        for retry_attempt in range(1, self.max_retries + 1):
-            try:
-                logger.debug(f"Retry attempt {retry_attempt}/{self.max_retries}")
-                return self._send_request(method, url, request_options)
-            except Exception as error:
-                logger.warning(f"Retry attempt {retry_attempt} failed: {error}")
-                errors.append(error)
-
-        error_to_raise = MaxRequestRetriesExceededError(method, url, self.max_retries, errors)
-        logger.error(str(error_to_raise))
-        raise error_to_raise from errors[-1]
-
     def _prepare_request_options(self, params: Optional[QueryParamTypes] = None) -> Dict[str, Any]:
         logger.debug("Preparing request options from defaults")
         request_options = self.request_defaults.copy()
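The retry loop above replaces the old _handle_request_error/_retry_request pair: every attempt, including the first, now goes through the same path, and max_request_attempts counts total attempts rather than retries. A hedged sketch of how a caller might opt in; the import path and URL are placeholders:

from yad2_scraper import Yad2Scraper  # assumed import path
from yad2_scraper.exceptions import MaxRequestAttemptsExceededError

scraper = Yad2Scraper(max_request_attempts=3)

try:
    response = scraper.get("https://www.yad2.co.il/")  # placeholder URL
except MaxRequestAttemptsExceededError as error:
    # Raised only when more than one attempt was configured and all of them failed;
    # the individual failures are preserved in order on .errors.
    print(f"{error.method} {error.url} failed after {error.max_attempts} attempts: {error.errors}")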
@@ -135,17 +120,20 @@ class Yad2Scraper:

         return request_options

-    def _apply_request_delay(self):
-        delay = random.uniform(*self.random_delay_range)
-        logger.debug(f"Applying request delay of {delay:.2f} seconds")
-        time.sleep(delay)
-
     @staticmethod
     def _set_random_user_agent(request_options: Dict[str, str]):
-        user_agent = get_random_user_agent()
+        user_agent = fua.random
         request_options.setdefault("headers", {})["User-Agent"] = user_agent
         logger.debug(f"Updated request options with random User-Agent header: '{user_agent}'")

+    def _apply_wait_strategy(self, attempt: int):
+        wait_time = self.wait_strategy(attempt)
+        if not wait_time:
+            return
+
+        logger.debug(f"Waiting {wait_time:.2f} seconds before request {self._format_attempt_info(attempt)}")
+        time.sleep(wait_time)
+
     @staticmethod
     def _validate_response(response: httpx.Response):
         response.raise_for_status()
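The old fixed random_delay_range is replaced by the WaitStrategy callable (see the type alias earlier in the diff), which receives the attempt number and returns an optional sleep in seconds; returning None or 0 skips the sleep, matching the `if not wait_time` guard above. A minimal sketch of such a strategy, with arbitrary backoff numbers:

import random
from typing import Optional

def exponential_backoff(attempt: int) -> Optional[float]:
    # Illustrative wait strategy: no wait before the first attempt,
    # then roughly 1s, 2s, 4s, ... with a little jitter between retries.
    if attempt <= 1:
        return None
    return 2 ** (attempt - 2) + random.uniform(0, 0.5)

A strategy like this would be passed as Yad2Scraper(wait_strategy=exponential_backoff, max_request_attempts=4).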
@@ -156,15 +144,18 @@ class Yad2Scraper:
                 request=response.request,
                 response=response
             )
-        if YAD2_CONTENT_IDENTIFIER not in response.content:
+        if response.request.method == "GET" and PAGE_CONTENT_IDENTIFIER not in response.content:
             raise UnexpectedContentError(
-                "The response does not contain yad2 content",
+                "The GET response does not contain yad2 related content",
                 request=response.request,
                 response=response
             )

         logger.debug("Response validation succeeded")

+    def _format_attempt_info(self, attempt: int) -> str:
+        return f"(attempt {attempt}/{self.max_request_attempts})"
+
     def __enter__(self):
         logger.debug("Entering scraper context")
         return self
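Since __enter__ is still defined and close() remains available, the scraper presumably still works as a context manager. A hedged end-to-end sketch, assuming a matching __exit__ outside the visible hunk and using placeholder values:

from yad2_scraper import Yad2Scraper  # assumed import path

with Yad2Scraper(randomize_user_agent=True, max_request_attempts=2) as scraper:
    scraper.set_no_script(True)  # sets the 'noscript' client cookie to "1"
    response = scraper.get("https://www.yad2.co.il/")  # placeholder URL
    print(response.status_code)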
@@ -1,13 +1,6 @@
-from fake_useragent import FakeUserAgent
 from bs4 import BeautifulSoup, Tag
 from typing import Union, List

-fua = FakeUserAgent()
-
-
-def get_random_user_agent() -> str:
-    return fua.random
-

 def join_url(url: str, path: str) -> str:
     return url.rstrip("/") + "/" + path.lstrip("/")
@@ -1,31 +0,0 @@
-import httpx
-from typing import List
-
-
-class ResponseError(httpx.HTTPStatusError):
-    # This adds the request/response objects to the error
-    pass
-
-
-class AntiBotDetectedError(ResponseError):
-    pass
-
-
-class UnexpectedContentError(ResponseError):
-    pass
-
-
-class MaxRetriesExceededError(Exception):
-    def __init__(self, msg: str, errors: List[Exception] = None):
-        super().__init__(msg)
-        self.errors = errors
-
-
-class MaxRequestRetriesExceededError(MaxRetriesExceededError):
-    def __init__(self, method: str, url: str, max_retries: int, errors: List[Exception] = None):
-        self.method = method
-        self.url = url
-        self.max_retries = max_retries
-
-        msg = f"All {self.max_retries} retry attempts for {self.method} request to '{self.url}' have failed"
-        super().__init__(msg, errors)