fraudcrawler 0.4.3__py3-none-any.whl → 0.4.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



fraudcrawler/base/base.py CHANGED
@@ -63,6 +63,14 @@ class Host(BaseModel):
         return [cls._normalize_domain(dom.strip()) for dom in val]
 
 
+class ClassificationResult(BaseModel):
+    """Model for classification results."""
+
+    result: int
+    input_tokens: int
+    output_tokens: int
+
+
 class Location(BaseModel):
     """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
 
@@ -137,6 +145,9 @@ class ProductItem(BaseModel):
     # Processor parameters are set dynamic so we must allow extra fields
     classifications: Dict[str, int] = Field(default_factory=dict)
 
+    # Usage parameters
+    usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
+
     # Filtering parameters
     filtered: bool = False
     filtered_at_stage: str | None = None
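Taken together with the orchestrator change below, each configured prompt now records both its class label and its token usage on the product. A minimal sketch of the resulting shape, assuming a prompt named "relevance" (the name and numbers are illustrative, not taken from the package):

# Illustrative only: state of a ProductItem after one prompt named "relevance" has run.
product.classifications  # {"relevance": 1}
product.usage            # {"relevance": {"input_tokens": 812, "output_tokens": 1}}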
fraudcrawler/base/orchestrator.py CHANGED
@@ -7,10 +7,6 @@ from bs4 import BeautifulSoup
 
 from fraudcrawler.settings import (
     PROCESSOR_DEFAULT_MODEL,
-    PROCESSOR_DEFAULT_IF_MISSING,
-    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
-    MAX_RETRIES,
-    RETRY_DELAY,
 )
 from fraudcrawler.settings import (
     DEFAULT_N_SERP_WKRS,
@@ -61,9 +57,6 @@ class Orchestrator(ABC):
         zyteapi_key: str,
         openaiapi_key: str,
         openai_model: str = PROCESSOR_DEFAULT_MODEL,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
-        default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
         n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
         n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
@@ -77,25 +70,18 @@ class Orchestrator(ABC):
             zyteapi_key: The API key for Zyte API.
             openaiapi_key: The API key for OpenAI.
             openai_model: The model to use for the processing (optional).
-            max_retries: Maximum number of retries for API calls (optional).
-            retry_delay: Delay between retries in seconds (optional).
             n_serp_wkrs: Number of async workers for serp (optional).
             n_zyte_wkrs: Number of async workers for zyte (optional).
             n_proc_wkrs: Number of async workers for the processor (optional).
         """
         # Setup the clients
-        self._serpapi = SerpApi(
-            api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
-        )
+        self._serpapi = SerpApi(api_key=serpapi_key)
         self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
         self._url_collector = URLCollector()
-        self._zyteapi = ZyteApi(
-            api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
-        )
+        self._zyteapi = ZyteApi(api_key=zyteapi_key)
         self._processor = Processor(
             api_key=openaiapi_key,
             model=openai_model,
-            default_if_missing=default_if_missing,
         )
 
         # Setup the async framework
@@ -249,7 +235,6 @@ class Orchestrator(ABC):
         """
 
         # Process the products
-
         while True:
             product = await queue_in.get()
             if product is None:
@@ -259,31 +244,23 @@ class Orchestrator(ABC):
 
             if not product.filtered:
                 try:
-                    url = product.url
                     # Run all the configured prompts
                     for prompt in prompts:
-                        # Dynamically build product_details string
-                        details = []
-                        for field in prompt.product_item_fields:
-                            value = getattr(product, field, None)
-                            if value is not None:
-                                details.append(
-                                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
-                                        field_name=field, field_value=value
-                                    )
-                                )
-                        product_details = "\n\n".join(details)
-                        logger.debug(
-                            f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
-                        )
                         classification = await self._processor.classify(
+                            product=product,
                             prompt=prompt,
-                            url=url,
-                            product_details=product_details,
                         )
-                        product.classifications[prompt.name] = classification
+                        product.classifications[prompt.name] = int(
+                            classification.result
+                        )
+                        product.usage[prompt.name] = {
+                            "input_tokens": classification.input_tokens,
+                            "output_tokens": classification.output_tokens,
+                        }
                 except Exception as e:
-                    logger.warning(f"Error processing product: {e}.")
+                    logger.warning(
+                        f"Error processing product with url={product.url}: {e}."
+                    )
 
             await queue_out.put(product)
             queue_in.task_done()
fraudcrawler/base/retry.py ADDED
@@ -0,0 +1,37 @@
+from aiohttp.web_exceptions import HTTPException
+from tenacity import (
+    AsyncRetrying,
+    retry_if_exception,
+    stop_after_attempt,
+    wait_exponential_jitter,
+)
+
+from fraudcrawler.settings import (
+    RETRY_STOP_AFTER_ATTEMPT,
+    RETRY_INITIAL_DELAY,
+    RETRY_MAX_DELAY,
+    RETRY_EXP_BASE,
+    RETRY_JITTER,
+    RETRY_SKIP_IF_CODE,
+)
+
+
+def _is_retryable_exception(err: BaseException) -> bool:
+    if isinstance(err, HTTPException) and err.status_code in RETRY_SKIP_IF_CODE:
+        return False
+    return True
+
+
+def get_async_retry() -> AsyncRetrying:
+    """returns the retry configuration for async operations."""
+    return AsyncRetrying(
+        retry=retry_if_exception(_is_retryable_exception),
+        stop=stop_after_attempt(RETRY_STOP_AFTER_ATTEMPT),
+        wait=wait_exponential_jitter(
+            initial=RETRY_INITIAL_DELAY,
+            max=RETRY_MAX_DELAY,
+            exp_base=RETRY_EXP_BASE,
+            jitter=RETRY_JITTER,
+        ),
+        reraise=True,
+    )
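The helper above only builds the AsyncRetrying object; callers attach their own logging callbacks and drive the retries themselves. A minimal consumption sketch mirroring the pattern used in serp.py and zyte.py below (`do_request` is a placeholder, not a function from the package):

# Sketch, not package code: wrap an arbitrary awaitable in the shared retry policy.
retry = get_async_retry()
retry.before = lambda retry_state: None        # optional context-aware logging hook
retry.before_sleep = lambda retry_state: None  # optional hook run before backing off
async for attempt in retry:
    with attempt:
        response = await do_request()          # re-attempted unless the error is non-retryable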
fraudcrawler/processing/processor.py CHANGED
@@ -1,11 +1,15 @@
 import logging
 
 from openai import AsyncOpenAI
+from tenacity import RetryCallState
 
-from fraudcrawler.base.base import Prompt
+from fraudcrawler.base.base import ProductItem, Prompt, ClassificationResult
+from fraudcrawler.base.retry import get_async_retry
 from fraudcrawler.settings import (
+    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
     PROCESSOR_USER_PROMPT_TEMPLATE,
     PROCESSOR_DEFAULT_IF_MISSING,
+    PROCESSOR_EMPTY_TOKEN_COUNT,
 )
 
 
@@ -20,6 +24,7 @@ class Processor:
         api_key: str,
         model: str,
         default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
+        empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
     ):
         """Initializes the Processor.
 
@@ -27,17 +32,66 @@ class Processor:
             api_key: The OpenAI API key.
             model: The OpenAI model to use.
             default_if_missing: The default classification to return if error occurs.
+            empty_token_count: The default value to return as tokensif the classification is empty.
         """
         self._client = AsyncOpenAI(api_key=api_key)
         self._model = model
-        self._default_if_missing = default_if_missing
+        self._error_response = ClassificationResult(
+            result=default_if_missing,
+            input_tokens=empty_token_count,
+            output_tokens=empty_token_count,
+        )
+
+    @staticmethod
+    def _get_product_details(product: ProductItem, prompt: Prompt) -> str:
+        """Extracts product details based on the prompt configuration.
+
+        Args:
+            product: The product item to extract details from.
+            prompt: The prompt configuration containing field names.
+        """
+        details = []
+        for field in prompt.product_item_fields:
+            if value := getattr(product, field, None):
+                details.append(
+                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
+                        field_name=field, field_value=value
+                    )
+                )
+            else:
+                logger.error(
+                    f'Field "{field}" is missing in ProductItem with url="{product.url}"'
+                )
+        return "\n\n".join(details)
+
+    @staticmethod
+    def _log_before(url: str, prompt: Prompt, retry_state: RetryCallState) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(
+        url: str, prompt: Prompt, retry_state: RetryCallState
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
 
     async def _call_openai_api(
         self,
         system_prompt: str,
         user_prompt: str,
         **kwargs,
-    ) -> str:
+    ) -> ClassificationResult:
         """Calls the OpenAI API with the given user prompt."""
         response = await self._client.chat.completions.create(
             model=self._model,
@@ -50,15 +104,35 @@ class Processor:
         content = response.choices[0].message.content
         if not content:
             raise ValueError("Empty response from OpenAI API")
-        return content
 
-    async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
-        """A generic classification method that classifies a product based on a prompt object.
+        # Convert the content to an integer
+        try:
+            content = int(content.strip())
+        except Exception as e:
+            msg = f"Failed to convert OpenAI response '{content}' to integer: {e}"
+            logger.error(msg)
+            raise ValueError(msg)
+
+        # For tracking consumption we alre return the tokens used
+        classification = ClassificationResult(
+            result=content,
+            input_tokens=response.usage.prompt_tokens,
+            output_tokens=response.usage.completion_tokens,
+        )
+
+        return classification
+
+    async def classify(
+        self,
+        product: ProductItem,
+        prompt: Prompt,
+    ) -> ClassificationResult:
+        """A generic classification method that classifies a product based on a prompt object and returns
+        the classification, input tokens, and output tokens.
 
         Args:
-            prompt: A dictionary with keys "system_prompt", etc.
-            url: Product URL (often used in the user_prompt).
-            product_details: String with product details, formatted per prompt.product_item_fields.
+            product: The product item to classify.
+            prompt: The prompt to use for classification.
 
         Note:
             This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
@@ -66,12 +140,15 @@ class Processor:
             - an error occurs during the API call
             - if the response isn't in allowed_classes.
         """
-        # If required fields are missing, return the prompt's default fallback if provided.
+        url = product.url
+
+        # Form the product details from the ProductItem
+        product_details = self._get_product_details(product=product, prompt=prompt)
         if not product_details:
             logger.warning("Missing required product_details for classification.")
-            return self._default_if_missing
+            return self._error_response
 
-        # Substitute placeholders in user_prompt with the relevant arguments
+        # Prepare the user prompt
        user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
             product_details=product_details,
         )
@@ -79,24 +156,35 @@ class Processor:
         # Call the OpenAI API
         try:
             logger.debug(
-                f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
+                f"Classifying product with url={url} using prompt={prompt.name} and user_prompt={user_prompt}."
+            )
+            # Perform the request and retry if necessary. There is some context aware logging
+            # - `before`: before the request is made (or before retrying)
+            # - `before_sleep`: if the request fails before sleeping
+            retry = get_async_retry()
+            retry.before = lambda retry_state: self._log_before(
+                url=url, prompt=prompt, retry_state=retry_state
             )
-            content = await self._call_openai_api(
-                system_prompt=prompt.system_prompt,
-                user_prompt=user_prompt,
-                max_tokens=1,
+            retry.before_sleep = lambda retry_state: self._log_before_sleep(
+                url=url, prompt=prompt, retry_state=retry_state
             )
-            classification = int(content.strip())
+            async for attempt in retry:
+                with attempt:
+                    classification = await self._call_openai_api(
+                        system_prompt=prompt.system_prompt,
+                        user_prompt=user_prompt,
+                        max_tokens=1,
+                    )
 
             # Enforce that the classification is in the allowed classes
-            if classification not in prompt.allowed_classes:
+            if classification.result not in prompt.allowed_classes:
                 logger.warning(
-                    f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
+                    f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
                 )
-                return self._default_if_missing
+                return self._error_response
 
             logger.info(
-                f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
+                f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
             )
             return classification
 
@@ -104,4 +192,4 @@ class Processor:
             logger.error(
                 f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
             )
-            return self._default_if_missing
+            return self._error_response
fraudcrawler/scraping/enrich.py CHANGED
@@ -4,8 +4,11 @@ import logging
 from pydantic import BaseModel
 from typing import Dict, List, Iterator
 
+from tenacity import RetryCallState
+
 from fraudcrawler.settings import ENRICHMENT_DEFAULT_LIMIT
 from fraudcrawler.base.base import Location, Language, AsyncClient
+from fraudcrawler.base.retry import get_async_retry
 
 
 logger = logging.getLogger(__name__)
@@ -22,8 +25,6 @@ class Enricher(AsyncClient):
     """A client to interact with the DataForSEO API for enhancing searches (producing alternative search_terms)."""
 
     _auth_encoding = "ascii"
-    _max_retries = 3
-    _retry_delay = 2
     _base_endpoint = "https://api.dataforseo.com"
     _suggestions_endpoint = "/v3/dataforseo_labs/google/keyword_suggestions/live"
     _keywords_endpoint = "/v3/dataforseo_labs/google/related_keywords/live"
@@ -44,6 +45,28 @@ class Enricher(AsyncClient):
         "Content-Encoding": "gzip",
     }
 
+    @staticmethod
+    def _log_before(search_term: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'DataForSEO suggested search with search="{search_term}" (attempt {retry_state.attempt_number}).'
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(search_term: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} DataForSEO suggested search with search_term="{search_term}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before_sleep.")
+
     @staticmethod
     def _extract_items_from_data(data: dict) -> Iterator[dict]:
         """Extracts the items from the DataForSEO response.
@@ -126,7 +149,7 @@ class Enricher(AsyncClient):
             limit: The upper limit of suggestions to get.
         """
 
-        # Data must be a list of dictionaries setting a number of search tasks; here we only have one task.
+        # Data must be a list of dictionaries, setting a number of search tasks; here we only have one task.
         data = [
             {
                 "keyword": search_term,
@@ -137,23 +160,25 @@ class Enricher(AsyncClient):
                 "include_seed_keyword": True,
             }
         ]
-        logger.debug(
-            f'DataForSEO search for suggested keywords with search_term="{search_term}".'
+        url = f"{self._base_endpoint}{self._suggestions_endpoint}"
+        logger.debug(f'DataForSEO url="{url}" with data="{data}".')
+
+        # Perform the request and retry if necessary. There is some context aware logging
+        # - `before`: before the request is made (or before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            search_term=search_term, retry_state=retry_state
         )
-        try:
-            url = f"{self._base_endpoint}{self._suggestions_endpoint}"
-            logger.debug(f'DataForSEO url="{url}" with data="{data}".')
-            sugg_data = await self.post(url=url, headers=self._headers, data=data)
-        except Exception as e:
-            logger.error(f"DataForSEO suggested search failed with error: {e}.")
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            search_term=search_term, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
+                sugg_data = await self.post(url=url, headers=self._headers, data=data)
 
         # Extract the keywords from the response
-        try:
-            keywords = self._extract_suggested_keywords(data=sugg_data)
-        except Exception as e:
-            logger.error(
-                f"Failed to extract suggested keywords from DataForSEO response with error: {e}."
-            )
+        keywords = self._extract_suggested_keywords(data=sugg_data)
 
         logger.debug(f"Found {len(keywords)} suggestions from DataForSEO search.")
         return keywords
@@ -271,22 +296,36 @@ class Enricher(AsyncClient):
             language: The language to use for the search.
             n_terms: The number of additional terms
         """
-        # Get the additional keywords
         logger.info(
             f'Applying enrichment for search_term="{search_term}" and n_terms="{n_terms}".'
         )
-        suggested = await self._get_suggested_keywords(
-            search_term=search_term,
-            location=location,
-            language=language,
-            limit=n_terms,
-        )
-        related = await self._get_related_keywords(
-            search_term=search_term,
-            location=location,
-            language=language,
-            limit=n_terms,
-        )
+        # Get the additional suggested keywords
+        try:
+            suggested = await self._get_suggested_keywords(
+                search_term=search_term,
+                location=location,
+                language=language,
+                limit=n_terms,
+            )
+        except Exception as e:
+            logger.error(
+                f"Error fetching suggested keywords for search_term='{search_term}': {e}"
+            )
+            suggested = []
+
+        # Get the additional related keywords
+        try:
+            related = await self._get_related_keywords(
+                search_term=search_term,
+                location=location,
+                language=language,
+                limit=n_terms,
+            )
+        except Exception as e:
+            logger.error(
+                f"Error fetching related keywords for search_term='{search_term}': {e}"
+            )
+            related = []
 
         # Remove original keyword and aggregate them by volume
         keywords = [kw for kw in suggested + related if kw.text != search_term]
fraudcrawler/scraping/serp.py CHANGED
@@ -1,4 +1,3 @@
-import asyncio
 from enum import Enum
 import logging
 from pydantic import BaseModel
@@ -6,8 +5,11 @@ from typing import List
 from urllib.parse import urlparse
 import re
 
-from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
+from tenacity import RetryCallState
+
+from fraudcrawler.settings import SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
+from fraudcrawler.base.retry import get_async_retry
 
 logger = logging.getLogger(__name__)
 
@@ -42,20 +44,14 @@ class SerpApi(AsyncClient):
     def __init__(
         self,
         api_key: str,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
     ):
         """Initializes the SerpApiClient with the given API key.
 
         Args:
             api_key: The API key for SerpApi.
-            max_retries: Maximum number of retries for API calls.
-            retry_delay: Delay between retries in seconds.
         """
         super().__init__()
         self._api_key = api_key
-        self._max_retries = max_retries
-        self._retry_delay = retry_delay
 
     def _get_domain(self, url: str) -> str:
         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
@@ -116,6 +112,31 @@ class SerpApi(AsyncClient):
 
         return urls
 
+    @staticmethod
+    def _log_before(search_string: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'Performing SerpAPI search with q="{search_string}" '
+                f"(attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(
+        search_string: str, retry_state: RetryCallState | None
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of SerpAPI search with q="{search_string}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
     async def _search(
         self,
         engine: str,
@@ -172,25 +193,21 @@ class SerpApi(AsyncClient):
             "num": num_results,
             "api_key": self._api_key,
         }
-
-        # Perform the request
-        attempts = 0
-        err = None
-        while attempts < self._max_retries:
-            try:
-                logger.debug(
-                    f'Performing SerpAPI search with q="{search_string}" (Attempt {attempts + 1}).'
-                )
+        logger.debug(f"SerpAPI search with params: {params}")
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            search_string=search_string, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            search_string=search_string, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
                 response = await self.get(url=self._endpoint, params=params)
-                break
-            except Exception as e:
-                logger.error(f"SerpAPI search failed with error: {e}.")
-                err = e
-                attempts += 1
-                if attempts < self._max_retries:
-                    await asyncio.sleep(self._retry_delay)
-        if err is not None:
-            raise err
 
         # Extract the URLs from the response
         urls = self._extract_search_results(response=response, engine=engine)
fraudcrawler/scraping/zyte.py CHANGED
@@ -1,16 +1,13 @@
-import asyncio
 import logging
 from typing import List
 from base64 import b64decode
 
 import aiohttp
+from tenacity import RetryCallState
 
-from fraudcrawler.settings import (
-    MAX_RETRIES,
-    RETRY_DELAY,
-    ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
-)
+from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
 from fraudcrawler.base.base import AsyncClient
+from fraudcrawler.base.retry import get_async_retry
 
 logger = logging.getLogger(__name__)
 
@@ -34,19 +31,32 @@ class ZyteApi(AsyncClient):
     def __init__(
         self,
         api_key: str,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
     ):
         """Initializes the ZyteApiClient with the given API key and retry configurations.
 
         Args:
             api_key: The API key for Zyte API.
-            max_retries: Maximum number of retries for API calls.
-            retry_delay: Delay between retries in seconds.
         """
         self._aiohttp_basic_auth = aiohttp.BasicAuth(api_key)
-        self._max_retries = max_retries
-        self._retry_delay = retry_delay
+
+    def _log_before(self, url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f"Zyte fetching product details for URL {url} (Attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    def _log_before_sleep(self, url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of Zyte fetching product details for URL "{url}" '
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
     async def get_details(self, url: str) -> dict:
         """Fetches product details for a single URL.
@@ -74,30 +84,25 @@ class ZyteApi(AsyncClient):
             }
         """
         logger.info(f"Fetching product details by Zyte for URL {url}.")
-        attempts = 0
-        err = None
-        while attempts < self._max_retries:
-            try:
-                logger.debug(
-                    f"Fetch product details for URL {url} (Attempt {attempts + 1})."
-                )
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
                 product = await self.post(
                     url=self._endpoint,
                     data={"url": url, **self._config},
                     auth=self._aiohttp_basic_auth,
                 )
-                return product
-            except Exception as e:
-                logger.debug(
-                    f"Exception occurred while fetching product details for URL {url} (Attempt {attempts + 1})."
-                )
-                err = e
-                attempts += 1
-                if attempts < self._max_retries:
-                    await asyncio.sleep(self._retry_delay)
-        if err is not None:
-            raise err
-        return {}
+        return product
 
     @staticmethod
     def keep_product(
fraudcrawler/settings.py CHANGED
@@ -2,10 +2,18 @@ from pathlib import Path
 from typing import List
 
 # Generic settings
-MAX_RETRIES = 3
-RETRY_DELAY = 2
 ROOT_DIR = Path(__file__).parents[1]
 
+# Service retry settings
+# With the following setup (neglecting the jitter) we have 6 attempts with delays:
+# 0s, 1s, 4s, 16s, 64s, 64s (because of the max delay)
+RETRY_STOP_AFTER_ATTEMPT = 6
+RETRY_INITIAL_DELAY = 1
+RETRY_MAX_DELAY = 64
+RETRY_EXP_BASE = 4
+RETRY_JITTER = 1
+RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes
+
 # Serp settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
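Assuming tenacity's wait_exponential_jitter waits roughly initial * exp_base ** (attempt - 1) seconds, capped at max, the schedule quoted in the comment above can be reproduced as follows (jitter ignored; this snippet is illustrative, not part of the package):

# Waits after attempts 1-5 (no wait follows the 6th and final attempt).
delays = [
    min(RETRY_INITIAL_DELAY * RETRY_EXP_BASE**n, RETRY_MAX_DELAY)
    for n in range(RETRY_STOP_AFTER_ATTEMPT - 1)
]
# -> [1, 4, 16, 64, 64]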
@@ -15,14 +23,43 @@ SERP_DEFAULT_COUNTRY_CODES: List[str] = [
 
 # URL De-duplication settings
 KNOWN_TRACKERS = [
-    "srsltid",
-    "utm_source",
-    "utm_medium",
-    "utm_campaign",
-    "utm_term",
-    "utm_content",
-    "ar",
-    "ps",
+    "srsltid",  # Search result click ID (used by some search engines)
+    "utm_source",  # UTM: Source of the traffic (e.g., Google, Newsletter)
+    "utm_medium",  # UTM: Medium such as CPC, email, social
+    "utm_campaign",  # UTM: Campaign name (e.g., summer_sale)
+    "utm_term",  # UTM: Keyword term (used in paid search)
+    "utm_content",  # UTM: Used to differentiate similar links or ads
+    "ar",  # Often used for ad region or targeting info
+    "ps",  # Could refer to promotion source or partner segment
+    "gclid",  # Google Ads click ID (auto-tagging)
+    "gclsrc",  # Source of the GCLID (e.g., ads, search)
+    "sku",  # Product SKU identifier, often used in ecommerce links
+    "ref",  # Referrer username or source (e.g., GitHub ref links)
+    "referral",  # Alternate form of referrer, often human-readable
+    "aff_id",  # Affiliate identifier (ID-based)
+    "aff",  # Short form for affiliate tag
+    "affiliate",  # Affiliate tracking parameter (human-readable)
+    "partner",  # Indicates marketing or distribution partner
+    "fbclid",  # Facebook Click Identifier
+    "msclkid",  # Microsoft/Bing Ads click identifier
+    "twclid",  # Twitter Ads click identifier
+    "variant",  # A/B test variant (used to test versions of pages)
+    "session_id",  # Session tracking ID, should not persist across URLs
+    "track",  # Generic flag used to enable/disable tracking
+    "cid",  # Campaign ID (used in ads or emails)
+    "campaignid",  # Alternate or long-form campaign ID
+    "adgroup",  # Ad group identifier for campaigns
+    "bannerid",  # Specific banner ad ID (for display ad tracking)
+    "token",  # Often used to identify users or temporary sessions
+    "tag",  # Affiliate or marketing tag (used for tracking)
+    "hash",  # Generic hash identifier, often for state or cache
+    "user",  # User ID or identifier passed in URL (should be avoided)
+    "src",  # Generic source indicator, less formal than `utm_source`
+    "selsort",  # Sorting parameter for search results
+    "shid",  # Shop ID (used in ecommerce)
+    "shoparea",  # Shop area (used in ecommerce)
+    "shopid",  # Shop ID (used in ecommerce)
+    "shoparea",  # Shop area (used in ecommerce)
 ]
 
 # Enrichment settings
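The expanded KNOWN_TRACKERS list feeds the URL de-duplication step (url.py, unchanged in this release). An illustrative sketch of how such a parameter list is typically applied before comparing URLs; this is not the package's actual implementation:

# Hypothetical helper: strip known tracking parameters from a URL before de-duplication.
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse

def strip_trackers(url: str, trackers: list[str]) -> str:
    parts = urlparse(url)
    query = [(k, v) for k, v in parse_qsl(parts.query) if k not in trackers]
    return urlunparse(parts._replace(query=urlencode(query)))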
@@ -34,6 +71,7 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
 # Processor settings
 PROCESSOR_DEFAULT_MODEL = "gpt-4o"
 PROCESSOR_DEFAULT_IF_MISSING = -1
+PROCESSOR_EMPTY_TOKEN_COUNT = -1
 PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
 PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
 
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.4.3
+Version: 0.4.6
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -18,6 +18,7 @@ Requires-Dist: openai (>=1.68.2,<2.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
 Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: tenacity (>=9.1.2,<10.0.0)
 Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
 Description-Content-Type: text/markdown
 
@@ -0,0 +1,22 @@
+fraudcrawler/__init__.py,sha256=zAqnJ9Mewq0qzSfOjyaICyqDRQZE_Z3FmyF2IPdOhXo,788
+fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/base/base.py,sha256=pYGdRV_Ssw5fA6tLVhlZwAO0OLQl6qn6LgJPCzOCrpc,6258
+fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
+fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+fraudcrawler/base/orchestrator.py,sha256=UzqEtC7Szw1-Ic31lex04Mgpf2f7MM-odwhC0gTxN2Q,23566
+fraudcrawler/base/retry.py,sha256=OKdOed7mP2VLYJLi1zo0MC8ISMm7k3gZgtNuqn50NhI,995
+fraudcrawler/launch_demo_pipeline.py,sha256=CX4A-E63ER7Ip9RNI_IyTAXerYXcQ-NoSvhvLDLdP-s,4640
+fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/processing/processor.py,sha256=-QdLiAhdPLdYWcMvbKmuPQ_WlvFEDpmEXNps1QGChvQ,7421
+fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/scraping/enrich.py,sha256=1vRGUtF9F8aw46qjKSUiVqGXLdRPaUmI8e5Bu-ZYt8Y,12398
+fraudcrawler/scraping/serp.py,sha256=aTsrH9R9yOpEH_ga-h1BylAtVl4sf9eHIaCv798GLEE,18782
+fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
+fraudcrawler/scraping/zyte.py,sha256=Pv0i2Ni6oamIo_aFdG9c-Kon0PM6oTmMgVYdT3KwvYo,7602
+fraudcrawler/settings.py,sha256=zoNd4LCBL1JNfICiYlLkggw8rGr_tkFc7rrE1morLKI,3442
+fraudcrawler-0.4.6.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.4.6.dist-info/METADATA,sha256=z1dneOJNzGU4cIEEOs0kTAdibcdjYBQnrUKb8N5rOSg,5973
+fraudcrawler-0.4.6.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
+fraudcrawler-0.4.6.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.4.6.dist-info/RECORD,,
@@ -1,21 +0,0 @@
-fraudcrawler/__init__.py,sha256=zAqnJ9Mewq0qzSfOjyaICyqDRQZE_Z3FmyF2IPdOhXo,788
-fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/base/base.py,sha256=JWjZ3mpX4caQAsWKYqtHrUqHfHr6GXlAaEjxxHV9ODQ,6020
-fraudcrawler/base/client.py,sha256=FibiYycjUys-c4sv66Y2JqJu5y15be2MYd2_9yB3wG8,4936
-fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
-fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
-fraudcrawler/base/orchestrator.py,sha256=xOMxA0zPUXSF8AGY5AUqzsOO9LfRIjxI2HuZf__Z_sI,24689
-fraudcrawler/launch_demo_pipeline.py,sha256=CX4A-E63ER7Ip9RNI_IyTAXerYXcQ-NoSvhvLDLdP-s,4640
-fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/processing/processor.py,sha256=An2orst0YRIav7bFuoDMgjwWz2Z9dyjVUbkNAMXNTTo,3748
-fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
-fraudcrawler/scraping/serp.py,sha256=divEp1UBUsws24PWZABhWIxOmaLqLwdeGn4KNrqWkYA,17865
-fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
-fraudcrawler/scraping/zyte.py,sha256=DUF5pIwpZyQw30qURnFxtp8KYpUgBkrXjM7RaVGH92Q,7005
-fraudcrawler/settings.py,sha256=31jvRFfB-gsVbeidLLl4iQgrFL7GH-824lerIniPI08,1017
-fraudcrawler-0.4.3.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
-fraudcrawler-0.4.3.dist-info/METADATA,sha256=jlk2WdtXEK0-s6QRQdI96EBpQiyHWKgJiYeW93yiU24,5931
-fraudcrawler-0.4.3.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
-fraudcrawler-0.4.3.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
-fraudcrawler-0.4.3.dist-info/RECORD,,