fraudcrawler 0.4.5__tar.gz → 0.4.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic.
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/PKG-INFO +2 -1
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/base/orchestrator.py +6 -35
- fraudcrawler-0.4.7/fraudcrawler/base/retry.py +37 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/launch_demo_pipeline.py +2 -3
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/processing/processor.py +81 -13
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/scraping/enrich.py +69 -30
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/scraping/serp.py +43 -26
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/scraping/zyte.py +36 -31
- fraudcrawler-0.4.7/fraudcrawler/settings.py +81 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/pyproject.toml +2 -1
- fraudcrawler-0.4.5/fraudcrawler/settings.py +0 -73
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/LICENSE +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/README.md +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/base/base.py +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/scraping/url.py +0 -0
{fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.4.5
+Version: 0.4.7
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -17,6 +17,7 @@ Requires-Dist: openai (>=1.68.2,<2.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
 Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: tenacity (>=9.1.2,<10.0.0)
 Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
 Description-Content-Type: text/markdown

{fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/base/orchestrator.py

@@ -7,10 +7,6 @@ from bs4 import BeautifulSoup

 from fraudcrawler.settings import (
     PROCESSOR_DEFAULT_MODEL,
-    PROCESSOR_DEFAULT_IF_MISSING,
-    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
-    MAX_RETRIES,
-    RETRY_DELAY,
 )
 from fraudcrawler.settings import (
     DEFAULT_N_SERP_WKRS,
@@ -61,9 +57,6 @@ class Orchestrator(ABC):
         zyteapi_key: str,
         openaiapi_key: str,
         openai_model: str = PROCESSOR_DEFAULT_MODEL,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
-        default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
         n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
         n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
         n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
@@ -77,25 +70,18 @@
             zyteapi_key: The API key for Zyte API.
             openaiapi_key: The API key for OpenAI.
             openai_model: The model to use for the processing (optional).
-            max_retries: Maximum number of retries for API calls (optional).
-            retry_delay: Delay between retries in seconds (optional).
             n_serp_wkrs: Number of async workers for serp (optional).
             n_zyte_wkrs: Number of async workers for zyte (optional).
             n_proc_wkrs: Number of async workers for the processor (optional).
         """
         # Setup the clients
-        self._serpapi = SerpApi(
-            api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
-        )
+        self._serpapi = SerpApi(api_key=serpapi_key)
         self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
         self._url_collector = URLCollector()
-        self._zyteapi = ZyteApi(
-            api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
-        )
+        self._zyteapi = ZyteApi(api_key=zyteapi_key)
         self._processor = Processor(
             api_key=openaiapi_key,
             model=openai_model,
-            default_if_missing=default_if_missing,
         )

         # Setup the async framework
@@ -249,7 +235,6 @@
        """

        # Process the products
-
        while True:
            product = await queue_in.get()
            if product is None:
@@ -259,27 +244,11 @@

            if not product.filtered:
                try:
-                    url = product.url
                    # Run all the configured prompts
                    for prompt in prompts:
-                        # Dynamically build product_details string
-                        details = []
-                        for field in prompt.product_item_fields:
-                            value = getattr(product, field, None)
-                            if value is not None:
-                                details.append(
-                                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
-                                        field_name=field, field_value=value
-                                    )
-                                )
-                        product_details = "\n\n".join(details)
-                        logger.debug(
-                            f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
-                        )
                        classification = await self._processor.classify(
+                            product=product,
                            prompt=prompt,
-                            url=url,
-                            product_details=product_details,
                        )
                        product.classifications[prompt.name] = int(
                            classification.result
@@ -289,7 +258,9 @@
                            "output_tokens": classification.output_tokens,
                        }
                except Exception as e:
-                    logger.warning(
+                    logger.warning(
+                        f"Error processing product with url={product.url}: {e}."
+                    )

            await queue_out.put(product)
            queue_in.task_done()
fraudcrawler-0.4.7/fraudcrawler/base/retry.py (new file)

@@ -0,0 +1,37 @@
+from aiohttp.web_exceptions import HTTPException
+from tenacity import (
+    AsyncRetrying,
+    retry_if_exception,
+    stop_after_attempt,
+    wait_exponential_jitter,
+)
+
+from fraudcrawler.settings import (
+    RETRY_STOP_AFTER_ATTEMPT,
+    RETRY_INITIAL_DELAY,
+    RETRY_MAX_DELAY,
+    RETRY_EXP_BASE,
+    RETRY_JITTER,
+    RETRY_SKIP_IF_CODE,
+)
+
+
+def _is_retryable_exception(err: BaseException) -> bool:
+    if isinstance(err, HTTPException) and err.status_code in RETRY_SKIP_IF_CODE:
+        return False
+    return True
+
+
+def get_async_retry() -> AsyncRetrying:
+    """returns the retry configuration for async operations."""
+    return AsyncRetrying(
+        retry=retry_if_exception(_is_retryable_exception),
+        stop=stop_after_attempt(RETRY_STOP_AFTER_ATTEMPT),
+        wait=wait_exponential_jitter(
+            initial=RETRY_INITIAL_DELAY,
+            max=RETRY_MAX_DELAY,
+            exp_base=RETRY_EXP_BASE,
+            jitter=RETRY_JITTER,
+        ),
+        reraise=True,
+    )
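The new fraudcrawler/base/retry.py module centralises the retry policy that the per-client max_retries/retry_delay loops used to implement. For readers unfamiliar with tenacity's async protocol, the following is a minimal sketch of how the clients in this release consume the helper; the flaky_call stand-in and the fetch_with_retry wrapper are illustrative only and are not part of the package.

import logging

from fraudcrawler.base.retry import get_async_retry

logger = logging.getLogger(__name__)


async def flaky_call(url: str) -> dict:
    # Stand-in for the real clients' self.get()/self.post() coroutines.
    raise RuntimeError(f"transient failure for {url}")


async def fetch_with_retry(url: str) -> dict:
    retry = get_async_retry()
    # Optional context-aware logging, mirroring the _log_before/_log_before_sleep
    # hooks attached by SerpApi, ZyteApi, Enricher and Processor below.
    retry.before_sleep = lambda retry_state: logger.warning(
        "Attempt %s for %s failed; sleeping before the next try.",
        retry_state.attempt_number,
        url,
    )
    # tenacity's AsyncRetrying protocol: iterate attempts and run the body inside `with attempt`.
    async for attempt in retry:
        with attempt:
            return await flaky_call(url)

Because get_async_retry() is configured with reraise=True, the last exception propagates to the caller once RETRY_STOP_AFTER_ATTEMPT attempts are exhausted, which is why the orchestrator still wraps classify() in its own try/except.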
{fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/launch_demo_pipeline.py

@@ -8,12 +8,11 @@ DATE_FMT = "%Y-%m-%d %H:%M:%S"
 logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)


-def main():
+def search(search_term: str):
     # Setup the client
     client = FraudCrawlerClient()

     # Setup the search
-    search_term = "Medion Kühlbox MD 37454"
     language = Language(name="German")
     location = Location(name="Switzerland")
     deepness = Deepness(num_results=10)
@@ -98,4 +97,4 @@ def main():


 if __name__ == "__main__":
-    main()
+    search(search_term = "Medion Kühlbox MD 37454")
{fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/processing/processor.py

@@ -1,9 +1,12 @@
 import logging

 from openai import AsyncOpenAI
+from tenacity import RetryCallState

-from fraudcrawler.base.base import Prompt, ClassificationResult
+from fraudcrawler.base.base import ProductItem, Prompt, ClassificationResult
+from fraudcrawler.base.retry import get_async_retry
 from fraudcrawler.settings import (
+    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
     PROCESSOR_USER_PROMPT_TEMPLATE,
     PROCESSOR_DEFAULT_IF_MISSING,
     PROCESSOR_EMPTY_TOKEN_COUNT,
@@ -39,6 +42,50 @@ class Processor:
             output_tokens=empty_token_count,
         )

+    @staticmethod
+    def _get_product_details(product: ProductItem, prompt: Prompt) -> str:
+        """Extracts product details based on the prompt configuration.
+
+        Args:
+            product: The product item to extract details from.
+            prompt: The prompt configuration containing field names.
+        """
+        details = []
+        for field in prompt.product_item_fields:
+            if value := getattr(product, field, None):
+                details.append(
+                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
+                        field_name=field, field_value=value
+                    )
+                )
+            else:
+                logger.error(
+                    f'Field "{field}" is missing in ProductItem with url="{product.url}"'
+                )
+        return "\n\n".join(details)
+
+    @staticmethod
+    def _log_before(url: str, prompt: Prompt, retry_state: RetryCallState) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(
+        url: str, prompt: Prompt, retry_state: RetryCallState
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+
     async def _call_openai_api(
         self,
         system_prompt: str,
@@ -59,7 +106,12 @@
             raise ValueError("Empty response from OpenAI API")

         # Convert the content to an integer
-        content = int(content.strip())
+        try:
+            content = int(content.strip())
+        except Exception as e:
+            msg = f"Failed to convert OpenAI response '{content}' to integer: {e}"
+            logger.error(msg)
+            raise ValueError(msg)

         # For tracking consumption we alre return the tokens used
         classification = ClassificationResult(
@@ -71,15 +123,16 @@
         return classification

     async def classify(
-        self,
+        self,
+        product: ProductItem,
+        prompt: Prompt,
     ) -> ClassificationResult:
         """A generic classification method that classifies a product based on a prompt object and returns
         the classification, input tokens, and output tokens.

         Args:
-
-
-            product_details: String with product details, formatted per prompt.product_item_fields.
+            product: The product item to classify.
+            prompt: The prompt to use for classification.

         Note:
             This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
@@ -87,12 +140,15 @@
             - an error occurs during the API call
             - if the response isn't in allowed_classes.
         """
-
+        url = product.url
+
+        # Form the product details from the ProductItem
+        product_details = self._get_product_details(product=product, prompt=prompt)
         if not product_details:
             logger.warning("Missing required product_details for classification.")
             return self._error_response

-        #
+        # Prepare the user prompt
         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
             product_details=product_details,
         )
@@ -100,13 +156,25 @@
         # Call the OpenAI API
         try:
             logger.debug(
-                f
+                f"Classifying product with url={url} using prompt={prompt.name} and user_prompt={user_prompt}."
+            )
+            # Perform the request and retry if necessary. There is some context aware logging
+            # - `before`: before the request is made (or before retrying)
+            # - `before_sleep`: if the request fails before sleeping
+            retry = get_async_retry()
+            retry.before = lambda retry_state: self._log_before(
+                url=url, prompt=prompt, retry_state=retry_state
             )
-
-
-                user_prompt=user_prompt,
-                max_tokens=1,
+            retry.before_sleep = lambda retry_state: self._log_before_sleep(
+                url=url, prompt=prompt, retry_state=retry_state
             )
+            async for attempt in retry:
+                with attempt:
+                    classification = await self._call_openai_api(
+                        system_prompt=prompt.system_prompt,
+                        user_prompt=user_prompt,
+                        max_tokens=1,
+                    )

             # Enforce that the classification is in the allowed classes
             if classification.result not in prompt.allowed_classes:
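To make the new prompt-building path concrete: _get_product_details renders each field named in prompt.product_item_fields through PROCESSOR_PRODUCT_DETAILS_TEMPLATE and joins the pieces with blank lines before they are wrapped in PROCESSOR_USER_PROMPT_TEMPLATE. The snippet below reproduces that formatting with hypothetical field names (name, description); the real field list comes from the Prompt configuration.

PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"

# Hypothetical ProductItem fields; the actual ones are listed in prompt.product_item_fields.
fields = {"name": "Medion Kühlbox MD 37454", "description": "Portable cooler box"}

product_details = "\n\n".join(
    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(field_name=k, field_value=v)
    for k, v in fields.items()
)
print(PROCESSOR_USER_PROMPT_TEMPLATE.format(product_details=product_details))
# Product Details:
# name:
# Medion Kühlbox MD 37454
#
# description:
# Portable cooler box
#
# Relevance: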
{fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/scraping/enrich.py

@@ -4,8 +4,11 @@ import logging
 from pydantic import BaseModel
 from typing import Dict, List, Iterator

+from tenacity import RetryCallState
+
 from fraudcrawler.settings import ENRICHMENT_DEFAULT_LIMIT
 from fraudcrawler.base.base import Location, Language, AsyncClient
+from fraudcrawler.base.retry import get_async_retry


 logger = logging.getLogger(__name__)
@@ -22,8 +25,6 @@ class Enricher(AsyncClient):
     """A client to interact with the DataForSEO API for enhancing searches (producing alternative search_terms)."""

     _auth_encoding = "ascii"
-    _max_retries = 3
-    _retry_delay = 2
     _base_endpoint = "https://api.dataforseo.com"
     _suggestions_endpoint = "/v3/dataforseo_labs/google/keyword_suggestions/live"
     _keywords_endpoint = "/v3/dataforseo_labs/google/related_keywords/live"
@@ -44,6 +45,28 @@
         "Content-Encoding": "gzip",
     }

+    @staticmethod
+    def _log_before(search_term: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'DataForSEO suggested search with search="{search_term}" (attempt {retry_state.attempt_number}).'
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(search_term: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} DataForSEO suggested search with search_term="{search_term}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before_sleep.")
+
     @staticmethod
     def _extract_items_from_data(data: dict) -> Iterator[dict]:
         """Extracts the items from the DataForSEO response.
@@ -126,7 +149,7 @@
             limit: The upper limit of suggestions to get.
         """

-        # Data must be a list of dictionaries setting a number of search tasks; here we only have one task.
+        # Data must be a list of dictionaries, setting a number of search tasks; here we only have one task.
         data = [
             {
                 "keyword": search_term,
@@ -137,23 +160,25 @@
                 "include_seed_keyword": True,
             }
         ]
-
-
+        url = f"{self._base_endpoint}{self._suggestions_endpoint}"
+        logger.debug(f'DataForSEO url="{url}" with data="{data}".')
+
+        # Perform the request and retry if necessary. There is some context aware logging
+        # - `before`: before the request is made (or before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            search_term=search_term, retry_state=retry_state
         )
-
-
-
-
-
-
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            search_term=search_term, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
+                sugg_data = await self.post(url=url, headers=self._headers, data=data)

         # Extract the keywords from the response
-
-        keywords = self._extract_suggested_keywords(data=sugg_data)
-        except Exception as e:
-            logger.error(
-                f"Failed to extract suggested keywords from DataForSEO response with error: {e}."
-            )
+        keywords = self._extract_suggested_keywords(data=sugg_data)

         logger.debug(f"Found {len(keywords)} suggestions from DataForSEO search.")
         return keywords
@@ -271,22 +296,36 @@
             language: The language to use for the search.
             n_terms: The number of additional terms
         """
-        # Get the additional keywords
         logger.info(
             f'Applying enrichment for search_term="{search_term}" and n_terms="{n_terms}".'
         )
-
-
-
-
-
-
-
-
-
-
-
-
+        # Get the additional suggested keywords
+        try:
+            suggested = await self._get_suggested_keywords(
+                search_term=search_term,
+                location=location,
+                language=language,
+                limit=n_terms,
+            )
+        except Exception as e:
+            logger.error(
+                f"Error fetching suggested keywords for search_term='{search_term}': {e}"
+            )
+            suggested = []
+
+        # Get the additional related keywords
+        try:
+            related = await self._get_related_keywords(
+                search_term=search_term,
+                location=location,
+                language=language,
+                limit=n_terms,
+            )
+        except Exception as e:
+            logger.error(
+                f"Error fetching related keywords for search_term='{search_term}': {e}"
+            )
+            related = []

         # Remove original keyword and aggregate them by volume
         keywords = [kw for kw in suggested + related if kw.text != search_term]
{fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/scraping/serp.py

@@ -1,4 +1,3 @@
-import asyncio
 from enum import Enum
 import logging
 from pydantic import BaseModel
@@ -6,8 +5,11 @@ from typing import List
 from urllib.parse import urlparse
 import re

-from
+from tenacity import RetryCallState
+
+from fraudcrawler.settings import SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
+from fraudcrawler.base.retry import get_async_retry

 logger = logging.getLogger(__name__)

@@ -42,20 +44,14 @@ class SerpApi(AsyncClient):
     def __init__(
         self,
         api_key: str,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
     ):
         """Initializes the SerpApiClient with the given API key.

         Args:
             api_key: The API key for SerpApi.
-            max_retries: Maximum number of retries for API calls.
-            retry_delay: Delay between retries in seconds.
         """
         super().__init__()
         self._api_key = api_key
-        self._max_retries = max_retries
-        self._retry_delay = retry_delay

     def _get_domain(self, url: str) -> str:
         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
@@ -116,6 +112,31 @@

         return urls

+    @staticmethod
+    def _log_before(search_string: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'Performing SerpAPI search with q="{search_string}" '
+                f"(attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}, not logging before.")
+
+    @staticmethod
+    def _log_before_sleep(
+        search_string: str, retry_state: RetryCallState | None
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of SerpAPI search with q="{search_string}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
     async def _search(
         self,
         engine: str,
@@ -172,25 +193,21 @@
             "num": num_results,
             "api_key": self._api_key,
         }
-
-
-
-
-
-
-
-
-
+        logger.debug(f"SerpAPI search with params: {params}")
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            search_string=search_string, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            search_string=search_string, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
                 response = await self.get(url=self._endpoint, params=params)
-                break
-            except Exception as e:
-                logger.error(f"SerpAPI search failed with error: {e}.")
-                err = e
-                attempts += 1
-                if attempts < self._max_retries:
-                    await asyncio.sleep(self._retry_delay)
-        if err is not None:
-            raise err

         # Extract the URLs from the response
         urls = self._extract_search_results(response=response, engine=engine)
{fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/fraudcrawler/scraping/zyte.py

@@ -1,16 +1,13 @@
-import asyncio
 import logging
 from typing import List
 from base64 import b64decode

 import aiohttp
+from tenacity import RetryCallState

-from fraudcrawler.settings import (
-    MAX_RETRIES,
-    RETRY_DELAY,
-    ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
-)
+from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
 from fraudcrawler.base.base import AsyncClient
+from fraudcrawler.base.retry import get_async_retry

 logger = logging.getLogger(__name__)

@@ -34,19 +31,32 @@ class ZyteApi(AsyncClient):
     def __init__(
         self,
         api_key: str,
-        max_retries: int = MAX_RETRIES,
-        retry_delay: int = RETRY_DELAY,
     ):
         """Initializes the ZyteApiClient with the given API key and retry configurations.

         Args:
             api_key: The API key for Zyte API.
-            max_retries: Maximum number of retries for API calls.
-            retry_delay: Delay between retries in seconds.
         """
         self._aiohttp_basic_auth = aiohttp.BasicAuth(api_key)
-
-
+
+    def _log_before(self, url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f"Zyte fetching product details for URL {url} (Attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    def _log_before_sleep(self, url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of Zyte fetching product details for URL "{url}" '
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")

     async def get_details(self, url: str) -> dict:
         """Fetches product details for a single URL.
@@ -74,30 +84,25 @@
         }
         """
         logger.info(f"Fetching product details by Zyte for URL {url}.")
-
-
-
-
-
-
-
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
                 product = await self.post(
                     url=self._endpoint,
                     data={"url": url, **self._config},
                     auth=self._aiohttp_basic_auth,
                 )
-
-            except Exception as e:
-                logger.debug(
-                    f"Exception occurred while fetching product details for URL {url} (Attempt {attempts + 1})."
-                )
-                err = e
-                attempts += 1
-                if attempts < self._max_retries:
-                    await asyncio.sleep(self._retry_delay)
-        if err is not None:
-            raise err
-        return {}
+        return product

     @staticmethod
     def keep_product(
fraudcrawler-0.4.7/fraudcrawler/settings.py (new file)

@@ -0,0 +1,81 @@
+from pathlib import Path
+from typing import List
+
+# Generic settings
+ROOT_DIR = Path(__file__).parents[1]
+
+# Service retry settings
+# With the following setup (neglecting the jitter) we have 6 attempts with delays:
+# 0s, 1s, 4s, 16s, 64s, 64s (because of the max delay)
+RETRY_STOP_AFTER_ATTEMPT = 6
+RETRY_INITIAL_DELAY = 1
+RETRY_MAX_DELAY = 64
+RETRY_EXP_BASE = 4
+RETRY_JITTER = 1
+RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes
+
+# Serp settings
+GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
+GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+    # ".com",
+]
+
+# URL De-duplication settings
+KNOWN_TRACKERS = [
+    "srsltid",  # Search result click ID (used by some search engines)
+    "utm_source",  # UTM: Source of the traffic (e.g., Google, Newsletter)
+    "utm_medium",  # UTM: Medium such as CPC, email, social
+    "utm_campaign",  # UTM: Campaign name (e.g., summer_sale)
+    "utm_term",  # UTM: Keyword term (used in paid search)
+    "utm_content",  # UTM: Used to differentiate similar links or ads
+    "ar",  # Often used for ad region or targeting info
+    "ps",  # Could refer to promotion source or partner segment
+    "gclid",  # Google Ads click ID (auto-tagging)
+    "gclsrc",  # Source of the GCLID (e.g., ads, search)
+    "sku",  # Product SKU identifier, often used in ecommerce links
+    "ref",  # Referrer username or source (e.g., GitHub ref links)
+    "referral",  # Alternate form of referrer, often human-readable
+    "aff_id",  # Affiliate identifier (ID-based)
+    "aff",  # Short form for affiliate tag
+    "affiliate",  # Affiliate tracking parameter (human-readable)
+    "partner",  # Indicates marketing or distribution partner
+    "fbclid",  # Facebook Click Identifier
+    "msclkid",  # Microsoft/Bing Ads click identifier
+    "twclid",  # Twitter Ads click identifier
+    "variant",  # A/B test variant (used to test versions of pages)
+    "session_id",  # Session tracking ID, should not persist across URLs
+    "track",  # Generic flag used to enable/disable tracking
+    "cid",  # Campaign ID (used in ads or emails)
+    "campaignid",  # Alternate or long-form campaign ID
+    "adgroup",  # Ad group identifier for campaigns
+    "bannerid",  # Specific banner ad ID (for display ad tracking)
+    "token",  # Often used to identify users or temporary sessions
+    "tag",  # Affiliate or marketing tag (used for tracking)
+    "hash",  # Generic hash identifier, often for state or cache
+    "user",  # User ID or identifier passed in URL (should be avoided)
+    "src",  # Generic source indicator, less formal than `utm_source`
+    "selsort",  # Sorting parameter for search results
+    "shid",  # Shop ID (used in ecommerce)
+    "shoparea",  # Shop area (used in ecommerce)
+    "shopid",  # Shop ID (used in ecommerce)
+    "shoparea",  # Shop area (used in ecommerce)
+]
+
+# Enrichment settings
+ENRICHMENT_DEFAULT_LIMIT = 10
+
+# Zyte settings
+ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
+
+# Processor settings
+PROCESSOR_DEFAULT_MODEL = "gpt-4o"
+PROCESSOR_DEFAULT_IF_MISSING = -1
+PROCESSOR_EMPTY_TOKEN_COUNT = -1
+PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
+
+# Async settings
+DEFAULT_N_SERP_WKRS = 10
+DEFAULT_N_ZYTE_WKRS = 10
+DEFAULT_N_PROC_WKRS = 10
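The retry constants above encode the schedule described in the file's own comment. Assuming tenacity's wait_exponential_jitter semantics (the sleep before attempt n is initial * exp_base ** (n - 1), capped at max, plus up to jitter seconds of random noise), the nominal per-attempt delays can be reproduced as follows:

RETRY_STOP_AFTER_ATTEMPT = 6
RETRY_INITIAL_DELAY = 1
RETRY_MAX_DELAY = 64
RETRY_EXP_BASE = 4

# Delay before each attempt, jitter ignored: the first attempt fires immediately,
# subsequent attempts wait min(max_delay, initial * exp_base ** (n - 1)).
delays = [0] + [
    min(RETRY_MAX_DELAY, RETRY_INITIAL_DELAY * RETRY_EXP_BASE ** (n - 1))
    for n in range(1, RETRY_STOP_AFTER_ATTEMPT)
]
print(delays)  # [0, 1, 4, 16, 64, 64]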
{fraudcrawler-0.4.5 → fraudcrawler-0.4.7}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "fraudcrawler"
-version = "0.4.5"
+version = "0.4.7"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",
@@ -26,6 +26,7 @@ aiohttp = "^3.11.14"
 pydantic-settings = "^2.8.1"
 openai = "^1.68.2"
 beautifulsoup4 = "^4.13.4"
+tenacity = "^9.1.2"

 [tool.poetry.group.dev.dependencies]
 pytest-cov = "^6.0.0"
fraudcrawler-0.4.5/fraudcrawler/settings.py (removed)

@@ -1,73 +0,0 @@
-from pathlib import Path
-from typing import List
-
-# Generic settings
-MAX_RETRIES = 3
-RETRY_DELAY = 2
-ROOT_DIR = Path(__file__).parents[1]
-
-# Serp settings
-GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
-GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
-SERP_DEFAULT_COUNTRY_CODES: List[str] = [
-    # ".com",
-]
-
-# URL De-duplication settings
-KNOWN_TRACKERS = [
-    "srsltid",  # Search result click ID (used by some search engines)
-    "utm_source",  # UTM: Source of the traffic (e.g., Google, Newsletter)
-    "utm_medium",  # UTM: Medium such as CPC, email, social
-    "utm_campaign",  # UTM: Campaign name (e.g., summer_sale)
-    "utm_term",  # UTM: Keyword term (used in paid search)
-    "utm_content",  # UTM: Used to differentiate similar links or ads
-    "ar",  # Often used for ad region or targeting info
-    "ps",  # Could refer to promotion source or partner segment
-    "gclid",  # Google Ads click ID (auto-tagging)
-    "gclsrc",  # Source of the GCLID (e.g., ads, search)
-    "sku",  # Product SKU identifier, often used in ecommerce links
-    "ref",  # Referrer username or source (e.g., GitHub ref links)
-    "referral",  # Alternate form of referrer, often human-readable
-    "aff_id",  # Affiliate identifier (ID-based)
-    "aff",  # Short form for affiliate tag
-    "affiliate",  # Affiliate tracking parameter (human-readable)
-    "partner",  # Indicates marketing or distribution partner
-    "fbclid",  # Facebook Click Identifier
-    "msclkid",  # Microsoft/Bing Ads click identifier
-    "twclid",  # Twitter Ads click identifier
-    "variant",  # A/B test variant (used to test versions of pages)
-    "session_id",  # Session tracking ID, should not persist across URLs
-    "track",  # Generic flag used to enable/disable tracking
-    "cid",  # Campaign ID (used in ads or emails)
-    "campaignid",  # Alternate or long-form campaign ID
-    "adgroup",  # Ad group identifier for campaigns
-    "bannerid",  # Specific banner ad ID (for display ad tracking)
-    "token",  # Often used to identify users or temporary sessions
-    "tag",  # Affiliate or marketing tag (used for tracking)
-    "hash",  # Generic hash identifier, often for state or cache
-    "user",  # User ID or identifier passed in URL (should be avoided)
-    "src",  # Generic source indicator, less formal than `utm_source`
-    "selsort",  # Sorting parameter for search results
-    "shid",  # Shop ID (used in ecommerce)
-    "shoparea",  # Shop area (used in ecommerce)
-    "shopid",  # Shop ID (used in ecommerce)
-    "shoparea",  # Shop area (used in ecommerce)
-]
-
-# Enrichment settings
-ENRICHMENT_DEFAULT_LIMIT = 10
-
-# Zyte settings
-ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
-
-# Processor settings
-PROCESSOR_DEFAULT_MODEL = "gpt-4o"
-PROCESSOR_DEFAULT_IF_MISSING = -1
-PROCESSOR_EMPTY_TOKEN_COUNT = -1
-PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
-PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
-
-# Async settings
-DEFAULT_N_SERP_WKRS = 10
-DEFAULT_N_ZYTE_WKRS = 10
-DEFAULT_N_PROC_WKRS = 10