fraudcrawler 0.4.2__tar.gz → 0.4.5__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of fraudcrawler might be problematic.
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/PKG-INFO +2 -3
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/__init__.py +2 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/base.py +11 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/orchestrator.py +25 -12
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/launch_demo_pipeline.py +1 -1
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/processor.py +34 -14
- fraudcrawler-0.4.5/fraudcrawler/scraping/url.py +57 -0
- fraudcrawler-0.4.5/fraudcrawler/settings.py +73 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/pyproject.toml +1 -1
- fraudcrawler-0.4.2/fraudcrawler/settings.py +0 -31
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/LICENSE +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/README.md +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/serp.py +1 -1
- {fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/zyte.py +0 -0

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.4.2
+Version: 0.4.5
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -11,7 +11,6 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/__init__.py
@@ -1,5 +1,6 @@
 from fraudcrawler.scraping.serp import SerpApi, SearchEngine
 from fraudcrawler.scraping.enrich import Enricher
+from fraudcrawler.scraping.url import URLCollector
 from fraudcrawler.scraping.zyte import ZyteApi
 from fraudcrawler.processing.processor import Processor
 from fraudcrawler.base.orchestrator import Orchestrator
@@ -18,6 +19,7 @@ __all__ = [
     "SerpApi",
     "SearchEngine",
     "Enricher",
+    "URLCollector",
     "ZyteApi",
     "Processor",
     "Orchestrator",
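
For reference, the expanded public API can be smoke-tested with a plain import; this is only an illustrative check, not part of the package, and it assumes fraudcrawler 0.4.5 is installed:

# All names below appear in __all__ in the hunk above.
from fraudcrawler import SerpApi, SearchEngine, Enricher, URLCollector, ZyteApi, Processor, Orchestrator

print(URLCollector().collected_currently)  # a freshly constructed collector starts empty -> set()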

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/base.py
@@ -63,6 +63,14 @@ class Host(BaseModel):
         return [cls._normalize_domain(dom.strip()) for dom in val]
 
 
+class ClassificationResult(BaseModel):
+    """Model for classification results."""
+
+    result: int
+    input_tokens: int
+    output_tokens: int
+
+
 class Location(BaseModel):
     """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
 
@@ -137,6 +145,9 @@ class ProductItem(BaseModel):
     # Processor parameters are set dynamic so we must allow extra fields
     classifications: Dict[str, int] = Field(default_factory=dict)
 
+    # Usage parameters
+    usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
+
     # Filtering parameters
     filtered: bool = False
     filtered_at_stage: str | None = None
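
To show how the new model and the new ProductItem.usage field relate, a minimal sketch (the prompt name "relevance" and the token numbers are invented for illustration):

from fraudcrawler.base.base import ClassificationResult

# One classification outcome plus the token counts of the underlying API call
outcome = ClassificationResult(result=1, input_tokens=250, output_tokens=1)

# ProductItem.usage stores exactly this shape per prompt name, mirroring
# ProductItem.classifications, which stores the bare result
usage_entry = {"input_tokens": outcome.input_tokens, "output_tokens": outcome.output_tokens}
print(outcome.result, usage_entry)  # 1 {'input_tokens': 250, 'output_tokens': 1}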

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/base/orchestrator.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
 import asyncio
 import logging
-from typing import Dict, List,
+from typing import Dict, List, cast
+
 from bs4 import BeautifulSoup
 
 from fraudcrawler.settings import (
@@ -24,7 +25,14 @@ from fraudcrawler.base.base import (
     Prompt,
     ProductItem,
 )
-from fraudcrawler import
+from fraudcrawler import (
+    SerpApi,
+    SearchEngine,
+    Enricher,
+    URLCollector,
+    ZyteApi,
+    Processor,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -75,15 +83,12 @@ class Orchestrator(ABC):
             n_zyte_wkrs: Number of async workers for zyte (optional).
             n_proc_wkrs: Number of async workers for the processor (optional).
         """
-        # Setup the variables
-        self._collected_urls_current_run: Set[str] = set()
-        self._collected_urls_previous_runs: Set[str] = set()
-
         # Setup the clients
         self._serpapi = SerpApi(
             api_key=serpapi_key, max_retries=max_retries, retry_delay=retry_delay
         )
         self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
+        self._url_collector = URLCollector()
        self._zyteapi = ZyteApi(
             api_key=zyteapi_key, max_retries=max_retries, retry_delay=retry_delay
         )
@@ -156,16 +161,18 @@
                 break
 
             if not product.filtered:
-
+                # Clean the URL by removing tracking parameters
+                url = self._url_collector.remove_tracking_parameters(product.url)
+                product.url = url
 
-                if url in self.
+                if url in self._url_collector.collected_currently:
                     # deduplicate on current run
                     product.filtered = True
                     product.filtered_at_stage = (
                         "URL collection (current run deduplication)"
                     )
                     logger.debug(f"URL {url} already collected in current run")
-                elif url in self.
+                elif url in self._url_collector.collected_previously:
                     # deduplicate on previous runs coming from a db
                     product.filtered = True
                     product.filtered_at_stage = (
@@ -173,7 +180,7 @@
                     )
                     logger.debug(f"URL {url} as already collected in previous run")
                 else:
-                    self.
+                    self._url_collector.collected_currently.add(url)
 
                 await queue_out.put(product)
             queue_in.task_done()
@@ -274,7 +281,13 @@
                         url=url,
                         product_details=product_details,
                     )
-                    product.classifications[prompt.name] =
+                    product.classifications[prompt.name] = int(
+                        classification.result
+                    )
+                    product.usage[prompt.name] = {
+                        "input_tokens": classification.input_tokens,
+                        "output_tokens": classification.output_tokens,
+                    }
                 except Exception as e:
                     logger.warning(f"Error processing product: {e}.")
 
@@ -480,7 +493,7 @@
         # INITIAL SETUP
         # ---------------------------
         if previously_collected_urls:
-            self.
+            self._url_collector.collected_previously = set(previously_collected_urls)
 
         # Setup the async framework
         n_terms_max = 1 + (
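
The worker changes above route URL cleaning and de-duplication through the new URLCollector. A standalone sketch of the same flow, with invented sample URLs and assuming fraudcrawler 0.4.5 is installed:

from fraudcrawler import URLCollector

collector = URLCollector()
collector.collected_previously = {"https://shop.example/item/42"}  # e.g. URLs loaded from a DB

for raw in [
    "https://shop.example/item/42?utm_source=newsletter",  # duplicate of a previous run
    "https://shop.example/item/7?gclid=abc123",            # new URL, tracking param stripped
]:
    url = collector.remove_tracking_parameters(raw)
    if url in collector.collected_currently or url in collector.collected_previously:
        print("skip duplicate:", url)
    else:
        collector.collected_currently.add(url)
        print("keep:", url)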

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/launch_demo_pipeline.py
@@ -13,7 +13,7 @@ def main():
     client = FraudCrawlerClient()
 
     # Setup the search
-    search_term = "
+    search_term = "Medion Kühlbox MD 37454"
     language = Language(name="German")
     location = Location(name="Switzerland")
     deepness = Deepness(num_results=10)

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/processing/processor.py
@@ -2,10 +2,11 @@ import logging
 
 from openai import AsyncOpenAI
 
-from fraudcrawler.base.base import Prompt
+from fraudcrawler.base.base import Prompt, ClassificationResult
 from fraudcrawler.settings import (
     PROCESSOR_USER_PROMPT_TEMPLATE,
     PROCESSOR_DEFAULT_IF_MISSING,
+    PROCESSOR_EMPTY_TOKEN_COUNT,
 )
 
 
@@ -20,6 +21,7 @@ class Processor:
         api_key: str,
         model: str,
         default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
+        empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
     ):
         """Initializes the Processor.
 
@@ -27,17 +29,22 @@
             api_key: The OpenAI API key.
             model: The OpenAI model to use.
             default_if_missing: The default classification to return if error occurs.
+            empty_token_count: The default value to return as tokensif the classification is empty.
         """
         self._client = AsyncOpenAI(api_key=api_key)
         self._model = model
-        self.
+        self._error_response = ClassificationResult(
+            result=default_if_missing,
+            input_tokens=empty_token_count,
+            output_tokens=empty_token_count,
+        )
 
     async def _call_openai_api(
         self,
         system_prompt: str,
         user_prompt: str,
         **kwargs,
-    ) ->
+    ) -> ClassificationResult:
         """Calls the OpenAI API with the given user prompt."""
         response = await self._client.chat.completions.create(
             model=self._model,
@@ -50,10 +57,24 @@
         content = response.choices[0].message.content
         if not content:
             raise ValueError("Empty response from OpenAI API")
-        return content
 
-
-
+        # Convert the content to an integer
+        content = int(content.strip())
+
+        # For tracking consumption we alre return the tokens used
+        classification = ClassificationResult(
+            result=content,
+            input_tokens=response.usage.prompt_tokens,
+            output_tokens=response.usage.completion_tokens,
+        )
+
+        return classification
+
+    async def classify(
+        self, prompt: Prompt, url: str, product_details: str
+    ) -> ClassificationResult:
+        """A generic classification method that classifies a product based on a prompt object and returns
+        the classification, input tokens, and output tokens.
 
         Args:
             prompt: A dictionary with keys "system_prompt", etc.
@@ -69,7 +90,7 @@
         # If required fields are missing, return the prompt's default fallback if provided.
         if not product_details:
             logger.warning("Missing required product_details for classification.")
-            return self.
+            return self._error_response
 
         # Substitute placeholders in user_prompt with the relevant arguments
         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
@@ -81,22 +102,21 @@
         logger.debug(
             f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
         )
-
+        classification = await self._call_openai_api(
             system_prompt=prompt.system_prompt,
             user_prompt=user_prompt,
             max_tokens=1,
         )
-        classification = int(content.strip())
 
         # Enforce that the classification is in the allowed classes
-        if classification not in prompt.allowed_classes:
+        if classification.result not in prompt.allowed_classes:
             logger.warning(
-                f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
+                f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
             )
-            return self.
+            return self._error_response
 
         logger.info(
-            f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
+            f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
         )
         return classification
 
@@ -104,4 +124,4 @@
             logger.error(
                 f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
             )
-            return self.
+            return self._error_response
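
A usage sketch of the new return type: classify now yields a ClassificationResult whose token counts can be summed for cost tracking. The OpenAI key is a placeholder, and the Prompt fields shown are only the ones visible in this diff; the real model may require more:

import asyncio

from fraudcrawler import Processor
from fraudcrawler.base.base import Prompt

async def main():
    processor = Processor(api_key="sk-...", model="gpt-4o")  # placeholder key
    prompt = Prompt(
        name="relevance",
        system_prompt="Answer 1 if the product is relevant, otherwise 0.",
        allowed_classes=[0, 1],
    )  # field names taken from this diff; other required fields may exist
    outcome = await processor.classify(
        prompt=prompt,
        url="https://shop.example/item/7",
        product_details="Title:\nMedion Kühlbox MD 37454",
    )
    print(outcome.result, outcome.input_tokens + outcome.output_tokens)

asyncio.run(main())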

fraudcrawler-0.4.5/fraudcrawler/scraping/url.py (new file)
@@ -0,0 +1,57 @@
+import logging
+from typing import List, Set, Tuple
+from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
+
+from fraudcrawler.settings import KNOWN_TRACKERS
+
+logger = logging.getLogger(__name__)
+
+
+class URLCollector:
+    """A class to collect and de-duplicate URLs."""
+
+    def __init__(self):
+        self.collected_currently: Set[str] = set()
+        self.collected_previously: Set[str] = set()
+
+    @staticmethod
+    def remove_tracking_parameters(url: str) -> str:
+        """Remove tracking parameters from URLs.
+
+        Args:
+            url: The URL to clean.
+
+        Returns:
+            The cleaned URL without tracking parameters.
+        """
+        logging.debug(f"Removing tracking parameters from URL: {url}")
+
+        # Parse the url
+        parsed_url = urlparse(url)
+
+        # Parse query parameters
+        queries: List[Tuple[str, str]] = parse_qsl(
+            parsed_url.query, keep_blank_values=True
+        )
+        remove_all = url.startswith(
+            "https://www.ebay"
+        )  # eBay URLs have all query parameters as tracking parameters
+        if remove_all:
+            filtered_queries = []
+        else:
+            filtered_queries = [
+                q
+                for q in queries
+                if not any(q[0].startswith(tracker) for tracker in KNOWN_TRACKERS)
+            ]
+
+        # Rebuild the URL without tracking parameters
+        clean_url = ParseResult(
+            scheme=parsed_url.scheme,
+            netloc=parsed_url.netloc,
+            path=parsed_url.path,
+            params=parsed_url.params,
+            query=urlencode(filtered_queries, quote_via=quote),
+            fragment=parsed_url.fragment,
+        )
+        return urlunparse(clean_url)
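
A quick illustration of remove_tracking_parameters with an invented URL (utm_source and gclid are prefixes in KNOWN_TRACKERS, page is not, so only page survives):

from fraudcrawler import URLCollector

dirty = "https://shop.example/item?page=2&utm_source=news&gclid=abc"
print(URLCollector.remove_tracking_parameters(dirty))
# -> https://shop.example/item?page=2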

fraudcrawler-0.4.5/fraudcrawler/settings.py (new file)
@@ -0,0 +1,73 @@
+from pathlib import Path
+from typing import List
+
+# Generic settings
+MAX_RETRIES = 3
+RETRY_DELAY = 2
+ROOT_DIR = Path(__file__).parents[1]
+
+# Serp settings
+GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
+GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+    # ".com",
+]
+
+# URL De-duplication settings
+KNOWN_TRACKERS = [
+    "srsltid",  # Search result click ID (used by some search engines)
+    "utm_source",  # UTM: Source of the traffic (e.g., Google, Newsletter)
+    "utm_medium",  # UTM: Medium such as CPC, email, social
+    "utm_campaign",  # UTM: Campaign name (e.g., summer_sale)
+    "utm_term",  # UTM: Keyword term (used in paid search)
+    "utm_content",  # UTM: Used to differentiate similar links or ads
+    "ar",  # Often used for ad region or targeting info
+    "ps",  # Could refer to promotion source or partner segment
+    "gclid",  # Google Ads click ID (auto-tagging)
+    "gclsrc",  # Source of the GCLID (e.g., ads, search)
+    "sku",  # Product SKU identifier, often used in ecommerce links
+    "ref",  # Referrer username or source (e.g., GitHub ref links)
+    "referral",  # Alternate form of referrer, often human-readable
+    "aff_id",  # Affiliate identifier (ID-based)
+    "aff",  # Short form for affiliate tag
+    "affiliate",  # Affiliate tracking parameter (human-readable)
+    "partner",  # Indicates marketing or distribution partner
+    "fbclid",  # Facebook Click Identifier
+    "msclkid",  # Microsoft/Bing Ads click identifier
+    "twclid",  # Twitter Ads click identifier
+    "variant",  # A/B test variant (used to test versions of pages)
+    "session_id",  # Session tracking ID, should not persist across URLs
+    "track",  # Generic flag used to enable/disable tracking
+    "cid",  # Campaign ID (used in ads or emails)
+    "campaignid",  # Alternate or long-form campaign ID
+    "adgroup",  # Ad group identifier for campaigns
+    "bannerid",  # Specific banner ad ID (for display ad tracking)
+    "token",  # Often used to identify users or temporary sessions
+    "tag",  # Affiliate or marketing tag (used for tracking)
+    "hash",  # Generic hash identifier, often for state or cache
+    "user",  # User ID or identifier passed in URL (should be avoided)
+    "src",  # Generic source indicator, less formal than `utm_source`
+    "selsort",  # Sorting parameter for search results
+    "shid",  # Shop ID (used in ecommerce)
+    "shoparea",  # Shop area (used in ecommerce)
+    "shopid",  # Shop ID (used in ecommerce)
+    "shoparea",  # Shop area (used in ecommerce)
+]
+
+# Enrichment settings
+ENRICHMENT_DEFAULT_LIMIT = 10
+
+# Zyte settings
+ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
+
+# Processor settings
+PROCESSOR_DEFAULT_MODEL = "gpt-4o"
+PROCESSOR_DEFAULT_IF_MISSING = -1
+PROCESSOR_EMPTY_TOKEN_COUNT = -1
+PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
+
+# Async settings
+DEFAULT_N_SERP_WKRS = 10
+DEFAULT_N_ZYTE_WKRS = 10
+DEFAULT_N_PROC_WKRS = 10
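
To see how the two processor templates compose, a small sketch with an invented field name and value:

from fraudcrawler.settings import (
    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
    PROCESSOR_USER_PROMPT_TEMPLATE,
)

details = PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
    field_name="Title", field_value="Medion Kühlbox MD 37454"
)
print(PROCESSOR_USER_PROMPT_TEMPLATE.format(product_details=details))
# Product Details:
# Title:
# Medion Kühlbox MD 37454
#
# Relevance: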

fraudcrawler-0.4.2/fraudcrawler/settings.py (removed)
@@ -1,31 +0,0 @@
-from pathlib import Path
-from typing import List
-
-# Generic settings
-MAX_RETRIES = 3
-RETRY_DELAY = 2
-ROOT_DIR = Path(__file__).parents[1]
-
-# Serp settings
-GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
-GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
-SERP_DEFAULT_COUNTRY_CODES: List[str] = [
-    # ".com",
-]
-
-# Enrichment settings
-ENRICHMENT_DEFAULT_LIMIT = 10
-
-# Zyte settings
-ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
-
-# Processor settings
-PROCESSOR_DEFAULT_MODEL = "gpt-4o"
-PROCESSOR_DEFAULT_IF_MISSING = -1
-PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
-PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
-
-# Async settings
-DEFAULT_N_SERP_WKRS = 10
-DEFAULT_N_ZYTE_WKRS = 10
-DEFAULT_N_PROC_WKRS = 10

Files without changes (9): LICENSE, README.md, fraudcrawler/base/__init__.py, fraudcrawler/base/client.py, fraudcrawler/base/google-languages.json, fraudcrawler/base/google-locations.json, fraudcrawler/processing/__init__.py, fraudcrawler/scraping/__init__.py, fraudcrawler/scraping/enrich.py

{fraudcrawler-0.4.2 → fraudcrawler-0.4.5}/fraudcrawler/scraping/serp.py
@@ -4,10 +4,10 @@ import logging
 from pydantic import BaseModel
 from typing import List
 from urllib.parse import urlparse
+import re
 
 from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY, SERP_DEFAULT_COUNTRY_CODES
 from fraudcrawler.base.base import Host, Language, Location, AsyncClient
-import re
 
 logger = logging.getLogger(__name__)
 
File without changes: fraudcrawler/scraping/zyte.py