fraudcrawler 0.4.3__tar.gz → 0.4.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic. Click here for more details.
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/PKG-INFO +2 -3
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/base.py +11 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/orchestrator.py +7 -1
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/processing/processor.py +34 -14
- fraudcrawler-0.4.5/fraudcrawler/settings.py +73 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/pyproject.toml +1 -1
- fraudcrawler-0.4.3/fraudcrawler/settings.py +0 -43
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/LICENSE +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/README.md +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/__init__.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/client.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/launch_demo_pipeline.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/enrich.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/serp.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/url.py +0 -0
- {fraudcrawler-0.4.3 → fraudcrawler-0.4.5}/fraudcrawler/scraping/zyte.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
2
|
Name: fraudcrawler
|
|
3
|
-
Version: 0.4.3
|
|
3
|
+
Version: 0.4.5
|
|
4
4
|
Summary: Intelligent Market Monitoring
|
|
5
5
|
Home-page: https://github.com/open-veanu/fraudcrawler
|
|
6
6
|
License: MIT
|
|
@@ -11,7 +11,6 @@ Classifier: License :: OSI Approved :: MIT License
|
|
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.11
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
-
Classifier: Programming Language :: Python :: 3.13
|
|
15
14
|
Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
|
|
16
15
|
Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
|
|
17
16
|
Requires-Dist: openai (>=1.68.2,<2.0.0)
|
|
@@ -63,6 +63,14 @@ class Host(BaseModel):
|
|
|
63
63
|
return [cls._normalize_domain(dom.strip()) for dom in val]
|
|
64
64
|
|
|
65
65
|
|
|
66
|
+
class ClassificationResult(BaseModel):
|
|
67
|
+
"""Model for classification results."""
|
|
68
|
+
|
|
69
|
+
result: int
|
|
70
|
+
input_tokens: int
|
|
71
|
+
output_tokens: int
|
|
72
|
+
|
|
73
|
+
|
|
66
74
|
class Location(BaseModel):
|
|
67
75
|
"""Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
|
|
68
76
|
|
|
@@ -137,6 +145,9 @@ class ProductItem(BaseModel):
|
|
|
137
145
|
# Processor parameters are set dynamic so we must allow extra fields
|
|
138
146
|
classifications: Dict[str, int] = Field(default_factory=dict)
|
|
139
147
|
|
|
148
|
+
# Usage parameters
|
|
149
|
+
usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
|
|
150
|
+
|
|
140
151
|
# Filtering parameters
|
|
141
152
|
filtered: bool = False
|
|
142
153
|
filtered_at_stage: str | None = None
|
|
@@ -281,7 +281,13 @@ class Orchestrator(ABC):
|
|
|
281
281
|
url=url,
|
|
282
282
|
product_details=product_details,
|
|
283
283
|
)
|
|
284
|
-
product.classifications[prompt.name] =
|
|
284
|
+
product.classifications[prompt.name] = int(
|
|
285
|
+
classification.result
|
|
286
|
+
)
|
|
287
|
+
product.usage[prompt.name] = {
|
|
288
|
+
"input_tokens": classification.input_tokens,
|
|
289
|
+
"output_tokens": classification.output_tokens,
|
|
290
|
+
}
|
|
285
291
|
except Exception as e:
|
|
286
292
|
logger.warning(f"Error processing product: {e}.")
|
|
287
293
|
|
|
@@ -2,10 +2,11 @@ import logging
|
|
|
2
2
|
|
|
3
3
|
from openai import AsyncOpenAI
|
|
4
4
|
|
|
5
|
-
from fraudcrawler.base.base import Prompt
|
|
5
|
+
from fraudcrawler.base.base import Prompt, ClassificationResult
|
|
6
6
|
from fraudcrawler.settings import (
|
|
7
7
|
PROCESSOR_USER_PROMPT_TEMPLATE,
|
|
8
8
|
PROCESSOR_DEFAULT_IF_MISSING,
|
|
9
|
+
PROCESSOR_EMPTY_TOKEN_COUNT,
|
|
9
10
|
)
|
|
10
11
|
|
|
11
12
|
|
|
@@ -20,6 +21,7 @@ class Processor:
|
|
|
20
21
|
api_key: str,
|
|
21
22
|
model: str,
|
|
22
23
|
default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
|
|
24
|
+
empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
|
|
23
25
|
):
|
|
24
26
|
"""Initializes the Processor.
|
|
25
27
|
|
|
@@ -27,17 +29,22 @@ class Processor:
|
|
|
27
29
|
api_key: The OpenAI API key.
|
|
28
30
|
model: The OpenAI model to use.
|
|
29
31
|
default_if_missing: The default classification to return if error occurs.
|
|
32
|
+
empty_token_count: The default value to return as tokens if the classification is empty.
|
|
30
33
|
"""
|
|
31
34
|
self._client = AsyncOpenAI(api_key=api_key)
|
|
32
35
|
self._model = model
|
|
33
|
-
self.
|
|
36
|
+
self._error_response = ClassificationResult(
|
|
37
|
+
result=default_if_missing,
|
|
38
|
+
input_tokens=empty_token_count,
|
|
39
|
+
output_tokens=empty_token_count,
|
|
40
|
+
)
|
|
34
41
|
|
|
35
42
|
async def _call_openai_api(
|
|
36
43
|
self,
|
|
37
44
|
system_prompt: str,
|
|
38
45
|
user_prompt: str,
|
|
39
46
|
**kwargs,
|
|
40
|
-
) ->
|
|
47
|
+
) -> ClassificationResult:
|
|
41
48
|
"""Calls the OpenAI API with the given user prompt."""
|
|
42
49
|
response = await self._client.chat.completions.create(
|
|
43
50
|
model=self._model,
|
|
@@ -50,10 +57,24 @@ class Processor:
|
|
|
50
57
|
content = response.choices[0].message.content
|
|
51
58
|
if not content:
|
|
52
59
|
raise ValueError("Empty response from OpenAI API")
|
|
53
|
-
return content
|
|
54
60
|
|
|
55
|
-
|
|
56
|
-
|
|
61
|
+
# Convert the content to an integer
|
|
62
|
+
content = int(content.strip())
|
|
63
|
+
|
|
64
|
+
# For tracking consumption we also return the tokens used
|
|
65
|
+
classification = ClassificationResult(
|
|
66
|
+
result=content,
|
|
67
|
+
input_tokens=response.usage.prompt_tokens,
|
|
68
|
+
output_tokens=response.usage.completion_tokens,
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
return classification
|
|
72
|
+
|
|
73
|
+
async def classify(
|
|
74
|
+
self, prompt: Prompt, url: str, product_details: str
|
|
75
|
+
) -> ClassificationResult:
|
|
76
|
+
"""A generic classification method that classifies a product based on a prompt object and returns
|
|
77
|
+
the classification, input tokens, and output tokens.
|
|
57
78
|
|
|
58
79
|
Args:
|
|
59
80
|
prompt: A dictionary with keys "system_prompt", etc.
|
|
@@ -69,7 +90,7 @@ class Processor:
|
|
|
69
90
|
# If required fields are missing, return the prompt's default fallback if provided.
|
|
70
91
|
if not product_details:
|
|
71
92
|
logger.warning("Missing required product_details for classification.")
|
|
72
|
-
return self.
|
|
93
|
+
return self._error_response
|
|
73
94
|
|
|
74
95
|
# Substitute placeholders in user_prompt with the relevant arguments
|
|
75
96
|
user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
|
|
@@ -81,22 +102,21 @@ class Processor:
|
|
|
81
102
|
logger.debug(
|
|
82
103
|
f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
|
|
83
104
|
)
|
|
84
|
-
|
|
105
|
+
classification = await self._call_openai_api(
|
|
85
106
|
system_prompt=prompt.system_prompt,
|
|
86
107
|
user_prompt=user_prompt,
|
|
87
108
|
max_tokens=1,
|
|
88
109
|
)
|
|
89
|
-
classification = int(content.strip())
|
|
90
110
|
|
|
91
111
|
# Enforce that the classification is in the allowed classes
|
|
92
|
-
if classification not in prompt.allowed_classes:
|
|
112
|
+
if classification.result not in prompt.allowed_classes:
|
|
93
113
|
logger.warning(
|
|
94
|
-
f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
|
|
114
|
+
f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
|
|
95
115
|
)
|
|
96
|
-
return self.
|
|
116
|
+
return self._error_response
|
|
97
117
|
|
|
98
118
|
logger.info(
|
|
99
|
-
f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
|
|
119
|
+
f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
|
|
100
120
|
)
|
|
101
121
|
return classification
|
|
102
122
|
|
|
@@ -104,4 +124,4 @@ class Processor:
|
|
|
104
124
|
logger.error(
|
|
105
125
|
f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
|
|
106
126
|
)
|
|
107
|
-
return self.
|
|
127
|
+
return self._error_response
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
from typing import List
|
|
3
|
+
|
|
4
|
+
# Generic settings
|
|
5
|
+
MAX_RETRIES = 3
|
|
6
|
+
RETRY_DELAY = 2
|
|
7
|
+
ROOT_DIR = Path(__file__).parents[1]
|
|
8
|
+
|
|
9
|
+
# Serp settings
|
|
10
|
+
GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
|
|
11
|
+
GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
|
|
12
|
+
SERP_DEFAULT_COUNTRY_CODES: List[str] = [
|
|
13
|
+
# ".com",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
# URL De-duplication settings
|
|
17
|
+
KNOWN_TRACKERS = [
|
|
18
|
+
"srsltid", # Search result click ID (used by some search engines)
|
|
19
|
+
"utm_source", # UTM: Source of the traffic (e.g., Google, Newsletter)
|
|
20
|
+
"utm_medium", # UTM: Medium such as CPC, email, social
|
|
21
|
+
"utm_campaign", # UTM: Campaign name (e.g., summer_sale)
|
|
22
|
+
"utm_term", # UTM: Keyword term (used in paid search)
|
|
23
|
+
"utm_content", # UTM: Used to differentiate similar links or ads
|
|
24
|
+
"ar", # Often used for ad region or targeting info
|
|
25
|
+
"ps", # Could refer to promotion source or partner segment
|
|
26
|
+
"gclid", # Google Ads click ID (auto-tagging)
|
|
27
|
+
"gclsrc", # Source of the GCLID (e.g., ads, search)
|
|
28
|
+
"sku", # Product SKU identifier, often used in ecommerce links
|
|
29
|
+
"ref", # Referrer username or source (e.g., GitHub ref links)
|
|
30
|
+
"referral", # Alternate form of referrer, often human-readable
|
|
31
|
+
"aff_id", # Affiliate identifier (ID-based)
|
|
32
|
+
"aff", # Short form for affiliate tag
|
|
33
|
+
"affiliate", # Affiliate tracking parameter (human-readable)
|
|
34
|
+
"partner", # Indicates marketing or distribution partner
|
|
35
|
+
"fbclid", # Facebook Click Identifier
|
|
36
|
+
"msclkid", # Microsoft/Bing Ads click identifier
|
|
37
|
+
"twclid", # Twitter Ads click identifier
|
|
38
|
+
"variant", # A/B test variant (used to test versions of pages)
|
|
39
|
+
"session_id", # Session tracking ID, should not persist across URLs
|
|
40
|
+
"track", # Generic flag used to enable/disable tracking
|
|
41
|
+
"cid", # Campaign ID (used in ads or emails)
|
|
42
|
+
"campaignid", # Alternate or long-form campaign ID
|
|
43
|
+
"adgroup", # Ad group identifier for campaigns
|
|
44
|
+
"bannerid", # Specific banner ad ID (for display ad tracking)
|
|
45
|
+
"token", # Often used to identify users or temporary sessions
|
|
46
|
+
"tag", # Affiliate or marketing tag (used for tracking)
|
|
47
|
+
"hash", # Generic hash identifier, often for state or cache
|
|
48
|
+
"user", # User ID or identifier passed in URL (should be avoided)
|
|
49
|
+
"src", # Generic source indicator, less formal than `utm_source`
|
|
50
|
+
"selsort", # Sorting parameter for search results
|
|
51
|
+
"shid", # Shop ID (used in ecommerce)
|
|
52
|
+
"shoparea", # Shop area (used in ecommerce)
|
|
53
|
+
"shopid", # Shop ID (used in ecommerce)
|
|
54
|
+
"shoparea", # Shop area (used in ecommerce)
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
# Enrichment settings
|
|
58
|
+
ENRICHMENT_DEFAULT_LIMIT = 10
|
|
59
|
+
|
|
60
|
+
# Zyte settings
|
|
61
|
+
ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
|
|
62
|
+
|
|
63
|
+
# Processor settings
|
|
64
|
+
PROCESSOR_DEFAULT_MODEL = "gpt-4o"
|
|
65
|
+
PROCESSOR_DEFAULT_IF_MISSING = -1
|
|
66
|
+
PROCESSOR_EMPTY_TOKEN_COUNT = -1
|
|
67
|
+
PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
|
|
68
|
+
PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
|
|
69
|
+
|
|
70
|
+
# Async settings
|
|
71
|
+
DEFAULT_N_SERP_WKRS = 10
|
|
72
|
+
DEFAULT_N_ZYTE_WKRS = 10
|
|
73
|
+
DEFAULT_N_PROC_WKRS = 10
|
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
from typing import List
|
|
3
|
-
|
|
4
|
-
# Generic settings
|
|
5
|
-
MAX_RETRIES = 3
|
|
6
|
-
RETRY_DELAY = 2
|
|
7
|
-
ROOT_DIR = Path(__file__).parents[1]
|
|
8
|
-
|
|
9
|
-
# Serp settings
|
|
10
|
-
GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
|
|
11
|
-
GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
|
|
12
|
-
SERP_DEFAULT_COUNTRY_CODES: List[str] = [
|
|
13
|
-
# ".com",
|
|
14
|
-
]
|
|
15
|
-
|
|
16
|
-
# URL De-duplication settings
|
|
17
|
-
KNOWN_TRACKERS = [
|
|
18
|
-
"srsltid",
|
|
19
|
-
"utm_source",
|
|
20
|
-
"utm_medium",
|
|
21
|
-
"utm_campaign",
|
|
22
|
-
"utm_term",
|
|
23
|
-
"utm_content",
|
|
24
|
-
"ar",
|
|
25
|
-
"ps",
|
|
26
|
-
]
|
|
27
|
-
|
|
28
|
-
# Enrichment settings
|
|
29
|
-
ENRICHMENT_DEFAULT_LIMIT = 10
|
|
30
|
-
|
|
31
|
-
# Zyte settings
|
|
32
|
-
ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
|
|
33
|
-
|
|
34
|
-
# Processor settings
|
|
35
|
-
PROCESSOR_DEFAULT_MODEL = "gpt-4o"
|
|
36
|
-
PROCESSOR_DEFAULT_IF_MISSING = -1
|
|
37
|
-
PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
|
|
38
|
-
PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
|
|
39
|
-
|
|
40
|
-
# Async settings
|
|
41
|
-
DEFAULT_N_SERP_WKRS = 10
|
|
42
|
-
DEFAULT_N_ZYTE_WKRS = 10
|
|
43
|
-
DEFAULT_N_PROC_WKRS = 10
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|