fraudcrawler 0.4.0__tar.gz → 0.4.2__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
Note: this release of fraudcrawler has been flagged as potentially problematic; details are available on the registry page.
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/PKG-INFO +2 -2
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/README.md +0 -1
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/__init__.py +2 -1
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/base.py +41 -2
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/client.py +10 -2
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/orchestrator.py +27 -36
- fraudcrawler-0.4.2/fraudcrawler/launch_demo_pipeline.py +101 -0
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/processing/processor.py +11 -19
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/scraping/serp.py +3 -1
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/scraping/zyte.py +24 -1
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/settings.py +2 -3
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/pyproject.toml +2 -1
- fraudcrawler-0.4.0/fraudcrawler/launch_demo_pipeline.py +0 -100
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/LICENSE +0 -0
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/scraping/enrich.py +0 -0
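The headline change in this release: Prompt drops its context field and gains product_item_fields, and ProductItem moves from base/orchestrator.py into base/base.py with new html/html_clean fields. A minimal sketch of the new call shape (assuming fraudcrawler 0.4.2 is installed; the prompt text here is illustrative, not from the package):

    from fraudcrawler import Prompt

    prompt = Prompt(
        name="availability",
        system_prompt="Classify the product as available (1) or not (0). Respond only with 1 or 0.",
        product_item_fields=["product_name", "html_clean"],  # must be valid ProductItem field names
        allowed_classes=[0, 1],
    )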
{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.4.0
+Version: 0.4.2
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
+Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
@@ -80,7 +81,6 @@ deepness = Deepness(num_results=50)
 prompts = [
     Prompt(
         name="relevance",
-        context="This organization is interested in medical products and drugs.",
         system_prompt=(
             "You are a helpful and intelligent assistant. Your task is to classify any given product "
             "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/README.md

@@ -58,7 +58,6 @@ deepness = Deepness(num_results=50)
 prompts = [
     Prompt(
         name="relevance",
-        context="This organization is interested in medical products and drugs.",
         system_prompt=(
             "You are a helpful and intelligent assistant. Your task is to classify any given product "
             "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/__init__.py

@@ -2,7 +2,7 @@ from fraudcrawler.scraping.serp import SerpApi, SearchEngine
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.zyte import ZyteApi
 from fraudcrawler.processing.processor import Processor
-from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
 from fraudcrawler.base.base import (
     Deepness,
@@ -11,6 +11,7 @@ from fraudcrawler.base.base import (
     Language,
     Location,
     Prompt,
+    ProductItem,
 )

 __all__ = [
{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/base.py

@@ -2,12 +2,13 @@ import json
 import logging
 from pydantic import (
     BaseModel,
+    Field,
     field_validator,
     model_validator,
 )
 from pydantic_settings import BaseSettings
 import re
-from typing import List
+from typing import List, Dict

 import aiohttp

@@ -114,12 +115,39 @@ class Deepness(BaseModel):
     enrichment: Enrichment | None = None


+class ProductItem(BaseModel):
+    """Model representing a product item."""
+
+    # Serp/Enrich parameters
+    search_term: str
+    search_term_type: str
+    url: str
+    marketplace_name: str
+    domain: str
+
+    # Zyte parameters
+    product_name: str | None = None
+    product_price: str | None = None
+    product_description: str | None = None
+    product_images: List[str] | None = None
+    probability: float | None = None
+    html: str | None = None
+    html_clean: str | None = None
+
+    # Processor parameters are set dynamic so we must allow extra fields
+    classifications: Dict[str, int] = Field(default_factory=dict)
+
+    # Filtering parameters
+    filtered: bool = False
+    filtered_at_stage: str | None = None
+
+
 class Prompt(BaseModel):
     """Model for prompts."""

     name: str
-    context: str
     system_prompt: str
+    product_item_fields: List[str]
     allowed_classes: List[int]

     @field_validator("allowed_classes", mode="before")
@@ -129,6 +157,17 @@ class Prompt(BaseModel):
             raise ValueError("all values in allowed_classes must be positive integers.")
         return val

+    @field_validator("product_item_fields", mode="before")
+    def validate_product_item_fields(cls, val):
+        """Ensure all product_item_fields are valid ProductItem attributes."""
+        valid_fields = set(ProductItem.model_fields.keys())
+        for field in val:
+            if field not in valid_fields:
+                raise ValueError(
+                    f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
+                )
+        return val
+

 class AsyncClient:
     """Base class for sub-classes using async HTTP requests."""
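The new validator means an invalid field name fails fast at Prompt construction time. A simplified, self-contained re-creation of that check (standalone pydantic models that mirror the ones above, not the package's own classes):

    from typing import Dict, List
    from pydantic import BaseModel, Field, field_validator


    class ProductItem(BaseModel):
        url: str
        product_name: str | None = None
        html_clean: str | None = None
        classifications: Dict[str, int] = Field(default_factory=dict)


    class Prompt(BaseModel):
        name: str
        system_prompt: str
        product_item_fields: List[str]
        allowed_classes: List[int]

        @field_validator("product_item_fields", mode="before")
        def validate_product_item_fields(cls, val):
            # Reject any name that is not a declared ProductItem field
            valid_fields = set(ProductItem.model_fields.keys())
            for field in val:
                if field not in valid_fields:
                    raise ValueError(
                        f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
                    )
            return val


    Prompt(name="ok", system_prompt="...", product_item_fields=["product_name"], allowed_classes=[0, 1])
    # product_item_fields=["price"] would raise a ValidationError here,
    # since "price" is not a ProductItem field (the actual field is product_price).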
{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/client.py

@@ -9,8 +9,16 @@ from typing import List
 import pandas as pd

 from fraudcrawler.settings import ROOT_DIR
-from fraudcrawler.base.base import
-
+from fraudcrawler.base.base import (
+    Setup,
+    Language,
+    Location,
+    Deepness,
+    Host,
+    Prompt,
+    ProductItem,
+)
+from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.scraping.serp import SearchEngine

 logger = logging.getLogger(__name__)
{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/base/orchestrator.py

@@ -1,12 +1,13 @@
 from abc import ABC, abstractmethod
 import asyncio
 import logging
-from pydantic import BaseModel, Field
 from typing import Dict, List, Set, cast
+from bs4 import BeautifulSoup

 from fraudcrawler.settings import (
     PROCESSOR_DEFAULT_MODEL,
     PROCESSOR_DEFAULT_IF_MISSING,
+    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
     MAX_RETRIES,
     RETRY_DELAY,
 )
@@ -15,37 +16,19 @@ from fraudcrawler.settings import (
     DEFAULT_N_ZYTE_WKRS,
     DEFAULT_N_PROC_WKRS,
 )
-from fraudcrawler.base.base import
+from fraudcrawler.base.base import (
+    Deepness,
+    Host,
+    Language,
+    Location,
+    Prompt,
+    ProductItem,
+)
 from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor

 logger = logging.getLogger(__name__)


-class ProductItem(BaseModel):
-    """Model representing a product item."""
-
-    # Serp/Enrich parameters
-    search_term: str
-    search_term_type: str
-    url: str
-    marketplace_name: str
-    domain: str
-
-    # Zyte parameters
-    product_name: str | None = None
-    product_price: str | None = None
-    product_description: str | None = None
-    product_images: List[str] | None = None
-    probability: float | None = None
-
-    # Processor parameters are set dynamic so we must allow extra fields
-    classifications: Dict[str, int] = Field(default_factory=dict)
-
-    # Filtering parameters
-    filtered: bool = False
-    filtered_at_stage: str | None = None
-
-
 class Orchestrator(ABC):
     """Abstract base class for orchestrating the different actors (crawling, processing).

@@ -231,15 +214,16 @@
                     product.probability = self._zyteapi.extract_probability(
                         details=details
                     )
-
+                    product.html = self._zyteapi.extract_html(details=details)
+                    if product.html:
+                        soup = BeautifulSoup(product.html, "html.parser")
+                        product.html_clean = soup.get_text(separator=" ", strip=True)
                     # Filter the product based on the probability threshold
                     if not self._zyteapi.keep_product(details=details):
                         product.filtered = True
                         product.filtered_at_stage = "Zyte probability threshold"
-
             except Exception as e:
                 logger.warning(f"Error executing Zyte API search: {e}.")
-
             await queue_out.put(product)
             queue_in.task_done()

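The effect of the new html_clean step, in isolation (a small sketch with stand-in HTML; requires beautifulsoup4, which this release adds as a dependency):

    from bs4 import BeautifulSoup

    # Stand-in for the HTML that ZyteApi.extract_html() returns.
    html = "<html><body><h1>Kühlschrank XL</h1><p>CHF 499.-</p><p>Out of stock</p></body></html>"

    soup = BeautifulSoup(html, "html.parser")
    html_clean = soup.get_text(separator=" ", strip=True)
    print(html_clean)  # -> "Kühlschrank XL CHF 499.- Out of stock"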
@@ -269,19 +253,26 @@
             if not product.filtered:
                 try:
                     url = product.url
-                    name = product.product_name
-                    description = product.product_description
-
                     # Run all the configured prompts
                     for prompt in prompts:
+                        # Dynamically build product_details string
+                        details = []
+                        for field in prompt.product_item_fields:
+                            value = getattr(product, field, None)
+                            if value is not None:
+                                details.append(
+                                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
+                                        field_name=field, field_value=value
+                                    )
+                                )
+                        product_details = "\n\n".join(details)
                         logger.debug(
-                            f"Classify product {
+                            f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
                         )
                         classification = await self._processor.classify(
                             prompt=prompt,
                             url=url,
-
-                            description=description,
+                            product_details=product_details,
                         )
                         product.classifications[prompt.name] = classification
             except Exception as e:
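Standalone, the details-building loop above reduces to the following (a sketch using the same template string that settings.py now defines, with a hypothetical dict standing in for a ProductItem):

    PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"

    # Hypothetical stand-ins for a ProductItem and a Prompt's product_item_fields.
    product = {
        "product_name": "Kühlschrank XL",
        "product_price": None,
        "html_clean": "Kühlschrank XL CHF 499.- Out of stock",
    }
    product_item_fields = ["product_name", "product_price", "html_clean"]

    details = []
    for field in product_item_fields:
        value = product.get(field)  # the orchestrator uses getattr(product, field, None)
        if value is not None:
            details.append(PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(field_name=field, field_value=value))
    product_details = "\n\n".join(details)
    # None-valued fields (product_price here) are skipped rather than rendered as "None".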
fraudcrawler-0.4.2/fraudcrawler/launch_demo_pipeline.py (added)

@@ -0,0 +1,101 @@
+import logging
+
+from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
+
+LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
+LOG_LVL = "INFO"
+DATE_FMT = "%Y-%m-%d %H:%M:%S"
+logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
+
+
+def main():
+    # Setup the client
+    client = FraudCrawlerClient()
+
+    # Setup the search
+    search_term = "Kühlschrank"
+    language = Language(name="German")
+    location = Location(name="Switzerland")
+    deepness = Deepness(num_results=10)
+    prompts = [
+        Prompt(
+            name="availability",
+            system_prompt=(
+                "You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products."
+                "Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
+                "You must consider all aspects of the given context and make a binary decision accordingly. "
+                "If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
+                "if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
+                "Respond only with the number 1 or 0."
+            ),
+            product_item_fields=["product_name", "html_clean"],
+            allowed_classes=[0, 1],
+        ),
+        # Prompt(
+        #     name="seriousness",
+        #     system_prompt=(
+        #         "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
+        #         "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
+        #         "  1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
+        #         "within an online shop or marketplace.\n"
+        #         "  2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
+        #         "   - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
+        #         "exact product itself, classify as 0.\n"
+        #         "   - Advertisements: Promotional content that doesn't directly sell a product.\n"
+        #         "   - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
+        #         "   - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
+        #         "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
+        #     ),
+        #     product_item_fields=["product_name", "product_description"],
+        #     allowed_classes=[0, 1],
+        # ),
+    ]
+    # # Optional: Add tern ENRICHEMENT
+    # from fraudcrawler import Enrichment
+
+    # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
+
+    # # Optional: Add MARKETPLACES and EXCLUDED_URLS
+    # from fraudcrawler import Host
+
+    # marketplaces = [
+    #     Host(name="International", domains="zavamed.com,apomeds.com"),
+    #     Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
+    # ]
+    # excluded_urls = [
+    #     Host(name="Digitec", domains="digitec.ch"),
+    #     Host(name="Brack", domains="brack.ch"),
+    # ]
+
+    # Execute the pipeline
+    client.execute(
+        search_term=search_term,
+        language=language,
+        location=location,
+        deepness=deepness,
+        prompts=prompts,
+        # marketplaces=marketplaces,
+        # excluded_urls=excluded_urls,
+    )
+
+    # Show results
+    print()
+    title = "Available results"
+    print(title)
+    print("=" * len(title))
+    client.print_available_results()
+    print()
+    title = f'Results for "{search_term.upper()}"'
+    print(title)
+    print("=" * len(title))
+    df = client.load_results()
+    print(f"Number of products found: {len(df)}")
+    print()
+    n_head = 10
+    print(f"First {n_head} products are:")
+    print(df.head(n=n_head))
+    print()
+
+
+if __name__ == "__main__":
+    main()
{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/processing/processor.py

@@ -52,42 +52,34 @@ class Processor:
             raise ValueError("Empty response from OpenAI API")
         return content

-    async def classify(
-
-    ) -> int:
-        """A generic classification method that classified a product based on a prompt object.
+    async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
+        """A generic classification method that classifies a product based on a prompt object.

         Args:
-            prompt: A dictionary with keys "system_prompt",
+            prompt: A dictionary with keys "system_prompt", etc.
             url: Product URL (often used in the user_prompt).
-
-            description: Product description (often used in the user_prompt).
+            product_details: String with product details, formatted per prompt.product_item_fields.

         Note:
             This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
-            -
+            - product_details is empty
             - an error occurs during the API call
             - if the response isn't in allowed_classes.
         """
         # If required fields are missing, return the prompt's default fallback if provided.
-        if
-            logger.warning(
-                f"Missing required fields for classification: name='{name}', description='{description}'"
-            )
+        if not product_details:
+            logger.warning("Missing required product_details for classification.")
             return self._default_if_missing

         # Substitute placeholders in user_prompt with the relevant arguments
         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
-
-            url=url,
-            name=name,
-            description=description,
+            product_details=product_details,
         )

         # Call the OpenAI API
         try:
             logger.debug(
-                f'Calling OpenAI API for classification (
+                f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
             )
             content = await self._call_openai_api(
                 system_prompt=prompt.system_prompt,
@@ -104,12 +96,12 @@
                 return self._default_if_missing

             logger.info(
-                f'Classification for "{
+                f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
             )
             return classification

         except Exception as e:
             logger.error(
-                f'Error classifying product "{
+                f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
             )
             return self._default_if_missing
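Putting the two templates together, the user prompt that classify() now sends is assembled like this (a sketch; the template strings match the new settings.py values shown further below):

    PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"

    product_details = "product_name:\nKühlschrank XL\n\nhtml_clean:\nKühlschrank XL CHF 499.- Out of stock"
    user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(product_details=product_details)
    print(user_prompt)
    # Product Details:
    # product_name:
    # Kühlschrank XL
    #
    # html_clean:
    # Kühlschrank XL CHF 499.- Out of stock
    #
    # Relevance: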
{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/scraping/serp.py

@@ -14,6 +14,7 @@ logger = logging.getLogger(__name__)

 class SerpResult(BaseModel):
     """Model for a single search result from SerpApi."""
+
     url: str
     domain: str
     marketplace_name: str

@@ -23,6 +24,7 @@ class SerpResult(BaseModel):

 class SearchEngine(Enum):
     """Enum for the supported search engines."""
+
     GOOGLE = "google"
     GOOGLE_SHOPPING = "google_shopping"

@@ -33,7 +35,7 @@ class SerpApi(AsyncClient):
     _endpoint = "https://serpapi.com/search"
     _engine_marketplace_names = {
         SearchEngine.GOOGLE.value: "Google",
-        SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping"
+        SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping",
     }
     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/scraping/zyte.py

@@ -1,6 +1,7 @@
 import asyncio
 import logging
 from typing import List
+from base64 import b64decode

 import aiohttp

@@ -68,7 +69,8 @@ class ZyteApi(AsyncClient):
             "metadata": {
                 "probability": float,
             },
-        }
+        },
+        "httpResponseBody": base64
     }
     """
     logger.info(f"Fetching product details by Zyte for URL {url}.")

@@ -192,3 +194,24 @@
         }
         """
         return float(details.get("product", {}).get("metadata", {}).get("probability"))
+
+    @staticmethod
+    def extract_html(details: dict) -> str | None:
+        """Extracts the HTML from the Zyte API response.
+
+        The input argument is a dictionary of the following structure:
+        {
+            "httpResponseBody": base64
+        }
+        """
+
+        # Get the Base64-encoded content
+        encoded = details.get("httpResponseBody")
+
+        # Decode it into bytes
+        if isinstance(encoded, str):
+            decoded_bytes = b64decode(encoded)
+
+            # Convert bytes to string (assuming UTF-8 encoding)
+            decoded_string = decoded_bytes.decode("utf-8")
+            return decoded_string

{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/fraudcrawler/settings.py

@@ -22,9 +22,8 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
 # Processor settings
 PROCESSOR_DEFAULT_MODEL = "gpt-4o"
 PROCESSOR_DEFAULT_IF_MISSING = -1
-PROCESSOR_USER_PROMPT_TEMPLATE =
-
-)
+PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"

 # Async settings
 DEFAULT_N_SERP_WKRS = 10

{fraudcrawler-0.4.0 → fraudcrawler-0.4.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "fraudcrawler"
-version = "0.4.0"
+version = "0.4.2"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",

@@ -25,6 +25,7 @@ pandas = "^2.2.3"
 aiohttp = "^3.11.14"
 pydantic-settings = "^2.8.1"
 openai = "^1.68.2"
+beautifulsoup4 = "^4.13.4"

 [tool.poetry.group.dev.dependencies]
 pytest-cov = "^6.0.0"

fraudcrawler-0.4.0/fraudcrawler/launch_demo_pipeline.py (removed)

@@ -1,100 +0,0 @@
-import logging
-
-from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
-
-LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
-LOG_LVL = "INFO"
-DATE_FMT = "%Y-%m-%d %H:%M:%S"
-logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
-
-
-def main():
-    # Setup the client
-    client = FraudCrawlerClient()
-
-    # Setup the search
-    search_term = "Kühlschrank"
-    language = Language(name="German")
-    location = Location(name="Switzerland")
-    deepness = Deepness(num_results=20)
-    prompts = [
-        Prompt(
-            name="relevance",
-            context="This organization is interested in checking the energy efficiency of certain devices.",
-            system_prompt=(
-                "You are a helpful and intelligent assistant. Your task is to classify any given product "
-                "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
-                "You must consider all aspects of the given context and make a binary decision accordingly. "
-                "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
-                "Respond only with the number 1 or 0."
-            ),
-            allowed_classes=[0, 1],
-        ),
-        Prompt(
-            name="seriousness",
-            context="This organization is interested in checking the energy efficiency of certain devices.",
-            system_prompt=(
-                "You are an intelligent and discerning assistant. Your task is to classify each item as either "
-                "a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
-                "  1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
-                "within an online shop or marketplace.\n"
-                "  2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
-                "   - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
-                "exact product itself, classify as 0.\n"
-                "   - Advertisements: Promotional content that doesn't directly sell a product.\n"
-                "   - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
-                "   - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
-                "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
-            ),
-            allowed_classes=[0, 1],
-        ),
-    ]
-    # # Optional: Add tern ENRICHEMENT
-    # from fraudcrawler import Enrichment
-
-    # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
-
-    # # Optional: Add MARKETPLACES and EXCLUDED_URLS
-    # from fraudcrawler import Host
-
-    # marketplaces = [
-    #     Host(name="International", domains="zavamed.com,apomeds.com"),
-    #     Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
-    # ]
-    # excluded_urls = [
-    #     Host(name="Digitec", domains="digitec.ch"),
-    #     Host(name="Brack", domains="brack.ch"),
-    # ]
-
-    # Execute the pipeline
-    client.execute(
-        search_term=search_term,
-        language=language,
-        location=location,
-        deepness=deepness,
-        prompts=prompts,
-        # marketplaces=marketplaces,
-        # excluded_urls=excluded_urls,
-    )
-
-    # Show results
-    print()
-    title = "Available results"
-    print(title)
-    print("=" * len(title))
-    client.print_available_results()
-    print()
-    title = f'Results for "{search_term.upper()}"'
-    print(title)
-    print("=" * len(title))
-    df = client.load_results()
-    print(f"Number of products found: {len(df)}")
-    print()
-    n_head = 10
-    print(f"First {n_head} products are:")
-    print(df.head(n=n_head))
-    print()
-
-
-if __name__ == "__main__":
-    main()

The remaining files are unchanged between 0.4.0 and 0.4.2: LICENSE, fraudcrawler/base/__init__.py, google-languages.json, google-locations.json, fraudcrawler/processing/__init__.py, fraudcrawler/scraping/__init__.py, and fraudcrawler/scraping/enrich.py.
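To close, a small sketch of what the new ZyteApi.extract_html() shown above does with the base64-encoded httpResponseBody (self-contained; the payload here is illustrative, not a real Zyte response):

    from base64 import b64decode, b64encode

    # Illustrative stand-in for a Zyte API response.
    details = {"httpResponseBody": b64encode("<html><body>Kühlschrank</body></html>".encode("utf-8")).decode("ascii")}

    encoded = details.get("httpResponseBody")
    if isinstance(encoded, str):
        html = b64decode(encoded).decode("utf-8")
        print(html)  # -> "<html><body>Kühlschrank</body></html>"
    # If httpResponseBody is absent, the method falls through and returns None.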