fraudcrawler 0.3.10__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic.
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/PKG-INFO +2 -2
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/README.md +0 -1
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/__init__.py +4 -2
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/base.py +41 -2
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/client.py +22 -2
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/orchestrator.py +36 -37
- fraudcrawler-0.4.2/fraudcrawler/launch_demo_pipeline.py +101 -0
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/processing/processor.py +11 -19
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/scraping/serp.py +197 -30
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/scraping/zyte.py +24 -1
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/settings.py +5 -5
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/pyproject.toml +2 -1
- fraudcrawler-0.3.10/fraudcrawler/launch_demo_pipeline.py +0 -100
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/LICENSE +0 -0
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/__init__.py +0 -0
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/google-languages.json +0 -0
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/google-locations.json +0 -0
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/processing/__init__.py +0 -0
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/scraping/__init__.py +0 -0
- {fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/scraping/enrich.py +0 -0
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.3.10
+Version: 0.4.2
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -13,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
+Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
@@ -80,7 +81,6 @@ deepness = Deepness(num_results=50)
 prompts = [
     Prompt(
         name="relevance",
-        context="This organization is interested in medical products and drugs.",
         system_prompt=(
             "You are a helpful and intelligent assistant. Your task is to classify any given product "
             "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/README.md

@@ -58,7 +58,6 @@ deepness = Deepness(num_results=50)
 prompts = [
     Prompt(
         name="relevance",
-        context="This organization is interested in medical products and drugs.",
         system_prompt=(
             "You are a helpful and intelligent assistant. Your task is to classify any given product "
             "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/__init__.py

@@ -1,8 +1,8 @@
-from fraudcrawler.scraping.serp import SerpApi
+from fraudcrawler.scraping.serp import SerpApi, SearchEngine
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.zyte import ZyteApi
 from fraudcrawler.processing.processor import Processor
-from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.base.orchestrator import Orchestrator
 from fraudcrawler.base.client import FraudCrawlerClient
 from fraudcrawler.base.base import (
     Deepness,
@@ -11,10 +11,12 @@ from fraudcrawler.base.base import (
     Language,
     Location,
     Prompt,
+    ProductItem,
 )

 __all__ = [
     "SerpApi",
+    "SearchEngine",
     "Enricher",
     "ZyteApi",
     "Processor",
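With SearchEngine and ProductItem now exported from the package root, engine selection becomes part of the public API. A minimal usage sketch, not taken from the diff itself: argument values mirror the demo script further down, and valid SerpAPI/Zyte/OpenAI credentials are assumed to be configured.

    # Hypothetical usage sketch for fraudcrawler 0.4.2
    from fraudcrawler import (
        FraudCrawlerClient,
        SearchEngine,
        Language,
        Location,
        Deepness,
        Prompt,
    )

    client = FraudCrawlerClient()
    client.execute(
        search_term="Kühlschrank",
        language=Language(name="German"),
        location=Location(name="Switzerland"),
        deepness=Deepness(num_results=10),
        prompts=[
            Prompt(
                name="availability",
                system_prompt="Respond only with the number 1 or 0.",
                product_item_fields=["product_name", "html_clean"],
                allowed_classes=[0, 1],
            )
        ],
        # Enum members or plain strings are both accepted (see client.py below).
        search_engines=[SearchEngine.GOOGLE, "google_shopping"],
    )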
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/base.py

@@ -2,12 +2,13 @@ import json
 import logging
 from pydantic import (
     BaseModel,
+    Field,
     field_validator,
     model_validator,
 )
 from pydantic_settings import BaseSettings
 import re
-from typing import List
+from typing import List, Dict

 import aiohttp

@@ -114,12 +115,39 @@ class Deepness(BaseModel):
     enrichment: Enrichment | None = None


+class ProductItem(BaseModel):
+    """Model representing a product item."""
+
+    # Serp/Enrich parameters
+    search_term: str
+    search_term_type: str
+    url: str
+    marketplace_name: str
+    domain: str
+
+    # Zyte parameters
+    product_name: str | None = None
+    product_price: str | None = None
+    product_description: str | None = None
+    product_images: List[str] | None = None
+    probability: float | None = None
+    html: str | None = None
+    html_clean: str | None = None
+
+    # Processor parameters are set dynamic so we must allow extra fields
+    classifications: Dict[str, int] = Field(default_factory=dict)
+
+    # Filtering parameters
+    filtered: bool = False
+    filtered_at_stage: str | None = None
+
+
 class Prompt(BaseModel):
     """Model for prompts."""

     name: str
-    context: str
     system_prompt: str
+    product_item_fields: List[str]
     allowed_classes: List[int]

     @field_validator("allowed_classes", mode="before")
@@ -129,6 +157,17 @@ class Prompt(BaseModel):
             raise ValueError("all values in allowed_classes must be positive integers.")
         return val

+    @field_validator("product_item_fields", mode="before")
+    def validate_product_item_fields(cls, val):
+        """Ensure all product_item_fields are valid ProductItem attributes."""
+        valid_fields = set(ProductItem.model_fields.keys())
+        for field in val:
+            if field not in valid_fields:
+                raise ValueError(
+                    f"Invalid product_item_field: '{field}'. Must be one of: {sorted(valid_fields)}"
+                )
+        return val
+

 class AsyncClient:
     """Base class for sub-classes using async HTTP requests."""
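Prompt no longer carries a free-text context; instead, product_item_fields names which ProductItem attributes get injected into the classification prompt, and the new validator rejects anything that is not a ProductItem field. A small sketch of that behaviour with made-up values (with pydantic v2 the ValueError raised inside the validator surfaces as a ValidationError, which is a ValueError subclass):

    from fraudcrawler import Prompt

    ok = Prompt(
        name="availability",
        system_prompt="Respond only with the number 1 or 0.",
        product_item_fields=["product_name", "html_clean"],  # both exist on ProductItem
        allowed_classes=[0, 1],
    )

    try:
        Prompt(
            name="broken",
            system_prompt="...",
            product_item_fields=["shipping_weight"],  # not a ProductItem attribute
            allowed_classes=[0, 1],
        )
    except ValueError as exc:
        # The error message lists the valid field names, e.g.
        # "Invalid product_item_field: 'shipping_weight'. Must be one of: [...]"
        print(exc)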
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/client.py

@@ -9,8 +9,17 @@ from typing import List
 import pandas as pd

 from fraudcrawler.settings import ROOT_DIR
-from fraudcrawler.base.base import
-
+from fraudcrawler.base.base import (
+    Setup,
+    Language,
+    Location,
+    Deepness,
+    Host,
+    Prompt,
+    ProductItem,
+)
+from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.scraping.serp import SearchEngine

 logger = logging.getLogger(__name__)

@@ -84,6 +93,7 @@ class FraudCrawlerClient(Orchestrator):
         prompts: List[Prompt],
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
+        search_engines: List[SearchEngine | str] | None = None,
     ) -> None:
         """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -96,6 +106,7 @@
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
         """
+        # Handle results files
         timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
         filename = self._results_dir / self._filename_template.format(
             search_term=search_term,
@@ -105,9 +116,18 @@
         )
         self._results.append(Results(search_term=search_term, filename=filename))

+        # Normalize inputs
+        nrm_se: List[SearchEngine] = list(SearchEngine)
+        if search_engines:
+            nrm_se = [
+                SearchEngine(se) if isinstance(se, str) else se for se in search_engines
+            ]
+
+        # Run the pipeline by calling the orchestrator's run method
         asyncio.run(
             super().run(
                 search_term=search_term,
+                search_engines=nrm_se,
                 language=language,
                 location=location,
                 deepness=deepness,
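The normalization step above means callers can pass SearchEngine members, plain strings, or nothing at all (which selects every engine). Re-stated in isolation; the helper name normalize_engines is hypothetical, the logic mirrors the added lines:

    from typing import List
    from fraudcrawler import SearchEngine

    def normalize_engines(search_engines=None) -> List[SearchEngine]:
        engines: List[SearchEngine] = list(SearchEngine)  # default: all engines
        if search_engines:
            engines = [
                SearchEngine(se) if isinstance(se, str) else se for se in search_engines
            ]
        return engines

    print(normalize_engines())                     # both GOOGLE and GOOGLE_SHOPPING
    print(normalize_engines(["google_shopping"]))  # only GOOGLE_SHOPPING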
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/base/orchestrator.py

@@ -1,12 +1,13 @@
 from abc import ABC, abstractmethod
 import asyncio
 import logging
-from pydantic import BaseModel, Field
 from typing import Dict, List, Set, cast
+from bs4 import BeautifulSoup

 from fraudcrawler.settings import (
     PROCESSOR_DEFAULT_MODEL,
     PROCESSOR_DEFAULT_IF_MISSING,
+    PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
     MAX_RETRIES,
     RETRY_DELAY,
 )
@@ -15,37 +16,19 @@ from fraudcrawler.settings import (
     DEFAULT_N_ZYTE_WKRS,
     DEFAULT_N_PROC_WKRS,
 )
-from fraudcrawler.base.base import
-
+from fraudcrawler.base.base import (
+    Deepness,
+    Host,
+    Language,
+    Location,
+    Prompt,
+    ProductItem,
+)
+from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor

 logger = logging.getLogger(__name__)


-class ProductItem(BaseModel):
-    """Model representing a product item."""
-
-    # Serp/Enrich parameters
-    search_term: str
-    search_term_type: str
-    url: str
-    marketplace_name: str
-    domain: str
-
-    # Zyte parameters
-    product_name: str | None = None
-    product_price: str | None = None
-    product_description: str | None = None
-    product_images: List[str] | None = None
-    probability: float | None = None
-
-    # Processor parameters are set dynamic so we must allow extra fields
-    classifications: Dict[str, int] = Field(default_factory=dict)
-
-    # Filtering parameters
-    filtered: bool = False
-    filtered_at_stage: str | None = None
-
-
 class Orchestrator(ABC):
     """Abstract base class for orchestrating the different actors (crawling, processing).

@@ -231,15 +214,16 @@
                     product.probability = self._zyteapi.extract_probability(
                         details=details
                     )
-
+                    product.html = self._zyteapi.extract_html(details=details)
+                    if product.html:
+                        soup = BeautifulSoup(product.html, "html.parser")
+                        product.html_clean = soup.get_text(separator=" ", strip=True)
                     # Filter the product based on the probability threshold
                     if not self._zyteapi.keep_product(details=details):
                         product.filtered = True
                         product.filtered_at_stage = "Zyte probability threshold"
-
             except Exception as e:
                 logger.warning(f"Error executing Zyte API search: {e}.")
-
             await queue_out.put(product)
             queue_in.task_done()

@@ -269,19 +253,26 @@
             if not product.filtered:
                 try:
                     url = product.url
-                    name = product.product_name
-                    description = product.product_description
-
                     # Run all the configured prompts
                     for prompt in prompts:
+                        # Dynamically build product_details string
+                        details = []
+                        for field in prompt.product_item_fields:
+                            value = getattr(product, field, None)
+                            if value is not None:
+                                details.append(
+                                    PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
+                                        field_name=field, field_value=value
+                                    )
+                                )
+                        product_details = "\n\n".join(details)
                         logger.debug(
-                            f"Classify product {
+                            f"Classify product at {url} with prompt {prompt.name} and details: {product_details}"
                         )
                         classification = await self._processor.classify(
                             prompt=prompt,
                             url=url,
-
-                            description=description,
+                            product_details=product_details,
                         )
                         product.classifications[prompt.name] = classification
             except Exception as e:
@@ -387,6 +378,7 @@
         queue: asyncio.Queue[dict | None],
         search_term: str,
         search_term_type: str,
+        search_engines: List[SearchEngine],
         language: Language,
         location: Location,
         num_results: int,
@@ -397,6 +389,7 @@
         item = {
             "search_term": search_term,
             "search_term_type": search_term_type,
+            "search_engines": search_engines,
             "language": language,
             "location": location,
             "num_results": num_results,
@@ -410,6 +403,7 @@
         self,
         queue: asyncio.Queue[dict | None],
         search_term: str,
+        search_engines: List[SearchEngine],
         language: Language,
         location: Location,
         deepness: Deepness,
@@ -429,6 +423,7 @@
         await self._add_serp_items_for_search_term(
             search_term=search_term,
             search_term_type="initial",
+            search_engines=search_engines,
             num_results=deepness.num_results,
             **common_kwargs,  # type: ignore[arg-type]
         )
@@ -450,6 +445,7 @@
                 await self._add_serp_items_for_search_term(
                     search_term=trm,
                     search_term_type="enriched",
+                    search_engines=search_engines,
                     num_results=enrichment.additional_urls_per_term,
                     **common_kwargs,  # type: ignore[arg-type]
                 )
@@ -457,6 +453,7 @@
     async def run(
         self,
         search_term: str,
+        search_engines: List[SearchEngine],
         language: Language,
         location: Location,
         deepness: Deepness,
@@ -469,6 +466,7 @@

         Args:
             search_term: The search term for the query.
+            search_engines: The list of search engines to use for the SerpAPI query.
             language: The language to use for the query.
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
@@ -523,6 +521,7 @@
         await self._add_serp_items(
             queue=serp_queue,
             search_term=search_term,
+            search_engines=search_engines,
             language=language,
             location=location,
             deepness=deepness,
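Two behaviours added above are worth seeing in isolation: the Zyte worker now flattens the page HTML into html_clean with BeautifulSoup, and the processing worker builds a product_details block from exactly the fields a Prompt asks for. A small self-contained sketch, using a plain dict instead of ProductItem and a fabricated HTML snippet:

    from bs4 import BeautifulSoup

    PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"  # mirrors PROCESSOR_PRODUCT_DETAILS_TEMPLATE

    html = "<html><body><h1>Kühlschrank XY</h1><p>CHF 499.- <b>in stock</b></p></body></html>"
    html_clean = BeautifulSoup(html, "html.parser").get_text(separator=" ", strip=True)
    # -> "Kühlschrank XY CHF 499.- in stock"

    product = {"product_name": "Kühlschrank XY", "html_clean": html_clean, "product_price": None}
    product_item_fields = ["product_name", "html_clean", "product_price"]

    details = []
    for field in product_item_fields:
        value = product.get(field)
        if value is not None:  # unset fields are skipped, as in the worker above
            details.append(PRODUCT_DETAILS_TEMPLATE.format(field_name=field, field_value=value))
    product_details = "\n\n".join(details)
    print(product_details)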
fraudcrawler-0.4.2/fraudcrawler/launch_demo_pipeline.py

@@ -0,0 +1,101 @@
+import logging
+
+from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
+
+LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
+LOG_LVL = "INFO"
+DATE_FMT = "%Y-%m-%d %H:%M:%S"
+logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
+
+
+def main():
+    # Setup the client
+    client = FraudCrawlerClient()
+
+    # Setup the search
+    search_term = "Kühlschrank"
+    language = Language(name="German")
+    location = Location(name="Switzerland")
+    deepness = Deepness(num_results=10)
+    prompts = [
+        Prompt(
+            name="availability",
+            system_prompt=(
+                "You are a helpful and intelligent assistant helping an organization that is interested in checking the availability of certain products."
+                "Your task is to classify any given product as either available (1) or not available (0), strictly based on the context and product details provided by the user. "
+                "You must consider all aspects of the given context and make a binary decision accordingly. "
+                "If the product can be purchased, added to a shopping basket, delivered, or is listed as available in any form, classify it as 1 (available); "
+                "if there is any mention of out of stock, not available, no longer shippable, or similar, classify it as 0 (not available). "
+                "Respond only with the number 1 or 0."
+            ),
+            product_item_fields=["product_name", "html_clean"],
+            allowed_classes=[0, 1],
+        ),
+        # Prompt(
+        #     name="seriousness",
+        #     system_prompt=(
+        #         "You are a helpful and intelligent assistant helping an organization that is interested in checking the energy efficiency of certain devices. "
+        #         "Your task is to classify each item as either a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
+        #         " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
+        #         "within an online shop or marketplace.\n"
+        #         " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
+        #         " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
+        #         "exact product itself, classify as 0.\n"
+        #         " - Advertisements: Promotional content that doesn't directly sell a product.\n"
+        #         " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
+        #         " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
+        #         "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
+        #     ),
+        #     product_item_fields=["product_name", "product_description"],
+        #     allowed_classes=[0, 1],
+        # ),
+    ]
+    # # Optional: Add tern ENRICHEMENT
+    # from fraudcrawler import Enrichment
+
+    # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
+
+    # # Optional: Add MARKETPLACES and EXCLUDED_URLS
+    # from fraudcrawler import Host
+
+    # marketplaces = [
+    #     Host(name="International", domains="zavamed.com,apomeds.com"),
+    #     Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
+    # ]
+    # excluded_urls = [
+    #     Host(name="Digitec", domains="digitec.ch"),
+    #     Host(name="Brack", domains="brack.ch"),
+    # ]
+
+    # Execute the pipeline
+    client.execute(
+        search_term=search_term,
+        language=language,
+        location=location,
+        deepness=deepness,
+        prompts=prompts,
+        # marketplaces=marketplaces,
+        # excluded_urls=excluded_urls,
+    )
+
+    # Show results
+    print()
+    title = "Available results"
+    print(title)
+    print("=" * len(title))
+    client.print_available_results()
+    print()
+    title = f'Results for "{search_term.upper()}"'
+    print(title)
+    print("=" * len(title))
+    df = client.load_results()
+    print(f"Number of products found: {len(df)}")
+    print()
+    n_head = 10
+    print(f"First {n_head} products are:")
+    print(df.head(n=n_head))
+    print()
+
+
+if __name__ == "__main__":
+    main()
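The rewritten demo is an ordinary module with a main() entry point, so it can presumably be launched directly once the SerpAPI, Zyte and OpenAI credentials the pipeline relies on are configured in the environment; a minimal sketch:

    # Hypothetical invocation of the bundled demo (credentials assumed to be set up).
    from fraudcrawler.launch_demo_pipeline import main

    main()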
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/processing/processor.py

@@ -52,42 +52,34 @@ class Processor:
             raise ValueError("Empty response from OpenAI API")
         return content

-    async def classify(
-
-    ) -> int:
-        """A generic classification method that classified a product based on a prompt object.
+    async def classify(self, prompt: Prompt, url: str, product_details: str) -> int:
+        """A generic classification method that classifies a product based on a prompt object.

         Args:
-            prompt: A dictionary with keys "system_prompt",
+            prompt: A dictionary with keys "system_prompt", etc.
             url: Product URL (often used in the user_prompt).
-
-            description: Product description (often used in the user_prompt).
+            product_details: String with product details, formatted per prompt.product_item_fields.

         Note:
             This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
-            -
+            - product_details is empty
             - an error occurs during the API call
             - if the response isn't in allowed_classes.
         """
         # If required fields are missing, return the prompt's default fallback if provided.
-        if
-            logger.warning(
-                f"Missing required fields for classification: name='{name}', description='{description}'"
-            )
+        if not product_details:
+            logger.warning("Missing required product_details for classification.")
             return self._default_if_missing

         # Substitute placeholders in user_prompt with the relevant arguments
         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
-
-            url=url,
-            name=name,
-            description=description,
+            product_details=product_details,
         )

         # Call the OpenAI API
         try:
             logger.debug(
-                f'Calling OpenAI API for classification (
+                f'Calling OpenAI API for classification (url="{url}", prompt="{prompt.name}")'
             )
             content = await self._call_openai_api(
                 system_prompt=prompt.system_prompt,
@@ -104,12 +96,12 @@
                 return self._default_if_missing

             logger.info(
-                f'Classification for "{
+                f'Classification for url="{url}" (prompt={prompt.name}): {classification}'
             )
             return classification

         except Exception as e:
             logger.error(
-                f'Error classifying product "{
+                f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
             )
             return self._default_if_missing
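The prompt that classify() sends now has a single {product_details} slot; the template itself appears in settings.py further down. A quick sketch of the substitution:

    PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"

    product_details = "product_name:\nKühlschrank XY\n\nhtml_clean:\nKühlschrank XY CHF 499.- in stock"
    user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(product_details=product_details)
    print(user_prompt)
    # Product Details:
    # product_name:
    # Kühlschrank XY
    # ...
    # Relevance: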
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/scraping/serp.py

@@ -1,4 +1,5 @@
 import asyncio
+from enum import Enum
 import logging
 from pydantic import BaseModel
 from typing import List
@@ -21,12 +22,21 @@ class SerpResult(BaseModel):
     filtered_at_stage: str | None = None


+class SearchEngine(Enum):
+    """Enum for the supported search engines."""
+
+    GOOGLE = "google"
+    GOOGLE_SHOPPING = "google_shopping"
+
+
 class SerpApi(AsyncClient):
     """A client to interact with the SerpApi for performing searches."""

     _endpoint = "https://serpapi.com/search"
-
-
+    _engine_marketplace_names = {
+        SearchEngine.GOOGLE.value: "Google",
+        SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping",
+    }
     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

     def __init__(
@@ -73,8 +83,42 @@
         hostname = hostname[4:]
         return hostname.lower()

+    @staticmethod
+    def _extract_search_results(response: dict, engine: str) -> List[str]:
+        """Extracts search results from the response based on the engine type.
+
+        Args:
+            response: The response from the SerpApi search.
+            engine: The search engine used.
+
+        Returns:
+            A list of URLs extracted from the response.
+        """
+        urls = []
+        if engine == SearchEngine.GOOGLE.value:
+            # Get the organic_results
+            results = response.get("organic_results")
+            if results is None:
+                logger.warning(f'No SerpAPI results for engine="{engine}".')
+            else:
+                urls = [url for res in results if (url := res.get("link"))]
+
+        elif engine == SearchEngine.GOOGLE_SHOPPING.value:
+            # Get the shopping_results
+            results = response.get("shopping_results")
+            if results is None:
+                logger.warning(f'No SerpAPI results for engine="{engine}".')
+            else:
+                urls = [url for res in results if (url := res.get("product_link"))]
+
+        else:
+            raise ValueError(f"Invalid SerpAPI search engine: {engine}")
+
+        return urls
+
     async def _search(
         self,
+        engine: str,
         search_string: str,
         language: Language,
         location: Location,
@@ -83,6 +127,7 @@
         """Performs a search using SerpApi and returns the URLs of the results.

         Args:
+            engine: The search engine to use.
             search_string: The search string (with potentially added site: parameters).
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
@@ -93,20 +138,35 @@
             q: The search string (with potentially added site: parameters).
             google_domain: The Google domain to use for the search (e.g. google.[com]).
             location_[requested|used]: The location to use for the search.
-            tbs: The
+            tbs: The to-be-searched parameters (e.g. 'ctr:CH').
+            cr: The country code to limit the search to (e.g. 'countryCH').
             gl: The country code to use for the search.
             hl: The language code to use for the search.
             num: The number of results to return.
             api_key: The API key to use for the search.
         """
+        if engine not in self._engine_marketplace_names:
+            raise ValueError(
+                f"Invalid SerpAPI search engine: {engine}. "
+                f"Supported engines are: {list(self._engine_marketplace_names.keys())}."
+            )
+        logger.debug(
+            f'Performing SerpAPI search with engine="{engine}", '
+            f'q="{search_string}", '
+            f'location="{location.name}", '
+            f'language="{language.code}", '
+            f"num_results={num_results}."
+        )
+
         # Setup the parameters
         params = {
-            "engine":
+            "engine": engine,
             "q": search_string,
             "google_domain": f"google.{location.code}",
             "location_requested": location.name,
             "location_used": location.name,
-            "tbs": f"ctr:{location.code.upper()}
+            "tbs": f"ctr:{location.code.upper()}",
+            "cr": f"country{location.code.upper()}",
             "gl": location.code,
             "hl": language.code,
             "num": num_results,
@@ -132,18 +192,11 @@
         if err is not None:
             raise err

-        #
-
-        if results is None:
-            logger.warning(
-                f'No organic_results key in SerpAPI results for search_string="{search_string}".'
-            )
-            return []
+        # Extract the URLs from the response
+        urls = self._extract_search_results(response=response, engine=engine)

-        # Extract urls
-        urls = [res.get("link") for res in results]
         logger.debug(
-            f'Found {len(urls)} URLs from SerpApi search for q="{search_string}".'
+            f'Found total of {len(urls)} URLs from SerpApi search for q="{search_string}" and engine="{engine}".'
         )
         return urls

@@ -234,6 +287,7 @@

     def _create_serp_result(
         self,
+        engine: str,
         url: str,
         location: Location,
         marketplaces: List[Host] | None = None,
@@ -244,13 +298,18 @@
         If marketplaces is None or the domain can not be extracted, the default marketplace name is used.

         Args:
+            engine: The search engine used.
             url: The URL to be processed.
             location: The location to use for the query.
             marketplaces: The list of marketplaces to compare the URL against.
+            excluded_urls: The list of excluded URLs.
         """
         # Get marketplace name
         domain = self._get_domain(url=url)
-
+
+        # Select marketplace name based on engine
+        marketplace_name = self._engine_marketplace_names[engine]
+
         if marketplaces:
             try:
                 marketplace_name = next(
@@ -277,9 +336,109 @@
             )
         return result

+    async def _search_google(
+        self,
+        search_string: str,
+        language: Language,
+        location: Location,
+        num_results: int,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> List[SerpResult]:
+        """Performs a google search using SerpApi and returns SerpResults.
+
+        Args:
+            search_string: The search string (with potentially added site: parameters).
+            language: The language to use for the query ('hl' parameter).
+            location: The location to use for the query ('gl' parameter).
+            num_results: Max number of results to return.
+            marketplaces: The marketplaces to include in the search.
+            excluded_urls: The URLs to exclude from the search.
+        """
+        engine = SearchEngine.GOOGLE.value
+
+        # Perform the search
+        urls = await self._search(
+            engine=engine,
+            search_string=search_string,
+            language=language,
+            location=location,
+            num_results=num_results,
+        )
+
+        # Create SerpResult objects from the URLs
+        results = [
+            self._create_serp_result(
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
+                engine=engine,
+            )
+            for url in urls
+        ]
+
+        logger.debug(
+            f'Produced {len(results)} results from google search with q="{search_string}".'
+        )
+        return results
+
+    async def _search_google_shopping(
+        self,
+        search_string: str,
+        language: Language,
+        location: Location,
+        num_results: int,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> List[SerpResult]:
+        """Performs a google search using SerpApi and returns SerpResults.
+
+        Args:
+            search_string: The search string (with potentially added site: parameters).
+            language: The language to use for the query ('hl' parameter).
+            location: The location to use for the query ('gl' parameter).
+            num_results: Max number of results to return.
+            marketplaces: The marketplaces to include in the search.
+            excluded_urls: The URLs to exclude from the search.
+        """
+        engine = SearchEngine.GOOGLE_SHOPPING.value
+
+        # Perform the search
+        urls = await self._search(
+            engine=engine,
+            search_string=search_string,
+            language=language,
+            location=location,
+            num_results=num_results,
+        )
+
+        # !!! NOTE !!!: Google Shopping results do not properly support the 'num' parameter,
+        # so we might get more results than requested. This is a known issue with SerpAPI
+        # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
+        urls = urls[:num_results]
+
+        # Create SerpResult objects from the URLs
+        results = [
+            self._create_serp_result(
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
+                engine=engine,
+            )
+            for url in urls
+        ]
+
+        logger.debug(
+            f'Produced {len(results)} results from google shopping search with q="{search_string}".'
+        )
+        return results
+
     async def apply(
         self,
         search_term: str,
+        search_engines: List[SearchEngine],
         language: Language,
         location: Location,
         num_results: int,
@@ -305,27 +464,35 @@
             sites = [dom for host in marketplaces for dom in host.domains]
             search_string += " site:" + " OR site:".join(s for s in sites)

-        #
-
-            search_string=search_string,
-            language=language,
-            location=location,
-            num_results=num_results,
-        )
+        # Initialize the results list
+        results: List[SerpResult] = []

-        #
-
-        self.
-
+        # Perform the google search
+        if SearchEngine.GOOGLE in search_engines:
+            ggl_res = await self._search_google(
+                search_string=search_string,
+                language=language,
                 location=location,
+                num_results=num_results,
                 marketplaces=marketplaces,
                 excluded_urls=excluded_urls,
             )
-
-
+            results.extend(ggl_res)
+
+        # Perform the google shopping search
+        if SearchEngine.GOOGLE_SHOPPING in search_engines:
+            shp_res = await self._search_google_shopping(
+                search_string=search_string,
+                language=language,
+                location=location,
+                num_results=num_results,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
+            )
+            results.extend(shp_res)

         num_non_filtered = len([res for res in results if not res.filtered])
         logger.info(
-            f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
+            f'Produced a total of {num_non_filtered} results from SerpApi search with q="{search_string}".'
         )
         return results
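The URL extraction now depends on the engine: organic_results/link for plain Google, shopping_results/product_link for Google Shopping. A toy illustration of that dispatch, as a standalone function mirroring the static method above (payloads are made up):

    def extract_urls(response: dict, engine: str) -> list[str]:
        # Plain Google: organic results carry "link"
        if engine == "google":
            results = response.get("organic_results") or []
            return [url for res in results if (url := res.get("link"))]
        # Google Shopping: shopping results carry "product_link"
        if engine == "google_shopping":
            results = response.get("shopping_results") or []
            return [url for res in results if (url := res.get("product_link"))]
        raise ValueError(f"Invalid SerpAPI search engine: {engine}")

    google_response = {"organic_results": [{"link": "https://shop.example/a"}, {"title": "no link"}]}
    shopping_response = {"shopping_results": [{"product_link": "https://shopping.example/b"}]}

    print(extract_urls(google_response, "google"))             # ['https://shop.example/a']
    print(extract_urls(shopping_response, "google_shopping"))  # ['https://shopping.example/b']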
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/scraping/zyte.py

@@ -1,6 +1,7 @@
 import asyncio
 import logging
 from typing import List
+from base64 import b64decode

 import aiohttp

@@ -68,7 +69,8 @@ class ZyteApi(AsyncClient):
                 "metadata": {
                     "probability": float,
                 },
-            }
+            },
+            "httpResponseBody": base64
         }
         """
         logger.info(f"Fetching product details by Zyte for URL {url}.")
@@ -192,3 +194,24 @@
         }
         """
         return float(details.get("product", {}).get("metadata", {}).get("probability"))
+
+    @staticmethod
+    def extract_html(details: dict) -> str | None:
+        """Extracts the HTML from the Zyte API response.
+
+        The input argument is a dictionary of the following structure:
+        {
+            "httpResponseBody": base64
+        }
+        """
+
+        # Get the Base64-encoded content
+        encoded = details.get("httpResponseBody")
+
+        # Decode it into bytes
+        if isinstance(encoded, str):
+            decoded_bytes = b64decode(encoded)
+
+            # Convert bytes to string (assuming UTF-8 encoding)
+            decoded_string = decoded_bytes.decode("utf-8")
+            return decoded_string
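extract_html therefore just base64-decodes the raw httpResponseBody that the Zyte request now asks for. A stdlib-only sketch with a fabricated payload:

    from base64 import b64decode, b64encode

    # Fabricated stand-in for a Zyte response carrying the raw page body.
    details = {
        "httpResponseBody": b64encode(b"<html><body>in stock</body></html>").decode("ascii")
    }

    encoded = details.get("httpResponseBody")
    html = b64decode(encoded).decode("utf-8") if isinstance(encoded, str) else None
    print(html)  # <html><body>in stock</body></html>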
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/fraudcrawler/settings.py

@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import List

 # Generic settings
 MAX_RETRIES = 3
@@ -8,8 +9,8 @@ ROOT_DIR = Path(__file__).parents[1]
 # Serp settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
-SERP_DEFAULT_COUNTRY_CODES = [
-    ".com",
+SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+    # ".com",
 ]

 # Enrichment settings
@@ -21,9 +22,8 @@ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
 # Processor settings
 PROCESSOR_DEFAULT_MODEL = "gpt-4o"
 PROCESSOR_DEFAULT_IF_MISSING = -1
-PROCESSOR_USER_PROMPT_TEMPLATE =
-
-)
+PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
+PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"

 # Async settings
 DEFAULT_N_SERP_WKRS = 10
{fraudcrawler-0.3.10 → fraudcrawler-0.4.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

 [tool.poetry]
 name = "fraudcrawler"
-version = "0.3.10"
+version = "0.4.2"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",
@@ -25,6 +25,7 @@ pandas = "^2.2.3"
 aiohttp = "^3.11.14"
 pydantic-settings = "^2.8.1"
 openai = "^1.68.2"
+beautifulsoup4 = "^4.13.4"

 [tool.poetry.group.dev.dependencies]
 pytest-cov = "^6.0.0"
fraudcrawler-0.3.10/fraudcrawler/launch_demo_pipeline.py

@@ -1,100 +0,0 @@
-import logging
-
-from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
-
-LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
-LOG_LVL = "INFO"
-DATE_FMT = "%Y-%m-%d %H:%M:%S"
-logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
-
-
-def main():
-    # Setup the client
-    client = FraudCrawlerClient()
-
-    # Setup the search
-    search_term = "Kühlschrank"
-    language = Language(name="German")
-    location = Location(name="Switzerland")
-    deepness = Deepness(num_results=20)
-    prompts = [
-        Prompt(
-            name="relevance",
-            context="This organization is interested in checking the energy efficiency of certain devices.",
-            system_prompt=(
-                "You are a helpful and intelligent assistant. Your task is to classify any given product "
-                "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
-                "You must consider all aspects of the given context and make a binary decision accordingly. "
-                "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
-                "Respond only with the number 1 or 0."
-            ),
-            allowed_classes=[0, 1],
-        ),
-        Prompt(
-            name="seriousness",
-            context="This organization is interested in checking the energy efficiency of certain devices.",
-            system_prompt=(
-                "You are an intelligent and discerning assistant. Your task is to classify each item as either "
-                "a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
-                " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
-                "within an online shop or marketplace.\n"
-                " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
-                " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
-                "exact product itself, classify as 0.\n"
-                " - Advertisements: Promotional content that doesn't directly sell a product.\n"
-                " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
-                " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
-                "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
-            ),
-            allowed_classes=[0, 1],
-        ),
-    ]
-    # # Optional: Add tern ENRICHEMENT
-    # from fraudcrawler import Enrichment
-
-    # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
-
-    # # Optional: Add MARKETPLACES and EXCLUDED_URLS
-    # from fraudcrawler import Host
-
-    # marketplaces = [
-    #     Host(name="International", domains="zavamed.com,apomeds.com"),
-    #     Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
-    # ]
-    # excluded_urls = [
-    #     Host(name="Digitec", domains="digitec.ch"),
-    #     Host(name="Brack", domains="brack.ch"),
-    # ]
-
-    # Execute the pipeline
-    client.execute(
-        search_term=search_term,
-        language=language,
-        location=location,
-        deepness=deepness,
-        prompts=prompts,
-        # marketplaces=marketplaces,
-        # excluded_urls=excluded_urls,
-    )
-
-    # Show results
-    print()
-    title = "Available results"
-    print(title)
-    print("=" * len(title))
-    client.print_available_results()
-    print()
-    title = f'Results for "{search_term.upper()}"'
-    print(title)
-    print("=" * len(title))
-    df = client.load_results()
-    print(f"Number of products found: {len(df)}")
-    print()
-    n_head = 10
-    print(f"First {n_head} products are:")
-    print(df.head(n=n_head))
-    print()
-
-
-if __name__ == "__main__":
-    main()
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|