fraudcrawler-0.7.21-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic.
- fraudcrawler/__init__.py +52 -0
- fraudcrawler/base/__init__.py +0 -0
- fraudcrawler/base/base.py +222 -0
- fraudcrawler/base/client.py +167 -0
- fraudcrawler/base/google-languages.json +630 -0
- fraudcrawler/base/google-locations.json +1 -0
- fraudcrawler/base/orchestrator.py +696 -0
- fraudcrawler/base/retry.py +54 -0
- fraudcrawler/launch_demo_pipeline.py +162 -0
- fraudcrawler/processing/__init__.py +0 -0
- fraudcrawler/processing/base.py +129 -0
- fraudcrawler/processing/openai.py +520 -0
- fraudcrawler/scraping/__init__.py +0 -0
- fraudcrawler/scraping/enrich.py +361 -0
- fraudcrawler/scraping/search.py +924 -0
- fraudcrawler/scraping/url.py +96 -0
- fraudcrawler/scraping/zyte.py +287 -0
- fraudcrawler/settings.py +104 -0
- fraudcrawler-0.7.21.dist-info/METADATA +175 -0
- fraudcrawler-0.7.21.dist-info/RECORD +23 -0
- fraudcrawler-0.7.21.dist-info/WHEEL +4 -0
- fraudcrawler-0.7.21.dist-info/entry_points.txt +3 -0
- fraudcrawler-0.7.21.dist-info/licenses/LICENSE +21 -0
fraudcrawler/__init__.py
ADDED
@@ -0,0 +1,52 @@
+from fraudcrawler.scraping.search import Searcher, SearchEngineName
+from fraudcrawler.scraping.enrich import Enricher
+from fraudcrawler.scraping.url import URLCollector
+from fraudcrawler.scraping.zyte import ZyteAPI
+from fraudcrawler.processing.base import (
+    UserInputs,
+    Workflow,
+    ClassificationResult,
+    TmpResult,
+    Processor,
+)
+from fraudcrawler.processing.openai import (
+    OpenAIWorkflow,
+    OpenAIClassification,
+    OpenAIClassificationUserInputs,
+)
+from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.base.client import FraudCrawlerClient
+from fraudcrawler.base.base import (
+    Deepness,
+    Enrichment,
+    Host,
+    Language,
+    Location,
+    ProductItem,
+    HttpxAsyncClient,
+)
+
+__all__ = [
+    "Searcher",
+    "SearchEngineName",
+    "Enricher",
+    "URLCollector",
+    "ZyteAPI",
+    "UserInputs",
+    "Workflow",
+    "ClassificationResult",
+    "TmpResult",
+    "OpenAIWorkflow",
+    "OpenAIClassification",
+    "OpenAIClassificationUserInputs",
+    "Processor",
+    "Orchestrator",
+    "ProductItem",
+    "FraudCrawlerClient",
+    "Language",
+    "Location",
+    "Host",
+    "Deepness",
+    "Enrichment",
+    "HttpxAsyncClient",
+]
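
For reference, everything listed in `__all__` above is re-exported at the package root, so downstream code can import the main entry points directly from `fraudcrawler` once the wheel is installed. A minimal, illustrative sketch (not part of the wheel's contents):

# Illustrative only -- not part of the wheel's contents.
from fraudcrawler import (
    FraudCrawlerClient,  # high-level pipeline client (base/client.py)
    Searcher,            # search step (scraping/search.py)
    ZyteAPI,             # metadata extraction (scraping/zyte.py)
    ProductItem,         # result model (base/base.py)
)
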
fraudcrawler/base/__init__.py
File without changes
fraudcrawler/base/base.py
ADDED
@@ -0,0 +1,222 @@
+import json
+import logging
+from pydantic import (
+    BaseModel,
+    Field,
+    field_validator,
+    model_validator,
+)
+from pydantic_settings import BaseSettings
+from urllib.parse import urlparse
+import re
+from typing import Any, Dict, List
+
+
+import httpx
+
+from fraudcrawler.settings import (
+    GOOGLE_LANGUAGES_FILENAME,
+    GOOGLE_LOCATIONS_FILENAME,
+)
+from fraudcrawler.settings import (
+    DEFAULT_HTTPX_TIMEOUT,
+    DEFAULT_HTTPX_LIMITS,
+    DEFAULT_HTTPX_REDIRECTS,
+)
+
+logger = logging.getLogger(__name__)
+
+# Load google locations and languages
+with open(GOOGLE_LOCATIONS_FILENAME, "r") as gfile:
+    _locs = json.load(gfile)
+_LOCATION_CODES = {loc["name"]: loc["country_code"].lower() for loc in _locs}
+with open(GOOGLE_LANGUAGES_FILENAME, "r") as gfile:
+    _langs = json.load(gfile)
+_LANGUAGE_CODES = {lang["language_name"]: lang["language_code"] for lang in _langs}
+
+
+# Base classes
+class Setup(BaseSettings):
+    """Class for loading environment variables."""
+
+    # Crawler ENV variables
+    serpapi_key: str
+    dataforseo_user: str
+    dataforseo_pwd: str
+    zyteapi_key: str
+    openaiapi_key: str
+    pypy_token: str
+
+    class Config:
+        env_file = ".env"
+        env_file_encoding = "utf-8"
+
+
+class Host(BaseModel):
+    """Model for host details (e.g. `Host(name="Galaxus", domains="galaxus.ch, digitec.ch")`)."""
+
+    name: str
+    domains: str | List[str]
+
+    @staticmethod
+    def _normalize_domain(domain: str) -> str:
+        """Make it lowercase and strip 'www.' and 'https?://' prefixes from the domain."""
+        domain = domain.strip().lower()
+        return re.sub(r"^(https?://)?(www\.)?", "", domain)
+
+    @field_validator("domains", mode="before")
+    def normalize_domains(cls, val):
+        if isinstance(val, str):
+            val = val.split(",")
+        return [cls._normalize_domain(dom.strip()) for dom in val]
+
+
+class Location(BaseModel):
+    """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
+
+    name: str
+    code: str = ""
+
+    @model_validator(mode="before")
+    def set_code(cls, values):
+        """Set the location code if not provided and make it lower case."""
+        name = values.get("name")
+        code = values.get("code")
+        if code is None or not len(code):
+            code = _LOCATION_CODES.get(name)
+            if code is None:
+                raise ValueError(f'Location code not found for location name="{name}"')
+        code = code.lower()
+        return {"name": name, "code": code}
+
+
+class Language(BaseModel):
+    """Model for language details (e.g. `Language(name="German", code="de")`)."""
+
+    name: str
+    code: str = ""
+
+    @model_validator(mode="before")
+    def set_code(cls, values):
+        """Set the language code if not provided and make it lower case."""
+        name = values.get("name")
+        code = values.get("code")
+        if code is None or not len(code):
+            code = _LANGUAGE_CODES.get(name)
+            if code is None:
+                raise ValueError(f'Language code not found for language name="{name}"')
+        code = code.lower()
+        return {"name": name, "code": code}
+
+
+class Enrichment(BaseModel):
+    """Model for enriching initial search_term with alternative ones."""
+
+    additional_terms: int
+    additional_urls_per_term: int
+
+
+class Deepness(BaseModel):
+    """Model for search depth."""
+
+    num_results: int
+    enrichment: Enrichment | None = None
+
+
+class ProductItem(BaseModel):
+    """Model representing a product item."""
+
+    # Search parameters
+    search_term: str
+    search_term_type: str
+    url: str
+    url_resolved: str
+    search_engine_name: str
+    domain: str
+    exact_search: bool = False
+    exact_search_match: bool = False
+
+    # Context parameters
+    product_name: str | None = None
+    product_price: str | None = None
+    product_description: str | None = None
+    product_images: List[str] | None = None
+    probability: float | None = None
+    html: str | None = None
+    html_clean: str | None = None
+
+    # Processor parameters (set dynamically)
+    classifications: Dict[str, int] = Field(default_factory=dict)
+    tmp: Dict[str, Any] = Field(default_factory=dict)
+    insights: Dict[str, Any] | None = Field(default=None)
+
+    # Usage parameters
+    usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
+
+    # Filtering parameters
+    filtered: bool = False
+    filtered_at_stage: str | None = None
+
+
+class HttpxAsyncClient(httpx.AsyncClient):
+    """Httpx async client that can be used to retain the default settings."""
+
+    def __init__(
+        self,
+        timeout: httpx.Timeout | Dict[str, Any] = DEFAULT_HTTPX_TIMEOUT,
+        limits: httpx.Limits | Dict[str, Any] = DEFAULT_HTTPX_LIMITS,
+        follow_redirects: bool = DEFAULT_HTTPX_REDIRECTS,
+        **kwargs: Any,
+    ) -> None:
+        if isinstance(timeout, dict):
+            timeout = httpx.Timeout(**timeout)
+        if isinstance(limits, dict):
+            limits = httpx.Limits(**limits)
+
+        kwargs.setdefault("timeout", timeout)
+        kwargs.setdefault("limits", limits)
+        kwargs.setdefault("follow_redirects", follow_redirects)
+        super().__init__(**kwargs)
+
+
+class DomainUtils:
+    """Utility class for domain extraction and normalization.
+
+    Handles domain parsing from URLs, removes common prefixes (www, http/https),
+    and provides consistent domain formatting for search and scraping operations.
+    """
+
+    _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+    _headers = {
+        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+        "Accept-Language": "en-US,en;q=0.5",
+        "Accept-Encoding": "gzip, deflate",
+        "Connection": "keep-alive",
+        "Upgrade-Insecure-Requests": "1",
+    }
+
+    def _get_domain(self, url: str) -> str:
+        """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
+
+        Args:
+            url: The URL to be processed.
+        """
+        # Add scheme; urlparse requires it
+        if not url.startswith(("http://", "https://")):
+            url = "http://" + url
+
+        # Get the hostname
+        hostname = urlparse(url).hostname
+        if hostname is None and (match := re.search(self._hostname_pattern, url)):
+            hostname = match.group(1)
+        if hostname is None:
+            logger.warning(
+                f'Failed to extract domain from url="{url}"; full url is returned'
+            )
+            return url.lower()
+
+        # Remove www. prefix
+        if hostname and hostname.startswith("www."):
+            hostname = hostname[4:]
+        return hostname.lower()
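
The pydantic models above normalize their inputs in validators: `Host.domains` accepts a comma-separated string and strips scheme and `www.` prefixes, while `Location` and `Language` resolve a missing code from the bundled google-locations.json / google-languages.json files. A minimal sketch of that behavior (illustrative only; assumes the wheel is installed and the bundled JSON files contain "Switzerland" and "German", as the docstring examples suggest):

# Illustrative usage of the models defined in fraudcrawler/base/base.py.
from fraudcrawler import Deepness, Enrichment, Host, Language, Location

host = Host(name="Galaxus", domains="https://www.Galaxus.ch, digitec.ch")
print(host.domains)  # ['galaxus.ch', 'digitec.ch'] -- lowercased, scheme/www stripped, split on commas

location = Location(name="Switzerland")        # code resolved via google-locations.json -> "ch"
language = Language(name="German", code="DE")  # an explicit code is simply lowercased -> "de"

deepness = Deepness(
    num_results=20,
    enrichment=Enrichment(additional_terms=3, additional_urls_per_term=5),
)
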
fraudcrawler/base/client.py
ADDED
@@ -0,0 +1,167 @@
+import asyncio
+import csv
+from datetime import datetime
+import logging
+from pathlib import Path
+from pydantic import BaseModel
+from typing import List
+
+import pandas as pd
+
+from fraudcrawler.settings import ROOT_DIR
+from fraudcrawler.base.base import (
+    Language,
+    Location,
+    Deepness,
+    Host,
+    ProductItem,
+)
+from fraudcrawler.base.orchestrator import Orchestrator
+from fraudcrawler.scraping.search import Searcher, SearchEngineName
+from fraudcrawler.scraping.enrich import Enricher
+from fraudcrawler.scraping.url import URLCollector
+from fraudcrawler.scraping.zyte import ZyteAPI
+from fraudcrawler.processing.base import Processor
+
+
+logger = logging.getLogger(__name__)
+
+_RESULTS_DIR = ROOT_DIR / "data" / "results"
+
+
+class Results(BaseModel):
+    """The results of the product search."""
+
+    search_term: str
+    filename: Path | None = None
+
+
+class FraudCrawlerClient(Orchestrator):
+    """The main client for FraudCrawler product search and analysis.
+
+    This client orchestrates the complete pipeline: search, deduplication, context extraction,
+    processing (classification), and result collection. It inherits from Orchestrator and adds
+    result management and persistence functionality.
+    """
+
+    _FILENAME_TEMPLATE = "{search_term}_{language}_{location}_{timestamp}.csv"
+
+    def __init__(
+        self,
+        searcher: Searcher,
+        enricher: Enricher,
+        url_collector: URLCollector,
+        zyteapi: ZyteAPI,
+        processor: Processor,
+    ):
+        """Initializes FraudCrawlerClient.
+
+        Args:
+            searcher: Client for the searching step.
+            enricher: Client for the enrichment step.
+            url_collector: Client for deduplication.
+            zyteapi: Client for metadata extraction.
+            processor: Client for product classification.
+        """
+        super().__init__(
+            searcher=searcher,
+            enricher=enricher,
+            url_collector=url_collector,
+            zyteapi=zyteapi,
+            processor=processor,
+        )
+
+        self._results_dir = _RESULTS_DIR
+        if not self._results_dir.exists():
+            self._results_dir.mkdir(parents=True)
+        self._results: List[Results] = []
+
+    async def _collect_results(
+        self, queue_in: asyncio.Queue[ProductItem | None]
+    ) -> None:
+        """Collects the results from the given queue_in and saves them as CSV.
+
+        Args:
+            queue_in: The input queue containing the results.
+        """
+        products = []
+        while True:
+            product = await queue_in.get()
+            if product is None:
+                queue_in.task_done()
+                break
+
+            products.append(product.model_dump())
+            queue_in.task_done()
+
+        # Convert the list of products to a DataFrame
+        df = pd.json_normalize(products)
+
+        # Save the DataFrame to a CSV file
+        filename = self._results[-1].filename
+        df.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
+        logger.info(f"Results saved to {filename}")
+
+    async def run(
+        self,
+        search_term: str,
+        search_engines: List[SearchEngineName],
+        language: Language,
+        location: Location,
+        deepness: Deepness,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+        previously_collected_urls: List[str] | None = None,
+    ) -> None:
+        """Runs the pipeline steps: search, deduplication, context extraction, processing, and result collection.
+
+        Args:
+            search_term: The search term for the query.
+            search_engines: The list of search engines to use for the search query.
+            language: The language to use for the query.
+            location: The location to use for the query.
+            deepness: The search depth and enrichment details.
+            marketplaces: The marketplaces to include in the search.
+            excluded_urls: The URLs to exclude from the search.
+            previously_collected_urls: The URLs that have been collected previously and are ignored.
+        """
+        # Handle results files
+        timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
+        filename = self._results_dir / self._FILENAME_TEMPLATE.format(
+            search_term=search_term,
+            language=language.code,
+            location=location.code,
+            timestamp=timestamp,
+        )
+        self._results.append(Results(search_term=search_term, filename=filename))
+
+        # Run the pipeline by calling the orchestrator's run method
+        await super().run(
+            search_term=search_term,
+            search_engines=search_engines,
+            language=language,
+            location=location,
+            deepness=deepness,
+            marketplaces=marketplaces,
+            excluded_urls=excluded_urls,
+            previously_collected_urls=previously_collected_urls,
+        )
+
+    def load_results(self, index: int = -1) -> pd.DataFrame:
+        """Loads the results from the saved .csv files.
+
+        Args:
+            index: The index of the results to load (`index=-1` loads the results of the most recent run).
+        """
+        results = self._results[index]
+        if (filename := results.filename) is None:
+            raise ValueError("filename not found (is None)")
+
+        return pd.read_csv(filename)
+
+    def print_available_results(self) -> None:
+        """Prints the available results."""
+        n_res = len(self._results)
+        for i, res in enumerate(self._results):
+            print(f"index={-n_res + i}: {res.search_term} - {res.filename}")