fraudcrawler 0.7.21__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

@@ -0,0 +1,52 @@
+ from fraudcrawler.scraping.search import Searcher, SearchEngineName
+ from fraudcrawler.scraping.enrich import Enricher
+ from fraudcrawler.scraping.url import URLCollector
+ from fraudcrawler.scraping.zyte import ZyteAPI
+ from fraudcrawler.processing.base import (
+     UserInputs,
+     Workflow,
+     ClassificationResult,
+     TmpResult,
+     Processor,
+ )
+ from fraudcrawler.processing.openai import (
+     OpenAIWorkflow,
+     OpenAIClassification,
+     OpenAIClassificationUserInputs,
+ )
+ from fraudcrawler.base.orchestrator import Orchestrator
+ from fraudcrawler.base.client import FraudCrawlerClient
+ from fraudcrawler.base.base import (
+     Deepness,
+     Enrichment,
+     Host,
+     Language,
+     Location,
+     ProductItem,
+     HttpxAsyncClient,
+ )
+
+ __all__ = [
+     "Searcher",
+     "SearchEngineName",
+     "Enricher",
+     "URLCollector",
+     "ZyteAPI",
+     "UserInputs",
+     "Workflow",
+     "ClassificationResult",
+     "TmpResult",
+     "OpenAIWorkflow",
+     "OpenAIClassification",
+     "OpenAIClassificationUserInputs",
+     "Processor",
+     "Orchestrator",
+     "ProductItem",
+     "FraudCrawlerClient",
+     "Language",
+     "Location",
+     "Host",
+     "Deepness",
+     "Enrichment",
+     "HttpxAsyncClient",
+ ]
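
This `__init__` module flattens the package's public API into the package root. A minimal sketch of how a consumer might import it, assuming the wheel above is installed under the name `fraudcrawler`:

```python
# Hypothetical consumer-side import: every name listed in __all__ is
# re-exported at the package root, so deep module paths are not needed.
from fraudcrawler import FraudCrawlerClient, Deepness, Host, Language, Location

print(FraudCrawlerClient, Deepness, Host, Language, Location)
```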
File without changes
@@ -0,0 +1,222 @@
+ import json
+ import logging
+ from pydantic import (
+     BaseModel,
+     Field,
+     field_validator,
+     model_validator,
+ )
+ from pydantic_settings import BaseSettings
+ from urllib.parse import urlparse
+ import re
+ from typing import Any, Dict, List
+
+
+ import httpx
+
+ from fraudcrawler.settings import (
+     GOOGLE_LANGUAGES_FILENAME,
+     GOOGLE_LOCATIONS_FILENAME,
+ )
+ from fraudcrawler.settings import (
+     DEFAULT_HTTPX_TIMEOUT,
+     DEFAULT_HTTPX_LIMITS,
+     DEFAULT_HTTPX_REDIRECTS,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Load google locations and languages
+ with open(GOOGLE_LOCATIONS_FILENAME, "r") as gfile:
+     _locs = json.load(gfile)
+ _LOCATION_CODES = {loc["name"]: loc["country_code"].lower() for loc in _locs}
+ with open(GOOGLE_LANGUAGES_FILENAME, "r") as gfile:
+     _langs = json.load(gfile)
+ _LANGUAGE_CODES = {lang["language_name"]: lang["language_code"] for lang in _langs}
+
+
+ # Base classes
+ class Setup(BaseSettings):
+     """Class for loading environment variables."""
+
+     # Crawler ENV variables
+     serpapi_key: str
+     dataforseo_user: str
+     dataforseo_pwd: str
+     zyteapi_key: str
+     openaiapi_key: str
+     pypy_token: str
+
+     class Config:
+         env_file = ".env"
+         env_file_encoding = "utf-8"
+
+
+ class Host(BaseModel):
+     """Model for host details (e.g. `Host(name="Galaxus", domains="galaxus.ch, digitec.ch")`)."""
+
+     name: str
+     domains: str | List[str]
+
+     @staticmethod
+     def _normalize_domain(domain: str) -> str:
+         """Make it lowercase and strip 'www.' and 'https?://' prefixes from the domain."""
+         domain = domain.strip().lower()
+         return re.sub(r"^(https?://)?(www\.)?", "", domain)
+
+     @field_validator("domains", mode="before")
+     def normalize_domains(cls, val):
+         if isinstance(val, str):
+             val = val.split(",")
+         return [cls._normalize_domain(dom.strip()) for dom in val]
+
+
+ class Location(BaseModel):
+     """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
+
+     name: str
+     code: str = ""
+
+     @model_validator(mode="before")
+     def set_code(cls, values):
+         """Set the location code if not provided and make it lower case."""
+         name = values.get("name")
+         code = values.get("code")
+         if code is None or not len(code):
+             code = _LOCATION_CODES.get(name)
+         if code is None:
+             raise ValueError(f'Location code not found for location name="{name}"')
+         code = code.lower()
+         return {"name": name, "code": code}
+
+
+ class Language(BaseModel):
+     """Model for language details (e.g. `Language(name="German", code="de")`)."""
+
+     name: str
+     code: str = ""
+
+     @model_validator(mode="before")
+     def set_code(cls, values):
+         """Set the language code if not provided and make it lower case."""
+         name = values.get("name")
+         code = values.get("code")
+         if code is None or not len(code):
+             code = _LANGUAGE_CODES.get(name)
+         if code is None:
+             raise ValueError(f'Language code not found for language name="{name}"')
+         code = code.lower()
+         return {"name": name, "code": code}
+
+
+ class Enrichment(BaseModel):
+     """Model for enriching initial search_term with alternative ones."""
+
+     additional_terms: int
+     additional_urls_per_term: int
+
+
+ class Deepness(BaseModel):
+     """Model for search depth."""
+
+     num_results: int
+     enrichment: Enrichment | None = None
+
+
+ class ProductItem(BaseModel):
+     """Model representing a product item."""
+
+     # Search parameters
+     search_term: str
+     search_term_type: str
+     url: str
+     url_resolved: str
+     search_engine_name: str
+     domain: str
+     exact_search: bool = False
+     exact_search_match: bool = False
+
+     # Context parameters
+     product_name: str | None = None
+     product_price: str | None = None
+     product_description: str | None = None
+     product_images: List[str] | None = None
+     probability: float | None = None
+     html: str | None = None
+     html_clean: str | None = None
+
+     # Processor parameters (set dynamically)
+     classifications: Dict[str, int] = Field(default_factory=dict)
+     tmp: Dict[str, Any] = Field(default_factory=dict)
+     insights: Dict[str, Any] | None = Field(default=None)
+
+     # Usage parameters
+     usage: Dict[str, Dict[str, int]] = Field(default_factory=dict)
+
+     # Filtering parameters
+     filtered: bool = False
+     filtered_at_stage: str | None = None
+
+
+ class HttpxAsyncClient(httpx.AsyncClient):
+     """Httpx async client that can be used to retain the default settings."""
+
+     def __init__(
+         self,
+         timeout: httpx.Timeout | Dict[str, Any] = DEFAULT_HTTPX_TIMEOUT,
+         limits: httpx.Limits | Dict[str, Any] = DEFAULT_HTTPX_LIMITS,
+         follow_redirects: bool = DEFAULT_HTTPX_REDIRECTS,
+         **kwargs: Any,
+     ) -> None:
+         if isinstance(timeout, dict):
+             timeout = httpx.Timeout(**timeout)
+         if isinstance(limits, dict):
+             limits = httpx.Limits(**limits)
+
+         kwargs.setdefault("timeout", timeout)
+         kwargs.setdefault("limits", limits)
+         kwargs.setdefault("follow_redirects", follow_redirects)
+         super().__init__(**kwargs)
+
+
+ class DomainUtils:
+     """Utility class for domain extraction and normalization.
+
+     Handles domain parsing from URLs, removes common prefixes (www, http/https),
+     and provides consistent domain formatting for search and scraping operations.
+     """
+
+     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+     _headers = {
+         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
+         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+         "Accept-Language": "en-US,en;q=0.5",
+         "Accept-Encoding": "gzip, deflate",
+         "Connection": "keep-alive",
+         "Upgrade-Insecure-Requests": "1",
+     }
+
+     def _get_domain(self, url: str) -> str:
+         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
+
+         Args:
+             url: The URL to be processed.
+         """
+         # Add scheme; urlparse requires it
+         if not url.startswith(("http://", "https://")):
+             url = "http://" + url
+
+         # Get the hostname
+         hostname = urlparse(url).hostname
+         if hostname is None and (match := re.search(self._hostname_pattern, url)):
+             hostname = match.group(1)
+         if hostname is None:
+             logger.warning(
+                 f'Failed to extract domain from url="{url}"; full url is returned'
+             )
+             return url.lower()
+
+         # Remove www. prefix
+         if hostname and hostname.startswith("www."):
+             hostname = hostname[4:]
+         return hostname.lower()
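
Going by the constructor examples in the docstrings above, here is a small sketch of how these pydantic models might be instantiated. Resolving a missing `code` relies on the bundled Google locations/languages JSON files containing the given names, and the numeric values are illustrative only:

```python
# Sketch only: follows the constructor examples given in the docstrings above.
from fraudcrawler.base.base import Deepness, Enrichment, Host, Language, Location

host = Host(name="Galaxus", domains="galaxus.ch, digitec.ch")
print(host.domains)  # ["galaxus.ch", "digitec.ch"] -- split on "," and normalized

location = Location(name="Switzerland", code="ch")  # code passed explicitly
language = Language(name="German")  # code looked up in the bundled JSON -> "de" (assumed present)

deepness = Deepness(
    num_results=20,  # illustrative value
    enrichment=Enrichment(additional_terms=3, additional_urls_per_term=5),
)
```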
@@ -0,0 +1,167 @@
+ import asyncio
+ import csv
+ from datetime import datetime
+ import logging
+ from pathlib import Path
+ from pydantic import BaseModel
+ from typing import List
+
+ import pandas as pd
+
+ from fraudcrawler.settings import ROOT_DIR
+ from fraudcrawler.base.base import (
+     Language,
+     Location,
+     Deepness,
+     Host,
+     ProductItem,
+ )
+ from fraudcrawler.base.orchestrator import Orchestrator
+ from fraudcrawler.scraping.search import Searcher, SearchEngineName
+ from fraudcrawler.scraping.enrich import Enricher
+ from fraudcrawler.scraping.url import URLCollector
+ from fraudcrawler.scraping.zyte import ZyteAPI
+ from fraudcrawler.processing.base import Processor
+
+
+ logger = logging.getLogger(__name__)
+
+ _RESULTS_DIR = ROOT_DIR / "data" / "results"
+
+
+ class Results(BaseModel):
+     """The results of the product search."""
+
+     search_term: str
+     filename: Path | None = None
+
+
+ class FraudCrawlerClient(Orchestrator):
+     """The main client for FraudCrawler product search and analysis.
+
+     This client orchestrates the complete pipeline: search, deduplication, context extraction,
+     processing (classification), and result collection. It inherits from Orchestrator and adds
+     result management and persistence functionality.
+     """
+
+     _FILENAME_TEMPLATE = "{search_term}_{language}_{location}_{timestamp}.csv"
+
+     def __init__(
+         self,
+         searcher: Searcher,
+         enricher: Enricher,
+         url_collector: URLCollector,
+         zyteapi: ZyteAPI,
+         processor: Processor,
+     ):
+         """Initializes FraudCrawlerClient.
+
+         Args:
+             searcher: Client for the searching step.
+             enricher: Client for the enrichment step.
+             url_collector: Client for deduplication.
+             zyteapi: Client for metadata extraction.
+             processor: Client for product classification.
+         """
+         super().__init__(
+             searcher=searcher,
+             enricher=enricher,
+             url_collector=url_collector,
+             zyteapi=zyteapi,
+             processor=processor,
+         )
+
+         self._results_dir = _RESULTS_DIR
+         if not self._results_dir.exists():
+             self._results_dir.mkdir(parents=True)
+         self._results: List[Results] = []
+
+     async def _collect_results(
+         self, queue_in: asyncio.Queue[ProductItem | None]
+     ) -> None:
+         """Collects the results from the given queue_in and saves them as a CSV file.
+
+         Args:
+             queue_in: The input queue containing the results.
+         """
+         products = []
+         while True:
+             product = await queue_in.get()
+             if product is None:
+                 queue_in.task_done()
+                 break
+
+             products.append(product.model_dump())
+             queue_in.task_done()
+
+         # Convert the list of products to a DataFrame
+         df = pd.json_normalize(products)
+
+         # Save the DataFrame to a CSV file
+         filename = self._results[-1].filename
+         df.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
+         logger.info(f"Results saved to {filename}")
+
+     async def run(
+         self,
+         search_term: str,
+         search_engines: List[SearchEngineName],
+         language: Language,
+         location: Location,
+         deepness: Deepness,
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+         previously_collected_urls: List[str] | None = None,
+     ) -> None:
+         """Runs the pipeline steps: search, deduplication, context extraction, processing, and result collection.
+
+         Args:
+             search_term: The search term for the query.
+             search_engines: The list of search engines to use for the search query.
+             language: The language to use for the query.
+             location: The location to use for the query.
+             deepness: The search depth and enrichment details.
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+             previously_collected_urls: The URLs that were collected previously and are ignored.
+         """
+         # Handle results files
+         timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
+         filename = self._results_dir / self._FILENAME_TEMPLATE.format(
+             search_term=search_term,
+             language=language.code,
+             location=location.code,
+             timestamp=timestamp,
+         )
+         self._results.append(Results(search_term=search_term, filename=filename))
+
+         # Run the pipeline by calling the orchestrator's run method
+         await super().run(
+             search_term=search_term,
+             search_engines=search_engines,
+             language=language,
+             location=location,
+             deepness=deepness,
+             marketplaces=marketplaces,
+             excluded_urls=excluded_urls,
+             previously_collected_urls=previously_collected_urls,
+         )
+
+     def load_results(self, index: int = -1) -> pd.DataFrame:
+         """Loads the results from the saved .csv files.
+
+         Args:
+             index: The index of the results to load (`index=-1` are the results for the most recent run).
+         """
+
+         results = self._results[index]
+         if (filename := results.filename) is None:
+             raise ValueError("filename not found (is None)")
+
+         return pd.read_csv(filename)
+
+     def print_available_results(self) -> None:
+         """Prints the available results."""
+         n_res = len(self._results)
+         for i, res in enumerate(self._results):
+             print(f"index={-n_res + i}: {res.search_term} - {res.filename}")