fraudcrawler-0.7.21-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


+++ fraudcrawler/scraping/url.py
@@ -0,0 +1,96 @@
+ import logging
+ from typing import List, Set, Tuple
+ from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
+
+ from fraudcrawler.settings import KNOWN_TRACKERS
+ from fraudcrawler.base.base import ProductItem
+
+ logger = logging.getLogger(__name__)
+
+
+ class URLCollector:
+     """A class to collect and de-duplicate URLs."""
+
+     def __init__(self):
+         self._collected_currently: Set[str] = set()
+         self._collected_previously: Set[str] = set()
+
+     def add_previously_collected_urls(self, urls: List[str]) -> None:
+         """Adds a list of previously collected URLs to the internal state.
+
+         Args:
+             urls: A list of URLs that have been collected in previous runs.
+         """
+         self._collected_previously.update(urls)
+
+     @staticmethod
+     def _remove_tracking_parameters(url: str) -> str:
+         """Removes tracking parameters from a URL.
+
+         Args:
+             url: The URL to clean.
+
+         Returns:
+             The cleaned URL without tracking parameters.
+         """
+         logger.debug(f"Removing tracking parameters from URL: {url}")
+
+         # Parse the URL
+         parsed_url = urlparse(url)
+
+         # Parse query parameters
+         queries: List[Tuple[str, str]] = parse_qsl(
+             parsed_url.query, keep_blank_values=True
+         )
+         remove_all = url.startswith(
+             "https://www.ebay"
+         )  # eBay URLs have all query parameters as tracking parameters
+         if remove_all:
+             filtered_queries = []
+         else:
+             filtered_queries = [
+                 q
+                 for q in queries
+                 if not any(q[0].startswith(tracker) for tracker in KNOWN_TRACKERS)
+             ]
+
+         # Rebuild the URL without tracking parameters
+         clean_url = ParseResult(
+             scheme=parsed_url.scheme,
+             netloc=parsed_url.netloc,
+             path=parsed_url.path,
+             params=parsed_url.params,
+             query=urlencode(filtered_queries, quote_via=quote),
+             fragment=parsed_url.fragment,
+         )
+         return urlunparse(clean_url)
+
+     async def apply(self, product: ProductItem) -> ProductItem:
+         """Manages the collection and deduplication of ProductItems.
+
+         Args:
+             product: The product item to process.
+         """
+         logger.debug(f'Processing product with url="{product.url}"')
+
+         # Remove tracking parameters from the URL
+         url = self._remove_tracking_parameters(product.url)
+         product.url = url
+
+         # Deduplicate within the current run
+         if url in self._collected_currently:
+             product.filtered = True
+             product.filtered_at_stage = "URL collection (current run deduplication)"
+             logger.debug(f"URL {url} already collected in current run")
+
+         # Deduplicate against previous runs (e.g. URLs loaded from a database)
+         elif url in self._collected_previously:
+             product.filtered = True
+             product.filtered_at_stage = "URL collection (previous run deduplication)"
+             logger.debug(f"URL {url} was already collected in previous run")
+
+         # Add to currently collected URLs
+         else:
+             self._collected_currently.add(url)
+
+         return product
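A minimal usage sketch of the `URLCollector` above. It is illustrative only: it assumes a `ProductItem` can be constructed from a URL alone, whereas the real model in `fraudcrawler.base.base` may require additional fields, and it relies on `utm_source` being listed in `KNOWN_TRACKERS` (see settings.py below).

```python
import asyncio

from fraudcrawler.base.base import ProductItem
from fraudcrawler.scraping.url import URLCollector


async def main():
    collector = URLCollector()
    collector.add_previously_collected_urls(["https://shop.example/item?id=42"])

    # "utm_source" is a known tracker, so both URLs collapse to the same clean URL
    first = await collector.apply(ProductItem(url="https://shop.example/item?id=1&utm_source=news"))
    second = await collector.apply(ProductItem(url="https://shop.example/item?id=1"))

    print(first.url)        # https://shop.example/item?id=1
    print(second.filtered)  # True - deduplicated within the current run


asyncio.run(main())
```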
+++ fraudcrawler/scraping/zyte.py
@@ -0,0 +1,287 @@
+ from base64 import b64decode
+ import logging
+ from typing import List
+
+ from bs4 import BeautifulSoup
+ import httpx
+ from tenacity import RetryCallState
+
+ from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
+ from fraudcrawler.base.base import DomainUtils, ProductItem
+ from fraudcrawler.base.retry import get_async_retry
+
+ logger = logging.getLogger(__name__)
+
+
+ class ZyteAPI(DomainUtils):
+     """A client to interact with the Zyte API for fetching product details."""
+
+     _endpoint = "https://api.zyte.com/v1/extract"
+     _config = {
+         "javascript": False,
+         "browserHtml": False,
+         "screenshot": False,
+         "productOptions": {"extractFrom": "httpResponseBody"},
+         "httpResponseBody": True,
+         "geolocation": "CH",
+         "viewport": {"width": 1280, "height": 1080},
+         "product": True,
+         # "actions": [],
+     }
+
+     def __init__(
+         self,
+         http_client: httpx.AsyncClient,
+         api_key: str,
+     ):
+         """Initializes the ZyteAPI client with the given HTTP client and API key.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             api_key: The API key for the Zyte API.
+         """
+         self._http_client = http_client
+         self._api_key = api_key
+
+     def _log_before(self, url: str, retry_state: RetryCallState | None) -> None:
+         """Context-aware logging before the request is made."""
+         if retry_state:
+             logger.debug(
+                 f"Zyte fetching product details for URL {url} (Attempt {retry_state.attempt_number})."
+             )
+         else:
+             logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+     def _log_before_sleep(self, url: str, retry_state: RetryCallState | None) -> None:
+         """Context-aware logging before sleeping after a failed request."""
+         if retry_state and retry_state.outcome:
+             logger.warning(
+                 f'Attempt {retry_state.attempt_number} of Zyte fetching product details for URL "{url}" failed. '
+                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+             )
+         else:
+             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
+     @staticmethod
+     def _extract_product_name(details: dict) -> str | None:
+         """Extracts the product name from the product data.
+
+         The input argument is a dictionary with the following structure:
+             {
+                 "product": {
+                     "name": str,
+                 }
+             }
+         """
+         return details.get("product", {}).get("name")
+
+     @staticmethod
+     def _extract_url_resolved(details: dict) -> str | None:
+         """Extracts the resolved URL (resolved automatically by Zyte) from the product data.
+
+         The input argument is a dictionary with the following structure:
+             {
+                 "product": {
+                     "url": str,
+                 }
+             }
+         """
+         return details.get("product", {}).get("url")
+
+     @staticmethod
+     def _extract_product_price(details: dict) -> str | None:
+         """Extracts the product price from the product data.
+
+         The input argument is a dictionary with the following structure:
+             {
+                 "product": {
+                     "price": str,
+                 }
+             }
+         """
+         return details.get("product", {}).get("price")
+
+     @staticmethod
+     def _extract_product_description(details: dict) -> str | None:
+         """Extracts the product description from the product data.
+
+         The input argument is a dictionary with the following structure:
+             {
+                 "product": {
+                     "description": str,
+                 }
+             }
+         """
+         return details.get("product", {}).get("description")
+
+     @staticmethod
+     def _extract_image_urls(details: dict) -> List[str]:
+         """Extracts the image URLs from the product data.
+
+         The input argument is a dictionary with the following structure:
+             {
+                 "product": {
+                     "mainImage": {"url": str},
+                     "images": [{"url": str}],
+                 }
+             }
+         """
+         images = []
+         product = details.get("product")
+         if product:
+             # Extract main image URL
+             if (main_img := product.get("mainImage")) and (url := main_img.get("url")):
+                 images.append(url)
+             # Extract additional image URLs
+             if urls := product.get("images"):
+                 images.extend([img["url"] for img in urls if img.get("url")])
+         return images
+
+     @staticmethod
+     def _extract_probability(details: dict) -> float:
+         """Extracts the probability from the product data.
+
+         The input argument is a dictionary with the following structure:
+             {
+                 "product": {
+                     "metadata": {
+                         "probability": float,
+                     }
+                 }
+             }
+         """
+         return float(
+             details.get("product", {}).get("metadata", {}).get("probability", 0.0)
+         )
+
+     @staticmethod
+     def _extract_html(details: dict) -> str | None:
+         """Extracts the HTML from the Zyte API response.
+
+         The input argument is a dictionary with the following structure:
+             {
+                 "httpResponseBody": base64
+             }
+         """
+         # Get the Base64-encoded content
+         encoded = details.get("httpResponseBody")
+
+         # Decode it into bytes
+         if isinstance(encoded, str):
+             decoded_bytes = b64decode(encoded)
+
+             # Convert bytes to string
+             try:
+                 decoded_string = decoded_bytes.decode("utf-8")
+             except UnicodeDecodeError:
+                 decoded_string = decoded_bytes.decode("iso-8859-1")
+             return decoded_string
+         return None
+
+     def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+         """Enriches the product item with the details extracted from the Zyte API response.
+
+         Args:
+             product: The product item to enrich.
+             details: The product details returned by the Zyte API.
+         """
+         product.product_name = self._extract_product_name(details=details)
+
+         url_resolved = self._extract_url_resolved(details=details)
+         if url_resolved:
+             product.url_resolved = url_resolved
+
+         # If the resolved URL differs from the original URL, the domain must be updated as well,
+         # otherwise the unresolved domain would be shown
+         # (e.g. unresolved domain "toppreise.ch" vs. resolved domain "digitec.ch").
+         if url_resolved and url_resolved != product.url:
+             logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+             product.domain = self._get_domain(url=url_resolved)
+
+         product.product_price = self._extract_product_price(details=details)
+         product.product_description = self._extract_product_description(details=details)
+         product.product_images = self._extract_image_urls(details=details)
+         product.probability = self._extract_probability(details=details)
+         product.html = self._extract_html(details=details)
+         if product.html:
+             soup = BeautifulSoup(product.html, "html.parser")
+             product.html_clean = soup.get_text(separator=" ", strip=True)
+
+         return product
+
+     @staticmethod
+     def keep_product(
+         details: dict,
+         threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+     ) -> bool:
+         """Determines whether to keep the product based on the probability threshold.
+
+         Args:
+             details: A product details data dictionary.
+             threshold: The probability threshold used to filter the products.
+         """
+         try:
+             prob = float(details["product"]["metadata"]["probability"])
+         except KeyError:
+             logger.warning(
+                 f"Product with url={details.get('url')} has no probability value - product is ignored"
+             )
+             return False
+         return prob > threshold
+
+     async def unblock_url_content(self, url: str) -> bytes:
+         """Unblocks the content of a URL using Zyte proxy mode.
+
+         Args:
+             url: The URL to fetch using Zyte proxy mode.
+         """
+         logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
+         details = await self.details(url)
+
+         if not details or "httpResponseBody" not in details:
+             raise httpx.HTTPError("No httpResponseBody in Zyte response")
+
+         return b64decode(details["httpResponseBody"])
+
+     async def details(self, url: str) -> dict:
+         """Fetches product details for a single URL.
+
+         Args:
+             url: The URL to fetch product details from.
+
+         Returns:
+             A dictionary containing the product details; fields include
+             (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product):
+             {
+                 "url": str,
+                 "statusCode": str,
+                 "product": {
+                     "name": str,
+                     "price": str,
+                     "mainImage": {"url": str},
+                     "images": [{"url": str}],
+                     "description": str,
+                     "metadata": {
+                         "probability": float,
+                     },
+                 },
+                 "httpResponseBody": base64
+             }
+         """
+         logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+         # Perform the request and retry if necessary. There is some context-aware logging:
+         # - `before`: before the request is made (and before retrying)
+         # - `before_sleep`: if the request fails, before sleeping
+         retry = get_async_retry()
+         retry.before = lambda retry_state: self._log_before(
+             url=url, retry_state=retry_state
+         )
+         retry.before_sleep = lambda retry_state: self._log_before_sleep(
+             url=url, retry_state=retry_state
+         )
+         async for attempt in retry:
+             with attempt:
+                 response = await self._http_client.post(
+                     url=self._endpoint,
+                     json={"url": url, **self._config},
+                     auth=(self._api_key, ""),  # API key as username, empty password
+                 )
+                 response.raise_for_status()
+
+         details = response.json()
+         return details
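A rough sketch of how `ZyteAPI` might be driven on its own, outside the pipeline. The `ZYTEAPI_KEY` variable name is taken from the README below; constructing `ProductItem` from a URL alone is an assumption about the model, and the URL is a placeholder.

```python
import asyncio
import os

import httpx

from fraudcrawler.base.base import ProductItem
from fraudcrawler.scraping.zyte import ZyteAPI


async def main():
    async with httpx.AsyncClient(timeout=600) as http_client:
        zyte = ZyteAPI(http_client=http_client, api_key=os.environ["ZYTEAPI_KEY"])

        url = "https://shop.example/product/123"  # placeholder URL
        details = await zyte.details(url)

        # Drop low-confidence extractions, then copy the extracted fields onto the item
        if zyte.keep_product(details):
            product = zyte.enrich_context(product=ProductItem(url=url), details=details)
            print(product.product_name, product.probability)


asyncio.run(main())
```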
+++ fraudcrawler/settings.py
@@ -0,0 +1,104 @@
+ from pathlib import Path
+ from typing import List
+
+ # Generic settings
+ ROOT_DIR = Path(__file__).parents[1]
+
+ # Service retry settings
+ # With the following setup (neglecting the jitter) we have 6 attempts with delays:
+ # 0s, 1s, 4s, 16s, 64s, 64s (because of the max delay)
+ RETRY_STOP_AFTER_ATTEMPT = 6
+ RETRY_INITIAL_DELAY = 1
+ RETRY_MAX_DELAY = 64
+ RETRY_EXP_BASE = 4
+ RETRY_JITTER = 1
+ RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes
+
+ # Search settings
+ GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
+ GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
+ SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
+     # ".com",
+ ]
+ TOPPREISE_SEARCH_PATHS = {
+     "de": "produktsuche",
+     "fr": "chercher",
+     "default": "browse",
+ }
+ TOPPREISE_COMPARISON_PATHS = [
+     "preisvergleich",
+     "comparison-prix",
+     "price-comparison",
+ ]
+
+ # URL de-duplication settings
+ KNOWN_TRACKERS = [
+     "srsltid",  # Search result click ID (used by some search engines)
+     "utm_source",  # UTM: Source of the traffic (e.g., Google, Newsletter)
+     "utm_medium",  # UTM: Medium such as CPC, email, social
+     "utm_campaign",  # UTM: Campaign name (e.g., summer_sale)
+     "utm_term",  # UTM: Keyword term (used in paid search)
+     "utm_content",  # UTM: Used to differentiate similar links or ads
+     "ar",  # Often used for ad region or targeting info
+     "ps",  # Could refer to promotion source or partner segment
+     "gclid",  # Google Ads click ID (auto-tagging)
+     "gclsrc",  # Source of the GCLID (e.g., ads, search)
+     "sku",  # Product SKU identifier, often used in ecommerce links
+     "ref",  # Referrer username or source (e.g., GitHub ref links)
+     "referral",  # Alternate form of referrer, often human-readable
+     "aff_id",  # Affiliate identifier (ID-based)
+     "aff",  # Short form for affiliate tag
+     "affiliate",  # Affiliate tracking parameter (human-readable)
+     "partner",  # Indicates marketing or distribution partner
+     "fbclid",  # Facebook Click Identifier
+     "msclkid",  # Microsoft/Bing Ads click identifier
+     "twclid",  # Twitter Ads click identifier
+     "variant",  # A/B test variant (used to test versions of pages)
+     "session_id",  # Session tracking ID, should not persist across URLs
+     "track",  # Generic flag used to enable/disable tracking
+     "cid",  # Campaign ID (used in ads or emails)
+     "campaignid",  # Alternate or long-form campaign ID
+     "adgroup",  # Ad group identifier for campaigns
+     "bannerid",  # Specific banner ad ID (for display ad tracking)
+     "token",  # Often used to identify users or temporary sessions
+     "tag",  # Affiliate or marketing tag (used for tracking)
+     "hash",  # Generic hash identifier, often for state or cache
+     "user",  # User ID or identifier passed in URL (should be avoided)
+     "src",  # Generic source indicator, less formal than `utm_source`
+     "selsort",  # Sorting parameter for search results
+     "shid",  # Shop ID (used in ecommerce)
+     "shoparea",  # Shop area (used in ecommerce)
+     "shopid",  # Shop ID (used in ecommerce)
+ ]
+
+ # Enrichment settings
+ ENRICHMENT_DEFAULT_LIMIT = 10
+
+ # Zyte settings
+ ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
+
+ # Exact match settings
+ EXACT_MATCH_PRODUCT_FIELDS = {
+     "url_resolved",
+     "product_name",
+     "product_description",
+     "html",
+ }
+ EXACT_MATCH_FIELD_SEPARATOR = "\n"
+
+ # Async workers settings
+ DEFAULT_N_SRCH_WKRS = 2
+ DEFAULT_N_CNTX_WKRS = 23
+ DEFAULT_N_PROC_WKRS = 5
+
+ # HTTPX client settings
+ DEFAULT_HTTPX_TIMEOUT = {
+     "timeout": 600,
+     "connect": 5.0,
+ }
+ DEFAULT_HTTPX_LIMITS = {
+     "max_connections": 1000,
+     "max_keepalive_connections": 100,
+ }
+ DEFAULT_HTTPX_REDIRECTS = True
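The retry constants above are presumably consumed by `fraudcrawler/base/retry.py` (not part of this diff), which appears to be built on tenacity. As a worked illustration of the schedule described in the comment, the wait before attempt n+1 grows as `RETRY_EXP_BASE ** (n - 1)`, capped at `RETRY_MAX_DELAY`:

```python
# Illustration only: the actual wait logic lives in fraudcrawler.base.retry.
from fraudcrawler.settings import (
    RETRY_EXP_BASE,
    RETRY_INITIAL_DELAY,
    RETRY_MAX_DELAY,
    RETRY_STOP_AFTER_ATTEMPT,
)

waits = [0] + [  # no wait before the very first attempt
    min(RETRY_INITIAL_DELAY * RETRY_EXP_BASE ** (n - 1), RETRY_MAX_DELAY)
    for n in range(1, RETRY_STOP_AFTER_ATTEMPT)
]
print(waits)  # [0, 1, 4, 16, 64, 64]
```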
+++ fraudcrawler-0.7.21.dist-info/METADATA
@@ -0,0 +1,175 @@
+ Metadata-Version: 2.4
+ Name: fraudcrawler
+ Version: 0.7.21
+ Summary: Intelligent Market Monitoring
+ License: MIT
+ License-File: LICENSE
+ Author: Domingo Bertus
+ Author-email: hello@veanu.ch
+ Requires-Python: >=3.11,<4.0
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
+ Requires-Dist: httpx (>=0.28.1,<0.29.0)
+ Requires-Dist: openai (>=1.68.2,<2.0.0)
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
+ Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
+ Requires-Dist: requests (>=2.32.3,<3.0.0)
+ Requires-Dist: tenacity (>=9.1.2,<10.0.0)
+ Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
+ Description-Content-Type: text/markdown
+
+ # fraudcrawler
+
+ ![CI Status](https://github.com/open-veanu/fraudcrawler/workflows/CI/badge.svg)
+ ![Python Version](https://img.shields.io/badge/python-3.11+-blue.svg)
+ ![License](https://img.shields.io/badge/license-MIT-green.svg)
+ ![PyPI](https://img.shields.io/pypi/v/fraudcrawler.svg)
+
+ Fraudcrawler is an intelligent **market monitoring** tool that searches the web for products, extracts product details, and classifies them using LLMs. It combines search APIs, web scraping, and AI to automate product discovery and relevance assessment.
+
+ ## Features
+
+ - **Asynchronous pipeline** - Products move through search, extraction, and classification stages independently
+ - **Multiple search engines** - Google Search, Google Shopping, and more...
+ - **Search term enrichment** - Automatically find related terms and expand your search
+ - **Product extraction** - Get structured product data via Zyte API
+ - **LLM classification** - Assess product relevance using OpenAI API with custom prompts
+ - **Marketplace filtering** - Focus searches on specific domains
+ - **Deduplication** - Avoid reprocessing previously collected URLs
+ - **CSV export** - Results saved with timestamps for easy tracking
+
+ ## Prerequisites
+
+ - Python 3.11 or higher
+ - API keys for:
+   - **SerpAPI** - Google search results
+   - **Zyte API** - Product data extraction
+   - **OpenAI API** - Product classification
+   - **DataForSEO** (optional) - Search term enrichment
+
+ ## Installation
+
+ ```bash
+ python3.11 -m venv .venv
+ source .venv/bin/activate
+ pip install fraudcrawler
+ ```
+
+ **Using Poetry:**
+ ```bash
+ poetry install
+ ```
+
+ ## Configuration
+
+ Create a `.env` file with your API credentials (see `.env.example` for a template):
+
+ ```bash
+ SERPAPI_KEY=your_serpapi_key
+ ZYTEAPI_KEY=your_zyte_key
+ OPENAIAPI_KEY=your_openai_key
+ DATAFORSEO_USER=your_user # optional
+ DATAFORSEO_PWD=your_pwd # optional
+ ```
+
+ ## Usage
+
+ ### Basic Configuration
+ For a complete working example, see `fraudcrawler/launch_demo_pipeline.py`. After setting up the necessary parameters, you can launch the pipeline and analyse the results with:
+ ```python
+ # Run pipeline
+ await client.run(
+     search_term=search_term,
+     search_engines=search_engines,
+     language=language,
+     location=location,
+     deepness=deepness,
+     excluded_urls=excluded_urls,
+ )
+
+ # Load results
+ df = client.load_results()
+ print(df.head())
+ ```
+
+ ### Advanced Configuration
+
+ **Search term enrichment** - Find and search related terms:
+ ```python
+ from fraudcrawler import Enrichment
+
+ deepness.enrichment = Enrichment(
+     additional_terms=5,
+     additional_urls_per_term=10
+ )
+ ```
+
+ **Marketplace filtering** - Focus on specific domains:
+ ```python
+ from fraudcrawler import Host
+
+ marketplaces = [
+     Host(name="International", domains="zavamed.com,apomeds.com"),
+     Host(name="National", domains="netdoktor.ch,nobelpharma.ch"),
+ ]
+
+ await client.run(..., marketplaces=marketplaces)
+ ```
+
+ **Exclude domains** - Exclude specific domains from your results:
+ ```python
+ excluded_urls = [
+     Host(name="Compendium", domains="compendium.ch"),
+ ]
+
+ await client.run(..., excluded_urls=excluded_urls)
+ ```
+
+ **Skip previously collected URLs**:
+ ```python
+ previously_collected_urls = [
+     "https://example.com/product1",
+     "https://example.com/product2",
+ ]
+
+ await client.run(..., previously_collected_urls=previously_collected_urls)
+ ```
+
+ **View all results** from a client instance:
+ ```python
+ client.print_available_results()
+ ```
+
+ ## Output
+
+ Results are saved as CSV files in `data/results/` with the naming pattern:
+
+ ```
+ <search_term>_<language_code>_<location_code>_<timestamp>.csv
+ ```
+
+ Example: `sildenafil_de_ch_20250115143022.csv`
+
+ The CSV includes product details, URLs, and classification scores from your workflows.
+
+ ## Development
+
+ For detailed contribution guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md).
+
+ ## License
+
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+ ## Architecture
+
+ Fraudcrawler uses an asynchronous pipeline where products can be at different processing stages simultaneously. Product A might be in classification while Product B is still being scraped. This is enabled by async workers for each stage (Search, Context Extraction, Processing) using `httpx.AsyncClient`.
+
+ ![Async Setup](https://github.com/open-veanu/fraudcrawler/raw/master/docs/assets/images/Fraudcrawler_Async_Setup.svg)
+
+ For more details on the async design, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
+
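As a hedged sketch of the staged worker pattern the Architecture section describes: the real orchestration lives in `fraudcrawler/base/orchestrator.py` (not shown in this diff) and will differ in detail, so the queue names, worker counts, and handler coroutines below are illustrative only.

```python
# Minimal queue-based pipeline: each stage has its own async workers, so items
# can sit in different stages at the same time.
import asyncio


async def worker(inbox: asyncio.Queue, outbox: asyncio.Queue | None, handle):
    """Pull items from `inbox`, process them, and push results downstream."""
    while True:
        item = await inbox.get()
        try:
            result = await handle(item)
            if outbox is not None:
                await outbox.put(result)
        finally:
            inbox.task_done()


async def run_pipeline(search_terms, search, extract, classify, n_extract_workers=5):
    q_search, q_extract, q_process = asyncio.Queue(), asyncio.Queue(), asyncio.Queue()
    workers = [
        asyncio.create_task(worker(q_search, q_extract, search)),
        *(
            asyncio.create_task(worker(q_extract, q_process, extract))
            for _ in range(n_extract_workers)
        ),
        asyncio.create_task(worker(q_process, None, classify)),
    ]
    for term in search_terms:
        await q_search.put(term)
    # Wait until every stage has drained its queue, then stop the workers
    for q in (q_search, q_extract, q_process):
        await q.join()
    for w in workers:
        w.cancel()
```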
+++ fraudcrawler-0.7.21.dist-info/RECORD
@@ -0,0 +1,23 @@
+ fraudcrawler/__init__.py,sha256=4Xzhj6aS9zmjs8KZS9nhFg9YAWOpCX-TAtJ4s32A5Jk,1191
+ fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/base/base.py,sha256=dcXaMVvaCz2alA7rcjQUYKFgclpNPOnanBW-AV5XJGk,6785
+ fraudcrawler/base/client.py,sha256=uPaC1uMpHlMR1ThcTkzsoliwmGuTGH3jyTHLyVNYSHk,5608
+ fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+ fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+ fraudcrawler/base/orchestrator.py,sha256=BklS4DNzxbp7yvE2NvBWrDDqnvT4YO7Xh_WXstYNWYA,26050
+ fraudcrawler/base/retry.py,sha256=bCDd44XO2-lHO8MGvPblD5152-lHt1dOfMAQSmymLO4,1462
+ fraudcrawler/launch_demo_pipeline.py,sha256=oZWodtNzA5mhmLNYMS6lglry88NutvH4IxnEWOUtL8M,6179
+ fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/processing/base.py,sha256=UkoYxFNZ3BQkXmgJnTtruz8-eIFCtWiquRN_IoEXfM4,4091
+ fraudcrawler/processing/openai.py,sha256=7sbFg2NPsn627VDzsfIkKantE2KahGmVkSZ1R10OrzQ,19050
+ fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fraudcrawler/scraping/enrich.py,sha256=dGWi9p0JStQYSGscCnsQPHNlAeqjoL2rXZnHFNmPhaQ,13158
+ fraudcrawler/scraping/search.py,sha256=qHeUpzv1IpRhdFvaycGtL3FLOwT8rOiF0PfiOH6BmUA,34561
+ fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
+ fraudcrawler/scraping/zyte.py,sha256=B9PfmF9hsHQVhRcW0MVJYUIUZzXXPcPYQTyJmUpgeQ8,10317
+ fraudcrawler/settings.py,sha256=q3je0r_jd30x2dzlgfm8GyKcigFdgteOLa8HX188bho,3768
+ fraudcrawler-0.7.21.dist-info/METADATA,sha256=_YD6sCGCLTQ6wpyqtfpiZ2CM-s5mMxsZed12eZwzkcU,5373
+ fraudcrawler-0.7.21.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ fraudcrawler-0.7.21.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+ fraudcrawler-0.7.21.dist-info/licenses/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+ fraudcrawler-0.7.21.dist-info/RECORD,,
+++ fraudcrawler-0.7.21.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: poetry-core 2.2.1
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+++ fraudcrawler-0.7.21.dist-info/entry_points.txt
@@ -0,0 +1,3 @@
+ [console_scripts]
+ launch_demo_pipeline=fraudcrawler.launch_demo_pipeline:main
+
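The console-script entry point above suggests that, once the package is installed and the API keys are configured as described in the README, the demo pipeline can presumably be launched directly from the shell:

```bash
pip install fraudcrawler
launch_demo_pipeline
```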