fraudcrawler 0.5.9__py3-none-any.whl → 0.6.0__py3-none-any.whl
- fraudcrawler/__init__.py +2 -2
- fraudcrawler/base/base.py +3 -32
- fraudcrawler/base/client.py +1 -1
- fraudcrawler/base/orchestrator.py +135 -135
- fraudcrawler/base/retry.py +12 -6
- fraudcrawler/processing/processor.py +3 -3
- fraudcrawler/scraping/search.py +274 -69
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +15 -1
- fraudcrawler/settings.py +13 -3
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/METADATA +4 -3
- fraudcrawler-0.6.0.dist-info/RECORD +22 -0
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/WHEEL +1 -1
- fraudcrawler-0.5.9.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/entry_points.txt +0 -0
fraudcrawler/processing/processor.py
CHANGED

@@ -72,7 +72,7 @@ class Processor:
         """Context aware logging before the request is made."""
         if retry_state:
             logger.debug(
-                f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
+                f"Classifying product with url={url} using prompt={prompt.name} (Attempt {retry_state.attempt_number})."
             )
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before.")
@@ -84,7 +84,7 @@ class Processor:
         """Context aware logging before sleeping after a failed request."""
         if retry_state and retry_state.outcome:
             logger.warning(
-                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
+                f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt.name} "
                 f"failed with error: {retry_state.outcome.exception()}. "
                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
             )
@@ -160,7 +160,7 @@ class Processor:
         # Call the OpenAI API
         try:
             logger.debug(
-                f"Classifying product with url={url} using prompt={prompt.name}."
+                f"Classifying product with url={url}, using prompt={prompt.name}."
             )
             # Perform the request and retry if necessary. There is some context aware logging
             # - `before`: before the request is made (or before retrying)
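Aside: the `prompt.name` fixes above land inside tenacity's context-aware logging hooks. A minimal sketch of that hook pattern, assuming `get_async_retry` builds an `AsyncRetrying` roughly like this (its actual defaults live in fraudcrawler/base/retry.py and are not part of these hunks):

    import httpx
    from tenacity import AsyncRetrying, RetryCallState, stop_after_attempt, wait_exponential

    def log_before(url: str, retry_state: RetryCallState | None) -> None:
        # Context-aware logging per attempt, as in the hunks above.
        if retry_state:
            print(f"Requesting {url} (attempt {retry_state.attempt_number}).")

    async def fetch(client: httpx.AsyncClient, url: str) -> bytes:
        retry = AsyncRetrying(stop=stop_after_attempt(3), wait=wait_exponential(exp_base=4))
        retry.before = lambda rs: log_before(url, rs)  # late-bind the url into the hook
        async for attempt in retry:
            with attempt:
                response = await client.get(url)
                response.raise_for_status()
        return response.content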
fraudcrawler/scraping/search.py
CHANGED
@@ -6,12 +6,18 @@ from typing import Dict, List
 from urllib.parse import quote_plus
 
 from bs4 import BeautifulSoup
+from bs4.element import Tag
 import httpx
-from tenacity import RetryCallState
+from tenacity import RetryCallState, AsyncRetrying
 
-from fraudcrawler.settings import
+from fraudcrawler.settings import (
+    SEARCH_DEFAULT_COUNTRY_CODES,
+    TOPPREISE_SEARCH_PATHS,
+    TOPPREISE_COMPARISON_PATHS,
+)
 from fraudcrawler.base.base import Host, Language, Location, DomainUtils
 from fraudcrawler.base.retry import get_async_retry
+from fraudcrawler.scraping.zyte import ZyteAPI
 
 logger = logging.getLogger(__name__)
 
@@ -380,7 +386,7 @@ class SerpAPIGoogleShopping(SerpAPI):
 class Toppreise(SearchEngine):
     """Search engine for toppreise.ch."""
 
-    _endpoint = "https://www.toppreise.ch/
+    _endpoint = "https://www.toppreise.ch/"
     _headers = {
         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
         "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
@@ -390,28 +396,42 @@
         "Upgrade-Insecure-Requests": "1",
     }
 
-    def __init__(self, http_client: httpx.AsyncClient,
+    def __init__(self, http_client: httpx.AsyncClient, zyteapi_key: str):
         """Initializes the Toppreise client.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
-
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
         self._http_client = http_client
-        self.
+        self._zyteapi = ZyteAPI(http_client=http_client, api_key=zyteapi_key)
 
-    @
-    def
-    """
-
+    @classmethod
+    def _get_search_endpoint(cls, language: Language) -> str:
+        """Get the search endpoint based on the language."""
+        search_path = TOPPREISE_SEARCH_PATHS.get(
+            language.code, TOPPREISE_SEARCH_PATHS["default"]
+        )
+        return f"{cls._endpoint}{search_path}"
 
     @staticmethod
-    def
-
+    def _extract_links(
+        element: Tag, ext_products: bool = True, comp_products: bool = True
+    ) -> List[str]:
+        """Extracts all relevant product URLs from a BeautifulSoup object of a Toppreise page.
 
-
-
-
+        Note:
+            Depending on the arguments, it extracts:
+            - product comparison URLs (i.e. https://www.toppreise.ch/preisvergleich/...)
+            - external product URLs (i.e. https://www.example.com/ext_...).
+
+        Args:
+            tag: BeautifulSoup Tag object containing the HTML to parse.
+            ext_products: Whether to extract external product URLs.
+            comp_products: Whether to extract product comparison URLs.
+        """
+        # Find all links in the page
+        links = element.find_all("a", href=True)
 
         # Filter links to only include external product links
         hrefs = [
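For illustration, the language-to-endpoint resolution above is a plain dict lookup with a fallback; a self-contained sketch using the values added to settings.py further down:

    TOPPREISE_SEARCH_PATHS = {"de": "produktsuche", "fr": "chercher", "default": "browse"}

    def resolve(language_code: str) -> str:
        # Same lookup as _get_search_endpoint above.
        path = TOPPREISE_SEARCH_PATHS.get(language_code, TOPPREISE_SEARCH_PATHS["default"])
        return f"https://www.toppreise.ch/{path}"

    assert resolve("de") == "https://www.toppreise.ch/produktsuche"
    assert resolve("it") == "https://www.toppreise.ch/browse"  # unknown code -> default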
@@ -422,7 +442,15 @@
             and (href := link.get("href"))  # Ensure href is not None
             and not href.startswith("javascript:")  # Skip javascript links
             and isinstance(href, str)  # Ensure href is a string
-
+            # Make sure the link is either an external product link (href contains 'ext_')
+            # or is a search result link (href contains 'preisvergleich', 'comparison-prix', or 'price-comparison')
+            and (
+                ("ext_" in href and ext_products)
+                or (
+                    any(pth in href for pth in TOPPREISE_COMPARISON_PATHS)
+                    and comp_products
+                )
+            )
         )
     ]
@@ -437,21 +465,100 @@
 
         # Return deduplicated urls
         urls = list(set(urls))
+        return urls
+
+    def _extract_product_urls_from_search_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise search page (i.e. https://www.toppreise.ch/produktsuche)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+        main = soup.find("div", id="Page_Browsing")
+        if not isinstance(main, Tag):
+            logger.warning("No main content found in Toppreise search page.")
+            return []
+
+        # Extract links (external product links and comparison links)
+        urls = self._extract_links(element=main)
+
+        logger.debug(f"Found {len(urls)} product URLs from Toppreise search results.")
+        return urls
+
+    def _extract_product_urls_from_comparison_page(self, content: bytes) -> List[str]:
+        """Extracts product urls from a Toppreise product comparison page (i.e. https://www.toppreise.ch/preisvergleich/...)."""
+
+        # Parse the HTML
+        soup = BeautifulSoup(content, "html.parser")
+
+        # Extract links (external product links only)
+        urls = self._extract_links(element=soup, comp_products=False)
+
         logger.debug(
-            f"Found {len(urls)} external product URLs from Toppreise
+            f"Found {len(urls)} external product URLs from Toppreise comparison page."
         )
         return urls
 
-
+    @property
+    def _search_engine_name(self) -> str:
+        """The name of the search engine."""
+        return SearchEngineName.TOPPREISE.value
+
+    async def http_client_get_with_fallback(
+        self, url: str, retry: AsyncRetrying
+    ) -> bytes:
+        """Performs a GET request with retries.
+
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
+        Args:
+            url: The URL to request.
+            retry: The retry strategy to use.
+        """
+        # Try to access the URL directly
+        try:
+            async for attempt in retry:
+                with attempt:
+                    response = await self._http_client.get(
+                        url=url,
+                        headers=self._headers,
+                    )
+                    response.raise_for_status()
+                    content = response.content
+
+        # If we get a 403 Error (can happen depending on IP/location of deployment),
+        # we try to unblock the URL using Zyte proxy mode
+        except httpx.HTTPStatusError as err_direct:
+            if err_direct.response.status_code == 403:
+                logger.warning(
+                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
+                )
+                try:
+                    content = await self._zyteapi.unblock_url_content(url)
+                except Exception as err_resolve:
+                    msg = f'Error unblocking URL="{url}" with Zyte proxy: {err_resolve}'
+                    logger.error(msg)
+                    raise httpx.HTTPError(msg) from err_resolve
+            else:
+                raise err_direct
+        return content
+
+    async def _search(
+        self, search_string: str, language: Language, num_results: int
+    ) -> List[str]:
         """Performs a search on Toppreise and returns the URLs of the results.
 
+        If direct access fails (e.g. 403 Forbidden), it will attempt to unblock the URL
+        content using Zyte proxy mode.
+
         Args:
             search_string: The search string to use for the query.
+            language: The language to use for the query.
             num_results: Max number of results to return.
         """
         # Build the search URL for Toppreise
+        endpoint = self._get_search_endpoint(language=language)
         encoded_search = quote_plus(search_string)
-        url = f"{
+        url = f"{endpoint}?q={encoded_search}"
         logger.debug(f"Toppreise search URL: {url}")
 
         # Perform the request and retry if necessary. There is some context aware logging:
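The essence of the new fallback is "direct GET first, Zyte proxy only on 403". A stripped-down sketch (not the shipped code; `unblock` stands in for `ZyteAPI.unblock_url_content`, and retries are omitted):

    import httpx

    async def get_with_fallback(client: httpx.AsyncClient, url: str, unblock) -> bytes:
        try:
            response = await client.get(url)
            response.raise_for_status()
            return response.content
        except httpx.HTTPStatusError as err:
            if err.response.status_code == 403:
                # Blocked by IP/location of the deployment: fetch via the proxy instead.
                return await unblock(url)
            raise  # any other status propagates unchanged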
@@ -464,33 +571,10 @@
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
             search_string=search_string, retry_state=retry_state
         )
-
-        content = None
-        try:
-            async for attempt in retry:
-                with attempt:
-                    response = await self._http_client.get(
-                        url=url,
-                        headers=self._headers,
-                    )
-                    response.raise_for_status()
-                    content = response.content
-        except httpx.HTTPStatusError as e:
-            if e.response.status_code == 403 and self._zyte_api:
-                logger.warning(
-                    f"Received 403 Forbidden for {url}, attempting to unblock with Zyte proxy"
-                )
-                content = await self._unblock_url(url, self._zyte_api)
-                if content is None:
-                    raise e  # Re-raise if zyte fallback also failed
-            else:
-                raise e
-
-        if content is None:
-            raise httpx.HTTPError("Failed to fetch content")
+        content = await self.http_client_get_with_fallback(url=url, retry=retry)
 
         # Get external product urls from the content
-        urls = self.
+        urls = self._extract_product_urls_from_search_page(content=content)
         urls = urls[:num_results]  # Limit to num_results if needed
 
         return urls
@@ -498,17 +582,20 @@
     async def search(
         self,
         search_term: str,
+        language: Language,
         num_results: int,
     ) -> List[SearchResult]:
         """Performs a Toppreise search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            language: The language to use for the search.
             num_results: Max number of results to return.
         """
         # Perform the search
         urls = await self._search(
             search_string=search_term,
+            language=language,
             num_results=num_results,
         )
 
@@ -520,22 +607,124 @@
         return results
 
 
-class
+class Searcher(DomainUtils):
     """Class to perform searches using different search engines."""
 
-
+    _post_search_retry_stop_after = 3
+
+    def __init__(
+        self, http_client: httpx.AsyncClient, serpapi_key: str, zyteapi_key: str
+    ):
         """Initializes the Search class with the given SerpAPI key.
 
         Args:
             http_client: An httpx.AsyncClient to use for the async requests.
             serpapi_key: The API key for SERP API.
-
+            zyteapi_key: ZyteAPI key for fallback when direct access fails.
         """
+        self._http_client = http_client
         self._google = SerpAPIGoogle(http_client=http_client, api_key=serpapi_key)
         self._google_shopping = SerpAPIGoogleShopping(
-            http_client=http_client,
+            http_client=http_client,
+            api_key=serpapi_key,
+        )
+        self._toppreise = Toppreise(
+            http_client=http_client,
+            zyteapi_key=zyteapi_key,
         )
-
+
+    @staticmethod
+    def _post_search_log_before(url: str, retry_state: RetryCallState | None) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f'Performing post search for url="{url}" '
+                f"(attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    @staticmethod
+    def _post_search_log_before_sleep(
+        url: str, retry_state: RetryCallState | None
+    ) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f'Attempt {retry_state.attempt_number} of post search for url="{url}" '
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
+
+    async def _post_search_toppreise_comparison(self, url: str) -> List[str]:
+        """Post-search for product URLs from a Toppreise product comparison page.
+
+        Note:
+            In comparison to the function Toppreise._search, here we extract the urls from
+            product comparison pages (f.e. https://www.toppreise.ch/preisvergleich/). They can
+            also be found in the results of a google search.
+
+        Args:
+            url: The URL of the Toppreise product listing page.
+        """
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry(stop_after=self._post_search_retry_stop_after)
+        retry.before = lambda retry_state: self._post_search_log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._post_search_log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        content = await self._toppreise.http_client_get_with_fallback(
+            url=url, retry=retry
+        )
+
+        # Get external product urls from the content
+        urls = self._toppreise._extract_product_urls_from_comparison_page(
+            content=content
+        )
+
+        return urls
+
+    async def _post_search(self, results: List[SearchResult]) -> List[SearchResult]:
+        """Post-search for additional embedded product URLs from the obtained results.
+
+        Note:
+            This function is used to extract embedded product URLs from
+            product listing pages (e.g. Toppreise, Google Shopping) if needed.
+
+        Args:
+            results: The list of SearchResult objects obtained from the search.
+        """
+        post_search_results: List[SearchResult] = []
+        for res in results:
+            url = res.url
+
+            # Extract embedded product URLs from the Toppreise product listing page
+            if any(pth in url for pth in TOPPREISE_COMPARISON_PATHS):
+                logger.debug(
+                    f'Extracting embedded product URLs from url="{url}" found by search_engine="{res.search_engine_name}"'
+                )
+                post_search_urls = await self._post_search_toppreise_comparison(url=url)
+                logger.debug(
+                    f'Extracted {len(post_search_urls)} embedded product URLs from url="{url}".'
+                )
+
+                psr = [
+                    SearchResult(
+                        url=psu,
+                        domain=self._get_domain(url=psu),
+                        search_engine_name=res.search_engine_name,
+                    )
+                    for psu in post_search_urls
+                ]
+                post_search_results.extend(psr)
+
+        return post_search_results
 
     @staticmethod
     def _domain_in_host(domain: str, host: Host) -> bool:
@@ -625,63 +814,77 @@ class Search(DomainUtils):
     async def apply(
         self,
         search_term: str,
+        search_engine: SearchEngineName | str,
         language: Language,
         location: Location,
         num_results: int,
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
-        search_engines: List[SearchEngineName | str] | None = None,
     ) -> List[SearchResult]:
         """Performs a search and returns SearchResults.
 
         Args:
             search_term: The search term to use for the query.
+            search_engine: The search engine to use for the search.
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
             num_results: Max number of results per search engine.
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
-            search_engines: The list of search engines to use for the search.
         """
-
-
-
-
-
-
-
-
+        logger.info(
+            f'Performing search for term="{search_term}" using engine="{search_engine}".'
+        )
+
+        # -------------------------------
+        # SEARCH
+        # -------------------------------
+        # Map string to SearchEngineName if needed
+        if isinstance(search_engine, str):
+            search_engine = SearchEngineName(search_engine)
 
         # Make SerpAPI google search
-        if SearchEngineName.GOOGLE
-
+        if search_engine == SearchEngineName.GOOGLE:
+            results = await self._google.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make SerpAPI google shopping search
-
-
+        elif search_engine == SearchEngineName.GOOGLE_SHOPPING:
+            results = await self._google_shopping.search(
                 search_term=search_term,
                 language=language,
                 location=location,
                 num_results=num_results,
                 marketplaces=marketplaces,
             )
-            results.extend(res)
 
         # Make Toppreise search
-
-
+        elif search_engine == SearchEngineName.TOPPREISE:
+            results = await self._toppreise.search(
                 search_term=search_term,
+                language=language,
                 num_results=num_results,
             )
-            results.extend(res)
 
+        # Other search engines can be added here (raise unknown engine error otherwise)
+        else:
+            raise ValueError(f"Unknown search engine: {search_engine}")
+
+        # -------------------------------
+        # POST-SEARCH URL EXTRACTION
+        # -------------------------------
+        post_search_results = await self._post_search(results=results)
+        post_search_results = post_search_results[:num_results]
+        results.extend(post_search_results)
+
+        # -------------------------------
+        # FILTERS
+        # -------------------------------
         # Apply filters
         results = [
             self._apply_filters(
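A hypothetical end-to-end call of the reworked `apply` (only the keyword names come from the diff; the `Language`/`Location` constructors and the key placeholders are assumptions):

    import asyncio
    import httpx

    async def main() -> None:
        async with httpx.AsyncClient() as client:
            searcher = Searcher(http_client=client, serpapi_key="...", zyteapi_key="...")
            results = await searcher.apply(
                search_term="nicotine pouches",
                search_engine=SearchEngineName.TOPPREISE,
                language=Language("de"),   # assumed constructor
                location=Location("ch"),   # assumed constructor
                num_results=10,
            )
            print(f"{len(results)} results")  # includes post-search comparison hits

    asyncio.run(main())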
@@ -693,5 +896,7 @@ class Search(DomainUtils):
             for res in results
         ]
 
-        logger.
+        logger.info(
+            f'Search for term="{search_term}" using engine="{search_engine}" produced {len(results)} results.'
+        )
         return results
fraudcrawler/scraping/url.py
CHANGED
@@ -3,6 +3,7 @@ from typing import List, Set, Tuple
 from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
 
 from fraudcrawler.settings import KNOWN_TRACKERS
+from fraudcrawler.base.base import ProductItem
 
 logger = logging.getLogger(__name__)
 
@@ -11,11 +12,19 @@ class URLCollector:
     """A class to collect and de-duplicate URLs."""
 
     def __init__(self):
-        self.
-        self.
+        self._collected_currently: Set[str] = set()
+        self._collected_previously: Set[str] = set()
+
+    def add_previously_collected_urls(self, urls: List[str]) -> None:
+        """Add a set of previously collected URLs to the internal state.
+
+        Args:
+            urls: A set of URLs that have been collected in previous runs.
+        """
+        self._collected_previously.update(urls)
 
     @staticmethod
-    def
+    def _remove_tracking_parameters(url: str) -> str:
         """Remove tracking parameters from URLs.
 
         Args:
@@ -55,3 +64,33 @@ class URLCollector:
             fragment=parsed_url.fragment,
         )
         return urlunparse(clean_url)
+
+    async def apply(self, product: ProductItem) -> ProductItem:
+        """Manages the collection and deduplication of ProductItems.
+
+        Args:
+            product: The product item to process.
+        """
+        logger.debug(f'Processing product with url="{product.url}"')
+
+        # Remove tracking parameters from the URL
+        url = self._remove_tracking_parameters(product.url)
+        product.url = url
+
+        # deduplicate on current run
+        if url in self._collected_currently:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (current run deduplication)"
+            logger.debug(f"URL {url} already collected in current run")
+
+        # deduplicate on previous runs coming from a db
+        elif url in self._collected_previously:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (previous run deduplication)"
+            logger.debug(f"URL {url} as already collected in previous run")
+
+        # Add to currently collected URLs
+        else:
+            self._collected_currently.add(url)
+
+        return product
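A short usage sketch of the collector (assuming `ProductItem` can be constructed from a url and that `utm_source` is listed in KNOWN_TRACKERS):

    import asyncio

    async def main() -> None:
        collector = URLCollector()
        collector.add_previously_collected_urls(["https://shop.example/p/1"])

        # Tracking parameters are stripped before deduplication, so this URL
        # collides with the one collected in a previous run.
        item = ProductItem(url="https://shop.example/p/1?utm_source=news")  # assumed ctor
        item = await collector.apply(item)
        print(item.filtered, item.filtered_at_stage)
        # expected: True "URL collection (previous run deduplication)"

    asyncio.run(main())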
fraudcrawler/scraping/zyte.py
CHANGED
@@ -1,6 +1,6 @@
+from base64 import b64decode
 import logging
 from typing import List
-from base64 import b64decode
 
 import httpx
 from tenacity import RetryCallState
@@ -242,3 +242,17 @@ class ZyteAPI(DomainUtils):
             decoded_string = decoded_bytes.decode("utf-8")
             return decoded_string
         return None
+
+    async def unblock_url_content(self, url: str) -> bytes:
+        """Unblock the content of an URL using Zyte proxy mode.
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+        """
+        logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
+        details = await self.details(url)
+
+        if not details or "httpResponseBody" not in details:
+            raise httpx.HTTPError("No httpResponseBody in Zyte response")
+
+        return b64decode(details["httpResponseBody"])
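Zyte delivers `httpResponseBody` base64-encoded, hence the `b64decode`; a minimal round trip showing just the decode step:

    from base64 import b64decode, b64encode

    details = {"httpResponseBody": b64encode(b"<html>...</html>").decode("ascii")}
    assert b64decode(details["httpResponseBody"]) == b"<html>...</html>"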
fraudcrawler/settings.py
CHANGED
@@ -14,12 +14,22 @@ RETRY_EXP_BASE = 4
 RETRY_JITTER = 1
 RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes
 
-#
+# Search settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
 SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
+TOPPREISE_SEARCH_PATHS = {
+    "de": "produktsuche",
+    "fr": "chercher",
+    "default": "browse",
+}
+TOPPREISE_COMPARISON_PATHS = [
+    "preisvergleich",
+    "comparison-prix",
+    "price-comparison",
+]
 
 # URL De-duplication settings
 KNOWN_TRACKERS = [
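These markers are matched by plain substring tests in search.py (both in `_extract_links` and `_post_search`), e.g.:

    TOPPREISE_COMPARISON_PATHS = ["preisvergleich", "comparison-prix", "price-comparison"]

    url = "https://www.toppreise.ch/preisvergleich/Beispiel-Produkt"
    assert any(pth in url for pth in TOPPREISE_COMPARISON_PATHS)  # triggers post-search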
@@ -76,8 +86,8 @@ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevan
 PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
 
 # Async workers settings
-
-
+DEFAULT_N_SRCH_WKRS = 5
+DEFAULT_N_CNTX_WKRS = 15
 DEFAULT_N_PROC_WKRS = 10
 
 # HTTPX client settings
{fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.5.9
+Version: 0.6.0
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -11,6 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: httpx (>=0.28.1,<0.29.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
@@ -160,7 +161,7 @@ see `CONTRIBUTING.md`
 ### Async Setup
 The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
 
-This behavior is enabled through an asynchronous pipeline setup. The three main steps, `
+This behavior is enabled through an asynchronous pipeline setup. The three main steps, `Search`, `Context Extraction`, and `Processing`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass in. For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
 
 The following image provides a schematic representation of the package's async setup.
 
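A sketch of the recommended single-client setup (plain `httpx.AsyncClient` is shown because the `HttpxAsyncClient` wrapper's interface is not part of this diff):

    import asyncio
    import httpx

    async def main() -> None:
        # One shared AsyncClient: all three pipeline stages reuse its connection pool.
        async with httpx.AsyncClient() as client:
            searcher = Searcher(http_client=client, serpapi_key="...", zyteapi_key="...")
            # ...hand the same `client` to the context-extraction and processing stages

    asyncio.run(main())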