fraudcrawler 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- fraudcrawler/__init__.py +30 -0
- fraudcrawler/base/__init__.py +0 -0
- fraudcrawler/base/base.py +145 -0
- fraudcrawler/base/client.py +134 -0
- fraudcrawler/base/google-languages.json +630 -0
- fraudcrawler/base/google-locations.json +1 -0
- fraudcrawler/base/orchestrator.py +626 -0
- fraudcrawler/launch_demo_pipeline.py +100 -0
- fraudcrawler/processing/__init__.py +0 -0
- fraudcrawler/processing/processor.py +105 -0
- fraudcrawler/scraping/__init__.py +0 -0
- fraudcrawler/scraping/enrich.py +303 -0
- fraudcrawler/scraping/serp.py +251 -0
- fraudcrawler/scraping/zyte.py +194 -0
- fraudcrawler/settings.py +31 -0
- fraudcrawler-0.3.3.dist-info/LICENSE +21 -0
- fraudcrawler-0.3.3.dist-info/METADATA +163 -0
- fraudcrawler-0.3.3.dist-info/RECORD +20 -0
- fraudcrawler-0.3.3.dist-info/WHEEL +4 -0
- fraudcrawler-0.3.3.dist-info/entry_points.txt +3 -0
fraudcrawler/scraping/serp.py
ADDED
@@ -0,0 +1,251 @@
import asyncio
import logging
from pydantic import BaseModel
from typing import List
from urllib.parse import urlparse

from fraudcrawler.settings import MAX_RETRIES, RETRY_DELAY
from fraudcrawler.base.base import Host, Language, Location, AsyncClient
import re

logger = logging.getLogger(__name__)


class SerpResult(BaseModel):
    """Model for a single search result from SerpApi."""

    url: str
    domain: str
    marketplace_name: str
    filtered: bool = False
    filtered_at_stage: str | None = None


class SerpApi(AsyncClient):
    """A client to interact with the SerpApi for performing searches."""

    _endpoint = "https://serpapi.com/search"
    _engine = "google"
    _default_marketplace_name = "Google"
    _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

    def __init__(
        self,
        api_key: str,
        max_retries: int = MAX_RETRIES,
        retry_delay: int = RETRY_DELAY,
    ):
        """Initializes the SerpApiClient with the given API key.

        Args:
            api_key: The API key for SerpApi.
            max_retries: Maximum number of retries for API calls.
            retry_delay: Delay between retries in seconds.
        """
        super().__init__()
        self._api_key = api_key
        self._max_retries = max_retries
        self._retry_delay = retry_delay

    def _get_domain(self, url: str) -> str:
        """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).

        Args:
            url: The URL to be processed.

        """
        # Add scheme (if needed -> urlparse requires it)
        if not url.startswith(("http://", "https://")):
            url = "http://" + url

        # Get the hostname
        hostname = urlparse(url).hostname
        if hostname is None and (match := re.search(self._hostname_pattern, url)):
            hostname = match.group(1)
        if hostname is None:
            logger.warning(
                f'Failed to extract domain from url="{url}"; full url is returned'
            )
            return url

        # Remove www. prefix
        if hostname and hostname.startswith("www."):
            hostname = hostname[4:]
        return hostname

    async def _search(
        self,
        search_string: str,
        language: Language,
        location: Location,
        num_results: int,
    ) -> List[str]:
        """Performs a search using SerpApi and returns the URLs of the results.

        Args:
            search_string: The search string (with potentially added site: parameters).
            language: The language to use for the query ('hl' parameter).
            location: The location to use for the query ('gl' parameter).
            num_results: Max number of results to return.

        The SerpAPI parameters are:
            engine: The search engine to use ('google' NOT 'google_shopping').
            q: The search string (with potentially added site: parameters).
            google_domain: The Google domain to use for the search (e.g. google.[com]).
            location_[requested|used]: The location to use for the search.
            tbs: The time-based search parameters (e.g. 'ctr:CH&cr:countryCH').
            gl: The country code to use for the search.
            hl: The language code to use for the search.
            num: The number of results to return.
            api_key: The API key to use for the search.
        """
        # Setup the parameters
        params = {
            "engine": self._engine,
            "q": search_string,
            "google_domain": f"google.{location.code}",
            "location_requested": location.name,
            "location_used": location.name,
            "tbs": f"ctr:{location.code.upper()}&cr:country{location.code.upper()}",
            "gl": location.code,
            "hl": language.code,
            "num": num_results,
            "api_key": self._api_key,
        }

        # Perform the request
        attempts = 0
        err = None
        while attempts < self._max_retries:
            try:
                logger.debug(
                    f'Performing SerpAPI search with q="{search_string}" (Attempt {attempts + 1}).'
                )
                response = await self.get(url=self._endpoint, params=params)
                break
            except Exception as e:
                logger.error(f"SerpAPI search failed with error: {e}.")
                err = e
                attempts += 1
                if attempts < self._max_retries:
                    await asyncio.sleep(self._retry_delay)
        if err is not None:
            raise err

        # Get the organic_results
        results = response.get("organic_results")
        if results is None:
            logger.warning(
                f'No organic_results key in SerpAPI results for search_string="{search_string}".'
            )
            return []

        # Extract urls
        urls = [res.get("link") for res in results]
        logger.debug(
            f'Found {len(urls)} URLs from SerpApi search for q="{search_string}".'
        )
        return urls

    @staticmethod
    def _keep_url(url: str, country_code: str) -> bool:
        """Determines whether to keep the url based on the country_code.

        Args:
            url: The URL to investigate.
            country_code: The country code used to filter the products.
        """
        return f".{country_code}" in url.lower() or ".com" in url.lower()

    def _create_serp_result(
        self,
        url: str,
        location: Location,
        marketplaces: List[Host] | None,
    ) -> SerpResult:
        """From a given url it creates the class:`SerpResult` instance.

        If marketplaces is None or the domain can not be extracted, the default marketplace name is used.

        Args:
            url: The URL to be processed.
            location: The location to use for the query.
            marketplaces: The list of marketplaces to compare the URL against.
        """
        # Filter for county code
        filtered = not self._keep_url(url=url, country_code=location.code)
        filtered_at_stage = "country code filtering" if filtered else None

        # Get marketplace name
        domain = self._get_domain(url=url)
        marketplace_name = self._default_marketplace_name
        if domain and marketplaces:
            try:
                marketplace_name = next(
                    mp.name
                    for mp in marketplaces
                    if domain.lower() in [d.lower() for d in mp.domains]
                )
            except StopIteration:
                logger.warning(f'Failed to find marketplace for domain="{domain}".')
        return SerpResult(
            url=url,
            domain=domain,
            marketplace_name=marketplace_name,
            filtered=filtered,
            filtered_at_stage=filtered_at_stage,
        )

    async def apply(
        self,
        search_term: str,
        language: Language,
        location: Location,
        num_results: int,
        marketplaces: List[Host] | None = None,
        excluded_urls: List[Host] | None = None,
    ) -> List[SerpResult]:
        """Performs a search using SerpApi, filters based on country code and returns the URLs.

        Args:
            search_term: The search term to use for the query.
            language: The language to use for the query.
            location: The location to use for the query.
            num_results: Max number of results to return (default: 10).
            marketplaces: The marketplaces to include in the search.
            excluded_urls: The URLs to exclude from the search.
        """
        # Setup the parameters
        logger.info(f'Performing SerpAPI search for search_term="{search_term}".')

        # Setup the search string
        search_string = search_term
        if marketplaces:
            sites = [dom for host in marketplaces for dom in host.domains]
            search_string += " site:" + " OR site:".join(s for s in sites)

        # Perform the search
        urls = await self._search(
            search_string=search_string,
            language=language,
            location=location,
            num_results=num_results,
        )

        # Form the SerpResult objects
        results = [
            self._create_serp_result(
                url=url, location=location, marketplaces=marketplaces
            )
            for url in urls
        ]

        # Filter out the excluded URLs
        if excluded_urls:
            excluded = [dom for excl in excluded_urls for dom in excl.domains]
            results = [res for res in results if res.domain not in excluded]

        logger.info(
            f'Produced {len(results)} results from SerpApi search with q="{search_string}".'
        )
        return results
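For review purposes, a minimal usage sketch of the `SerpApi` client added above (not part of the wheel; it assumes a valid SerpApi key, that the client can be driven directly outside the orchestrator, and that `Language`, `Location` and `Host` resolve their codes and domains as described in the package README):

```python
import asyncio

from fraudcrawler.base.base import Host, Language, Location
from fraudcrawler.scraping.serp import SerpApi


async def main() -> None:
    serp = SerpApi(api_key="YOUR_SERPAPI_KEY")  # placeholder key
    results = await serp.apply(
        search_term="sildenafil",
        language=Language(name="German"),
        location=Location(name="Switzerland"),
        num_results=10,
        marketplaces=[Host(name="International", domains="zavamed.com,apomeds.com")],
        excluded_urls=[Host(name="Compendium", domains="compendium.ch")],
    )
    for res in results:
        # Each SerpResult records the URL, its domain, the matched marketplace,
        # and whether the country-code filter flagged it.
        print(res.url, res.domain, res.marketplace_name, res.filtered)


asyncio.run(main())
```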
fraudcrawler/scraping/zyte.py
ADDED
@@ -0,0 +1,194 @@
import asyncio
import logging
from typing import List

import aiohttp

from fraudcrawler.settings import (
    MAX_RETRIES,
    RETRY_DELAY,
    ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
)
from fraudcrawler.base.base import AsyncClient

logger = logging.getLogger(__name__)


class ZyteApi(AsyncClient):
    """A client to interact with the Zyte API for fetching product details."""

    _endpoint = "https://api.zyte.com/v1/extract"
    _config = {
        "javascript": False,
        "browserHtml": False,
        "screenshot": False,
        "productOptions": {"extractFrom": "httpResponseBody"},
        "httpResponseBody": True,
        "geolocation": "CH",
        "viewport": {"width": 1280, "height": 1080},
        "product": True,
        # "actions": [],
    }

    def __init__(
        self,
        api_key: str,
        max_retries: int = MAX_RETRIES,
        retry_delay: int = RETRY_DELAY,
    ):
        """Initializes the ZyteApiClient with the given API key and retry configurations.

        Args:
            api_key: The API key for Zyte API.
            max_retries: Maximum number of retries for API calls.
            retry_delay: Delay between retries in seconds.
        """
        self._aiohttp_basic_auth = aiohttp.BasicAuth(api_key)
        self._max_retries = max_retries
        self._retry_delay = retry_delay

    async def get_details(self, url: str) -> dict:
        """Fetches product details for a single URL.

        Args:
            url: The URL to fetch product details from.

        Returns:
            A dictionary containing the product details, fields include:
            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
            {
                "url": str,
                "statusCode": str,
                "product": {
                    "name": str,
                    "price": str,
                    "mainImage": {"url": str},
                    "images": [{"url": str}],
                    "description": str,
                    "metadata": {
                        "probability": float,
                    },
                }
            }
        """
        logger.info(f"Fetching product details by Zyte for URL {url}.")
        attempts = 0
        err = None
        while attempts < self._max_retries:
            try:
                logger.debug(
                    f"Fetch product details for URL {url} (Attempt {attempts + 1})."
                )
                product = await self.post(
                    url=self._endpoint,
                    data={"url": url, **self._config},
                    auth=self._aiohttp_basic_auth,
                )
                return product
            except Exception as e:
                logger.debug(
                    f"Exception occurred while fetching product details for URL {url} (Attempt {attempts + 1})."
                )
                err = e
                attempts += 1
                if attempts < self._max_retries:
                    await asyncio.sleep(self._retry_delay)
        if err is not None:
            raise err
        return {}

    @staticmethod
    def keep_product(
        details: dict, threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD
    ) -> bool:
        """Determines whether to keep the product based on the probability threshold.

        Args:
            details: A product details data dictionary.
            threshold: The probability threshold used to filter the products.
        """
        try:
            prob = float(details["product"]["metadata"]["probability"])
        except KeyError:
            logger.warning(
                f"Product with url={details.get('url')} has no probability value - product is ignored"
            )
            return False
        return prob > threshold

    @staticmethod
    def extract_product_name(details: dict) -> str | None:
        """Extracts the product name from the product data.

        The input argument is a dictionary of the following structure:
        {
            "product": {
                "name": str,
            }
        }
        """
        return details.get("product", {}).get("name")

    @staticmethod
    def extract_product_price(details: dict) -> str | None:
        """Extracts the product price from the product data.

        The input argument is a dictionary of the following structure:
        {
            "product": {
                "price": str,
            }
        }
        """
        return details.get("product", {}).get("price")

    @staticmethod
    def extract_product_description(details: dict) -> str | None:
        """Extracts the product description from the product data.

        The input argument is a dictionary of the following structure:
        {
            "product": {
                "description": str,
            }
        }
        """
        return details.get("product", {}).get("description")

    @staticmethod
    def extract_image_urls(details: dict) -> List[str]:
        """Extracts the images from the product data.

        The input argument is a dictionary of the following structure:
        {
            "product": {
                "mainImage": {"url": str},
                "images": [{"url": str}],
            }
        }
        """
        images = []
        product = details.get("product")
        if product:
            # Extract main image URL
            if (main_img := product.get("mainImage")) and (url := main_img.get("url")):
                images.append(url)
            # Extract additional image URLs
            if urls := product.get("images"):
                images.extend([img["url"] for img in urls if img.get("url")])
        return images

    @staticmethod
    def extract_probability(details: dict) -> float:
        """Extracts the probability from the product data.

        The input argument is a dictionary of the following structure:
        {
            "product": {
                "metadata": {
                    "probability": float,
                }
            }
        }
        """
        return float(details.get("product", {}).get("metadata", {}).get("probability"))
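Likewise, a minimal sketch of how the `ZyteApi` helpers above fit together (not part of the wheel; it assumes a valid Zyte API key and uses a placeholder product URL):

```python
import asyncio

from fraudcrawler.scraping.zyte import ZyteApi


async def main() -> None:
    zyte = ZyteApi(api_key="YOUR_ZYTE_KEY")  # placeholder key
    details = await zyte.get_details("https://www.example.com/some-product")  # placeholder URL

    # Keep only extractions above the probability threshold defined in settings.py
    if ZyteApi.keep_product(details):
        print(ZyteApi.extract_product_name(details))
        print(ZyteApi.extract_product_price(details))
        print(ZyteApi.extract_product_description(details))
        print(ZyteApi.extract_image_urls(details))


asyncio.run(main())
```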
fraudcrawler/settings.py
ADDED
@@ -0,0 +1,31 @@
from pathlib import Path

# Generic settings
MAX_RETRIES = 3
RETRY_DELAY = 2
ROOT_DIR = Path(__file__).parents[1]

# Serp settings
GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"

# Enrichment settings
ENRICHMENT_DEFAULT_LIMIT = 10

# Zyte settings
ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1

# Processor settings
PROCESSOR_DEFAULT_MODEL = "gpt-4o"
PROCESSOR_DEFAULT_IF_MISSING = -1
PROCESSOR_USER_PROMPT_TEMPLATE = (
    "Context: {context}\n\nProduct Details: {name}\n{description}\\n\nRelevance:"
)

# Orchestrator settings
PRODUCT_ITEM_DEFAULT_IS_RELEVANT = -1

# Async settings
DEFAULT_N_SERP_WKRS = 10
DEFAULT_N_ZYTE_WKRS = 10
DEFAULT_N_PROC_WKRS = 10
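To illustrate how `PROCESSOR_USER_PROMPT_TEMPLATE` is meant to be filled, a minimal sketch (an assumption: the processor substitutes the placeholders via `str.format`; `processor.py` is not shown in this section, and the field values below are invented):

```python
from fraudcrawler.settings import PROCESSOR_USER_PROMPT_TEMPLATE

# Hypothetical context and product fields, only to show the placeholder substitution
user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
    context="This organization is interested in medical products and drugs.",
    name="Dafalgan 500 mg",
    description="Effervescent tablets against pain and fever.",
)
print(user_prompt)
```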
fraudcrawler-0.3.3.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 veanu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
fraudcrawler-0.3.3.dist-info/METADATA
ADDED
@@ -0,0 +1,163 @@
Metadata-Version: 2.3
Name: fraudcrawler
Version: 0.3.3
Summary: Intelligent Market Monitoring
Home-page: https://github/open-veanu/fraudcrawler
License: MIT
Author: Domingo Bertus
Author-email: hello@veanu.ch
Requires-Python: >=3.11,<4.0
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
Requires-Dist: openai (>=1.68.2,<2.0.0)
Requires-Dist: pandas (>=2.2.3,<3.0.0)
Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
Requires-Dist: requests (>=2.32.3,<3.0.0)
Project-URL: Repository, https://github/open-veanu/fraudcrawler
Description-Content-Type: text/markdown

# open-veanu/fraudcrawler
Intelligent Market Monitoring

The pipeline for monitoring the market has the following main steps (a short sketch combining them follows the list):
1. search for a given term using SerpAPI
2. get product information using ZyteAPI
3. assess relevance of the found products using an OpenAI API
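A minimal sketch of how steps 1 and 2 map onto the modules shipped in this wheel (assuming valid API keys; step 3 is handled by `fraudcrawler/processing/processor.py` together with the orchestrator, which normally drive all of this for you):

```python
import asyncio

from fraudcrawler import Language, Location
from fraudcrawler.scraping.serp import SerpApi
from fraudcrawler.scraping.zyte import ZyteApi


async def main() -> None:
    serp = SerpApi(api_key="YOUR_SERPAPI_KEY")  # placeholder keys
    zyte = ZyteApi(api_key="YOUR_ZYTE_KEY")

    # 1. Search for a given term using SerpAPI
    results = await serp.apply(
        search_term="sildenafil",
        language=Language(name="German"),
        location=Location(name="Switzerland"),
        num_results=10,
    )

    # 2. Get product information using ZyteAPI for the URLs that were not filtered out
    for res in results:
        if not res.filtered:
            details = await zyte.get_details(res.url)
            print(res.url, ZyteApi.extract_product_name(details))

    # 3. Relevance assessment via the OpenAI API is done by the processor module


asyncio.run(main())
```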
## Installation
```bash
python3.11 -m venv .venv
source .venv/bin/activate
pip install fraudcrawler
```

## Usage
### `.env` file
Make sure to create a `.env` file with the necessary API keys and credentials (c.f. the `.env.example` file).

### Run demo pipeline
```bash
python -m fraudcrawler.launch_demo_pipeline
```

### Customize the pipeline
Start by initializing the client:
```python
from fraudcrawler import FraudCrawlerClient

# Initialize the client
client = FraudCrawlerClient()
```

For setting up the search we need 5 main objects.

#### `search_term: str`
The search term for the query (similar to search terms used within major search providers).

#### `language: Language`
The language used in SerpAPI (the 'hl' parameter), as well as for the optional search term enrichment (e.g. finding similar and related search terms). `language=Language('German')` creates an object with a language name and a language code: `Language(name='German', code='de')`.

#### `location: Location`
The location used in SerpAPI (the 'gl' parameter). `location=Location('Switzerland')` creates an object with a location name and a location code: `Location(name='Switzerland', code='ch')`.

#### `deepness: Deepness`
Defines the search depth, i.e. the number of results to retrieve and optional enrichment parameters.

#### `prompts: List[Prompt]`
The list of prompts used to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (defining the classification task), `allowed_classes` (a list of possible classes) and optionally `default_if_missing` (a default class if anything goes wrong).

```python
from fraudcrawler import Language, Location, Deepness, Prompt

# Setup the search
search_term = "sildenafil"
language = Language(name="German")
location = Location(name="Switzerland")
deepness = Deepness(num_results=50)
prompts = [
    Prompt(
        name="relevance",
        context="This organization is interested in medical products and drugs.",
        system_prompt=(
            "You are a helpful and intelligent assistant. Your task is to classify any given product "
            "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
            "You must consider all aspects of the given context and make a binary decision accordingly. "
            "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
            "Respond only with the number 1 or 0."
        ),
        allowed_classes=[0, 1],
    )
]
```

(Optional) Add search term enrichment. This finds related search terms (in a given language) and searches for these as well.
```python
from fraudcrawler import Enrichment

deepness.enrichment = Enrichment(
    additional_terms=5,
    additional_urls_per_term=10,
)
```

(Optional) Add marketplaces you explicitly want to search (this focuses your search via the `site:` parameter of a Google search).
```python
from fraudcrawler import Host

marketplaces = [
    Host(name="International", domains="zavamed.com,apomeds.com"),
    Host(name="National", domains="netdoktor.ch, nobelpharma.ch"),
]
```

(Optional) Exclude URLs (where you don't want to find products).
```python
excluded_urls = [
    Host(name="Compendium", domains="compendium.ch"),
]
```

(Optional) Exclude previously collected URLs (to save credits).
```python
previously_collected_urls = [
    "https://pharmaciedelabateliere.ch/shop/sante/douleurs-inflammations/dafalgan-cpr-eff-500-mg-16-pce/",
    "https://eiche.ch/product/schmerzmittel-52cd81d5d206a/dafalgan-brausetabletten-1336653",
]
```

And finally run the pipeline:
```python
# Execute the pipeline
client.execute(
    search_term=search_term,
    language=language,
    location=location,
    deepness=deepness,
    prompts=prompts,
    # marketplaces=marketplaces,  # Uncomment this to use marketplaces
    # excluded_urls=excluded_urls,  # Uncomment this to use excluded_urls
    # previously_collected_urls=previously_collected_urls,  # Uncomment this to use previously_collected_urls
)
```
This creates a file with name pattern `<search_term>_<language.code>_<location.code>_<datetime[%Y%m%d%H%M%S]>.csv` inside the folder `data/results/`.

Once the pipeline has terminated, the results can be loaded and examined as follows:
```python
df = client.load_results()
print(df.head(n=10))
```

If the client has been used to run multiple pipelines, an overview of the available results (for a given instance of `FraudCrawlerClient`) can be obtained with
```python
client.print_available_results()
```

## Contributing
See `CONTRIBUTING.md`.

### Async Setup
The following image provides a schematic representation of the package's async setup.

fraudcrawler-0.3.3.dist-info/RECORD
ADDED
@@ -0,0 +1,20 @@
fraudcrawler/__init__.py,sha256=2EgoTb2jNcQt1NxUV8za0154kb7ZnHZ_KeKgx21rdFs,679
fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
fraudcrawler/base/base.py,sha256=YgX7cUB3Fta-sXWZu5I-gn85sCfpmoa8M67Whn1m56o,4241
fraudcrawler/base/client.py,sha256=GcTUMqLfvweLFdHy6CP9tgxsFQiPkc6KyiLcwLnDiw8,4412
fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
fraudcrawler/base/orchestrator.py,sha256=GmJvrP9jKr4FqTKhuU9YMEuZ54gV0asHrSCxhM43onA,23903
fraudcrawler/launch_demo_pipeline.py,sha256=RIZTtdtZeJPhvSLp1IUjT_nhme_2q6mAGWKoL838E4E,4320
fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
fraudcrawler/processing/processor.py,sha256=sNYK0gY7PsJJS5FMYOiHLXEQJ70buifSokuIiXk5dG4,3715
fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
fraudcrawler/scraping/serp.py,sha256=wT8vhk0EugcrS2CCvMuCCZrlw1MRI-ahtGYKdNUZQo8,8830
fraudcrawler/scraping/zyte.py,sha256=ggI4iYG-E_UyiKgUpEFekeUd1giifEfJ_uyFUSJGSLY,6296
fraudcrawler/settings.py,sha256=eUb7zd2Q7fYUrLk4cl_d48lZ9zaB8iU7M0zFFuZc_-g,786
fraudcrawler-0.3.3.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
fraudcrawler-0.3.3.dist-info/METADATA,sha256=860K3oeNRoAC2Fmr9D4Gd1DYaXw1KOaWSEBeByL4V-U,6030
fraudcrawler-0.3.3.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
fraudcrawler-0.3.3.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
fraudcrawler-0.3.3.dist-info/RECORD,,