fraudcrawler 0.3.9__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fraudcrawler might be problematic.
- fraudcrawler/__init__.py +2 -1
- fraudcrawler/base/base.py +9 -2
- fraudcrawler/base/client.py +12 -0
- fraudcrawler/base/orchestrator.py +9 -1
- fraudcrawler/scraping/serp.py +196 -31
- fraudcrawler/settings.py +3 -2
- {fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/METADATA +1 -1
- {fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/RECORD +11 -11
- {fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/WHEEL +0 -0
- {fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/entry_points.txt +0 -0
fraudcrawler/__init__.py
CHANGED
@@ -1,4 +1,4 @@
-from fraudcrawler.scraping.serp import SerpApi
+from fraudcrawler.scraping.serp import SerpApi, SearchEngine
 from fraudcrawler.scraping.enrich import Enricher
 from fraudcrawler.scraping.zyte import ZyteApi
 from fraudcrawler.processing.processor import Processor
@@ -15,6 +15,7 @@ from fraudcrawler.base.base import (

 __all__ = [
     "SerpApi",
+    "SearchEngine",
     "Enricher",
     "ZyteApi",
     "Processor",
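
With this change, SearchEngine is exported from the package root alongside SerpApi. A minimal usage sketch (assuming an installed fraudcrawler 0.4.0):

    from fraudcrawler import SearchEngine

    print(list(SearchEngine))
    # Expected per the enum added in scraping/serp.py:
    # [<SearchEngine.GOOGLE: 'google'>, <SearchEngine.GOOGLE_SHOPPING: 'google_shopping'>]
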
fraudcrawler/base/base.py
CHANGED
@@ -6,6 +6,7 @@ from pydantic import (
     model_validator,
 )
 from pydantic_settings import BaseSettings
+import re
 from typing import List

 import aiohttp
@@ -48,11 +49,17 @@ class Host(BaseModel):
     name: str
     domains: str | List[str]

+    @staticmethod
+    def _normalize_domain(domain: str) -> str:
+        """Make it lowercase and strip 'www.' and 'https?://' prefixes from the domain."""
+        domain = domain.strip().lower()
+        return re.sub(r"^(https?://)?(www\.)?", "", domain)
+
     @field_validator("domains", mode="before")
-    def
+    def normalize_domains(cls, val):
         if isinstance(val, str):
             val = val.split(",")
-        return [dom.strip()
+        return [cls._normalize_domain(dom.strip()) for dom in val]


 class Location(BaseModel):
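
The new Host._normalize_domain helper plus the updated normalize_domains validator mean comma-separated or URL-prefixed domain strings are cleaned when the model is created. A small sketch of the expected behavior (hypothetical domain values, assuming fraudcrawler 0.4.0):

    from fraudcrawler.base.base import Host

    host = Host(name="ExampleShop", domains="https://www.Example.COM, shop.example.org")
    print(host.domains)
    # Expected per the validator above: ['example.com', 'shop.example.org']
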
fraudcrawler/base/client.py
CHANGED
@@ -11,6 +11,7 @@ import pandas as pd
 from fraudcrawler.settings import ROOT_DIR
 from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
 from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+from fraudcrawler.scraping.serp import SearchEngine

 logger = logging.getLogger(__name__)

@@ -84,6 +85,7 @@ class FraudCrawlerClient(Orchestrator):
         prompts: List[Prompt],
         marketplaces: List[Host] | None = None,
         excluded_urls: List[Host] | None = None,
+        search_engines: List[SearchEngine | str] | None = None,
     ) -> None:
         """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -96,6 +98,7 @@ class FraudCrawlerClient(Orchestrator):
             marketplaces: The marketplaces to include in the search.
             excluded_urls: The URLs to exclude from the search.
         """
+        # Handle results files
         timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
         filename = self._results_dir / self._filename_template.format(
             search_term=search_term,
@@ -105,9 +108,18 @@ class FraudCrawlerClient(Orchestrator):
         )
         self._results.append(Results(search_term=search_term, filename=filename))

+        # Normalize inputs
+        nrm_se: List[SearchEngine] = list(SearchEngine)
+        if search_engines:
+            nrm_se = [
+                SearchEngine(se) if isinstance(se, str) else se for se in search_engines
+            ]
+
+        # Run the pipeline by calling the orchestrator's run method
         asyncio.run(
             super().run(
                 search_term=search_term,
+                search_engines=nrm_se,
                 language=language,
                 location=location,
                 deepness=deepness,
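
The new search_engines parameter accepts enum members or their string values and defaults to all supported engines when omitted. The normalization added above reduces to this standalone sketch (not the client API itself):

    from typing import List
    from fraudcrawler import SearchEngine

    def _normalize_search_engines(search_engines=None) -> List[SearchEngine]:
        # Default: use every supported engine (mirrors `nrm_se = list(SearchEngine)` above).
        nrm_se: List[SearchEngine] = list(SearchEngine)
        if search_engines:
            # Strings such as "google_shopping" are coerced via the enum's value lookup.
            nrm_se = [SearchEngine(se) if isinstance(se, str) else se for se in search_engines]
        return nrm_se

    print(_normalize_search_engines())                       # both engines
    print(_normalize_search_engines(["google_shopping"]))    # [SearchEngine.GOOGLE_SHOPPING]
    print(_normalize_search_engines([SearchEngine.GOOGLE]))  # [SearchEngine.GOOGLE]
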
fraudcrawler/base/orchestrator.py
CHANGED
@@ -16,7 +16,7 @@ from fraudcrawler.settings import (
     DEFAULT_N_PROC_WKRS,
 )
 from fraudcrawler.base.base import Deepness, Host, Language, Location, Prompt
-from fraudcrawler import SerpApi, Enricher, ZyteApi, Processor
+from fraudcrawler import SerpApi, SearchEngine, Enricher, ZyteApi, Processor

 logger = logging.getLogger(__name__)

@@ -387,6 +387,7 @@ class Orchestrator(ABC):
         queue: asyncio.Queue[dict | None],
         search_term: str,
         search_term_type: str,
+        search_engines: List[SearchEngine],
         language: Language,
         location: Location,
         num_results: int,
@@ -397,6 +398,7 @@ class Orchestrator(ABC):
         item = {
             "search_term": search_term,
             "search_term_type": search_term_type,
+            "search_engines": search_engines,
             "language": language,
             "location": location,
             "num_results": num_results,
@@ -410,6 +412,7 @@ class Orchestrator(ABC):
         self,
         queue: asyncio.Queue[dict | None],
         search_term: str,
+        search_engines: List[SearchEngine],
         language: Language,
         location: Location,
         deepness: Deepness,
@@ -429,6 +432,7 @@ class Orchestrator(ABC):
         await self._add_serp_items_for_search_term(
             search_term=search_term,
             search_term_type="initial",
+            search_engines=search_engines,
             num_results=deepness.num_results,
             **common_kwargs,  # type: ignore[arg-type]
         )
@@ -450,6 +454,7 @@ class Orchestrator(ABC):
             await self._add_serp_items_for_search_term(
                 search_term=trm,
                 search_term_type="enriched",
+                search_engines=search_engines,
                 num_results=enrichment.additional_urls_per_term,
                 **common_kwargs,  # type: ignore[arg-type]
             )
@@ -457,6 +462,7 @@ class Orchestrator(ABC):
     async def run(
         self,
         search_term: str,
+        search_engines: List[SearchEngine],
         language: Language,
         location: Location,
         deepness: Deepness,
@@ -469,6 +475,7 @@ class Orchestrator(ABC):

         Args:
             search_term: The search term for the query.
+            search_engines: The list of search engines to use for the SerpAPI query.
             language: The language to use for the query.
             location: The location to use for the query.
             deepness: The search depth and enrichment details.
@@ -523,6 +530,7 @@ class Orchestrator(ABC):
         await self._add_serp_items(
             queue=serp_queue,
             search_term=search_term,
+            search_engines=search_engines,
             language=language,
             location=location,
             deepness=deepness,
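
Each SERP work item produced by the orchestrator now carries the selected engines. A rough illustration of the item shape, with placeholder values rather than real pipeline data (the field set is taken from the item dict above):

    from fraudcrawler import SearchEngine

    # Shape of a single SERP queue item as built in _add_serp_items_for_search_term.
    item = {
        "search_term": "example product",
        "search_term_type": "initial",   # or "enriched"
        "search_engines": [SearchEngine.GOOGLE, SearchEngine.GOOGLE_SHOPPING],
        "language": None,                # a Language instance in the real pipeline
        "location": None,                # a Location instance in the real pipeline
        "num_results": 10,
    }
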
fraudcrawler/scraping/serp.py
CHANGED
@@ -1,4 +1,5 @@
 import asyncio
+from enum import Enum
 import logging
 from pydantic import BaseModel
 from typing import List
@@ -13,7 +14,6 @@ logger = logging.getLogger(__name__)

 class SerpResult(BaseModel):
     """Model for a single search result from SerpApi."""
-
     url: str
     domain: str
     marketplace_name: str
@@ -21,12 +21,20 @@ class SerpResult(BaseModel):
     filtered_at_stage: str | None = None


+class SearchEngine(Enum):
+    """Enum for the supported search engines."""
+    GOOGLE = "google"
+    GOOGLE_SHOPPING = "google_shopping"
+
+
 class SerpApi(AsyncClient):
     """A client to interact with the SerpApi for performing searches."""

     _endpoint = "https://serpapi.com/search"
-
-
+    _engine_marketplace_names = {
+        SearchEngine.GOOGLE.value: "Google",
+        SearchEngine.GOOGLE_SHOPPING.value: "Google Shopping"
+    }
     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"

     def __init__(
@@ -73,8 +81,42 @@ class SerpApi(AsyncClient):
             hostname = hostname[4:]
         return hostname.lower()

+    @staticmethod
+    def _extract_search_results(response: dict, engine: str) -> List[str]:
+        """Extracts search results from the response based on the engine type.
+
+        Args:
+            response: The response from the SerpApi search.
+            engine: The search engine used.
+
+        Returns:
+            A list of URLs extracted from the response.
+        """
+        urls = []
+        if engine == SearchEngine.GOOGLE.value:
+            # Get the organic_results
+            results = response.get("organic_results")
+            if results is None:
+                logger.warning(f'No SerpAPI results for engine="{engine}".')
+            else:
+                urls = [url for res in results if (url := res.get("link"))]
+
+        elif engine == SearchEngine.GOOGLE_SHOPPING.value:
+            # Get the shopping_results
+            results = response.get("shopping_results")
+            if results is None:
+                logger.warning(f'No SerpAPI results for engine="{engine}".')
+            else:
+                urls = [url for res in results if (url := res.get("product_link"))]
+
+        else:
+            raise ValueError(f"Invalid SerpAPI search engine: {engine}")
+
+        return urls
+
     async def _search(
         self,
+        engine: str,
         search_string: str,
         language: Language,
         location: Location,
@@ -83,6 +125,7 @@ class SerpApi(AsyncClient):
         """Performs a search using SerpApi and returns the URLs of the results.

         Args:
+            engine: The search engine to use.
             search_string: The search string (with potentially added site: parameters).
             language: The language to use for the query ('hl' parameter).
             location: The location to use for the query ('gl' parameter).
@@ -93,20 +136,35 @@ class SerpApi(AsyncClient):
             q: The search string (with potentially added site: parameters).
             google_domain: The Google domain to use for the search (e.g. google.[com]).
             location_[requested|used]: The location to use for the search.
-            tbs: The
+            tbs: The to-be-searched parameters (e.g. 'ctr:CH').
+            cr: The country code to limit the search to (e.g. 'countryCH').
             gl: The country code to use for the search.
             hl: The language code to use for the search.
             num: The number of results to return.
             api_key: The API key to use for the search.
         """
+        if engine not in self._engine_marketplace_names:
+            raise ValueError(
+                f"Invalid SerpAPI search engine: {engine}. "
+                f"Supported engines are: {list(self._engine_marketplace_names.keys())}."
+            )
+        logger.debug(
+            f'Performing SerpAPI search with engine="{engine}", '
+            f'q="{search_string}", '
+            f'location="{location.name}", '
+            f'language="{language.code}", '
+            f"num_results={num_results}."
+        )
+
         # Setup the parameters
         params = {
-            "engine":
+            "engine": engine,
             "q": search_string,
             "google_domain": f"google.{location.code}",
             "location_requested": location.name,
             "location_used": location.name,
-            "tbs": f"ctr:{location.code.upper()}
+            "tbs": f"ctr:{location.code.upper()}",
+            "cr": f"country{location.code.upper()}",
             "gl": location.code,
             "hl": language.code,
             "num": num_results,
@@ -132,18 +190,11 @@ class SerpApi(AsyncClient):
         if err is not None:
             raise err

-        #
-
-        if results is None:
-            logger.warning(
-                f'No organic_results key in SerpAPI results for search_string="{search_string}".'
-            )
-            return []
+        # Extract the URLs from the response
+        urls = self._extract_search_results(response=response, engine=engine)

-        # Extract urls
-        urls = [res.get("link") for res in results]
         logger.debug(
-            f'Found {len(urls)} URLs from SerpApi search for q="{search_string}".'
+            f'Found total of {len(urls)} URLs from SerpApi search for q="{search_string}" and engine="{engine}".'
         )
         return urls

@@ -234,6 +285,7 @@ class SerpApi(AsyncClient):

     def _create_serp_result(
         self,
+        engine: str,
         url: str,
         location: Location,
         marketplaces: List[Host] | None = None,
@@ -244,13 +296,18 @@ class SerpApi(AsyncClient):
         If marketplaces is None or the domain can not be extracted, the default marketplace name is used.

         Args:
+            engine: The search engine used.
             url: The URL to be processed.
             location: The location to use for the query.
             marketplaces: The list of marketplaces to compare the URL against.
+            excluded_urls: The list of excluded URLs.
         """
         # Get marketplace name
         domain = self._get_domain(url=url)
-
+
+        # Select marketplace name based on engine
+        marketplace_name = self._engine_marketplace_names[engine]
+
         if marketplaces:
             try:
                 marketplace_name = next(
@@ -277,9 +334,109 @@ class SerpApi(AsyncClient):
         )
         return result

+    async def _search_google(
+        self,
+        search_string: str,
+        language: Language,
+        location: Location,
+        num_results: int,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> List[SerpResult]:
+        """Performs a google search using SerpApi and returns SerpResults.
+
+        Args:
+            search_string: The search string (with potentially added site: parameters).
+            language: The language to use for the query ('hl' parameter).
+            location: The location to use for the query ('gl' parameter).
+            num_results: Max number of results to return.
+            marketplaces: The marketplaces to include in the search.
+            excluded_urls: The URLs to exclude from the search.
+        """
+        engine = SearchEngine.GOOGLE.value
+
+        # Perform the search
+        urls = await self._search(
+            engine=engine,
+            search_string=search_string,
+            language=language,
+            location=location,
+            num_results=num_results,
+        )
+
+        # Create SerpResult objects from the URLs
+        results = [
+            self._create_serp_result(
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
+                engine=engine,
+            )
+            for url in urls
+        ]
+
+        logger.debug(
+            f'Produced {len(results)} results from google search with q="{search_string}".'
+        )
+        return results
+
+    async def _search_google_shopping(
+        self,
+        search_string: str,
+        language: Language,
+        location: Location,
+        num_results: int,
+        marketplaces: List[Host] | None = None,
+        excluded_urls: List[Host] | None = None,
+    ) -> List[SerpResult]:
+        """Performs a google search using SerpApi and returns SerpResults.
+
+        Args:
+            search_string: The search string (with potentially added site: parameters).
+            language: The language to use for the query ('hl' parameter).
+            location: The location to use for the query ('gl' parameter).
+            num_results: Max number of results to return.
+            marketplaces: The marketplaces to include in the search.
+            excluded_urls: The URLs to exclude from the search.
+        """
+        engine = SearchEngine.GOOGLE_SHOPPING.value
+
+        # Perform the search
+        urls = await self._search(
+            engine=engine,
+            search_string=search_string,
+            language=language,
+            location=location,
+            num_results=num_results,
+        )
+
+        # !!! NOTE !!!: Google Shopping results do not properly support the 'num' parameter,
+        # so we might get more results than requested. This is a known issue with SerpAPI
+        # and Google Shopping searches (see https://github.com/serpapi/public-roadmap/issues/1858)
+        urls = urls[:num_results]
+
+        # Create SerpResult objects from the URLs
+        results = [
+            self._create_serp_result(
+                url=url,
+                location=location,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
+                engine=engine,
+            )
+            for url in urls
+        ]
+
+        logger.debug(
+            f'Produced {len(results)} results from google shopping search with q="{search_string}".'
+        )
+        return results
+
     async def apply(
         self,
         search_term: str,
+        search_engines: List[SearchEngine],
         language: Language,
         location: Location,
         num_results: int,
@@ -305,27 +462,35 @@ class SerpApi(AsyncClient):
         sites = [dom for host in marketplaces for dom in host.domains]
         search_string += " site:" + " OR site:".join(s for s in sites)

-        #
-
-            search_string=search_string,
-            language=language,
-            location=location,
-            num_results=num_results,
-        )
+        # Initialize the results list
+        results: List[SerpResult] = []

-        #
-
-        self.
-
+        # Perform the google search
+        if SearchEngine.GOOGLE in search_engines:
+            ggl_res = await self._search_google(
+                search_string=search_string,
+                language=language,
                location=location,
+                num_results=num_results,
                marketplaces=marketplaces,
                excluded_urls=excluded_urls,
            )
-
-
+            results.extend(ggl_res)
+
+        # Perform the google shopping search
+        if SearchEngine.GOOGLE_SHOPPING in search_engines:
+            shp_res = await self._search_google_shopping(
+                search_string=search_string,
+                language=language,
+                location=location,
+                num_results=num_results,
+                marketplaces=marketplaces,
+                excluded_urls=excluded_urls,
+            )
+            results.extend(shp_res)

         num_non_filtered = len([res for res in results if not res.filtered])
         logger.info(
-            f'Produced {num_non_filtered} results from SerpApi search with q="{search_string}".'
+            f'Produced a total of {num_non_filtered} results from SerpApi search with q="{search_string}".'
         )
         return results
fraudcrawler/settings.py
CHANGED
@@ -1,4 +1,5 @@
 from pathlib import Path
+from typing import List

 # Generic settings
 MAX_RETRIES = 3
@@ -8,8 +9,8 @@ ROOT_DIR = Path(__file__).parents[1]
 # Serp settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
-SERP_DEFAULT_COUNTRY_CODES = [
-    ".com",
+SERP_DEFAULT_COUNTRY_CODES: List[str] = [
+    # ".com",
 ]

 # Enrichment settings
{fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/RECORD
CHANGED
@@ -1,20 +1,20 @@
-fraudcrawler/__init__.py,sha256=
+fraudcrawler/__init__.py,sha256=o_K3jVqH-0Pfa08DxySUyHfrwAzHNf-fWbgV5v66oKA,713
 fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/base/base.py,sha256=
-fraudcrawler/base/client.py,sha256=
+fraudcrawler/base/base.py,sha256=woesbPztEh7tbD0ty9S37JbFrbEC-01H9etmCT2ffnc,4771
+fraudcrawler/base/client.py,sha256=ONy1jf2qwQey4ix4Wdn_qJIik-8NUZHQpuQZyKIVf5I,4903
 fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
 fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
-fraudcrawler/base/orchestrator.py,sha256=
+fraudcrawler/base/orchestrator.py,sha256=VNM8QBT7nZ3BUzkL5pXKNmQxM_FY12UOfA7dnKKfo9U,24395
 fraudcrawler/launch_demo_pipeline.py,sha256=RIZTtdtZeJPhvSLp1IUjT_nhme_2q6mAGWKoL838E4E,4320
 fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fraudcrawler/processing/processor.py,sha256=IFVKIiNi0QoCAgPFkFtNDgxfhh01iDNUyIBZWACplR8,3993
 fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fraudcrawler/scraping/enrich.py,sha256=X1BBZshdZqPmbduzhGwH0ULSzq03L_7bf7_UL8yOQ9E,10608
-fraudcrawler/scraping/serp.py,sha256=
+fraudcrawler/scraping/serp.py,sha256=baXcTcquSXpK_JvtHt0_J1CQ20yMHx7I7oF1_LtMpqE,17862
 fraudcrawler/scraping/zyte.py,sha256=ggI4iYG-E_UyiKgUpEFekeUd1giifEfJ_uyFUSJGSLY,6296
-fraudcrawler/settings.py,sha256=
-fraudcrawler-0.
-fraudcrawler-0.
-fraudcrawler-0.
-fraudcrawler-0.
-fraudcrawler-0.
+fraudcrawler/settings.py,sha256=uMjWyDS-TDZBGUK0kiMVzc7TiYhuEav_GFY3A4XFcvo,805
+fraudcrawler-0.4.0.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.4.0.dist-info/METADATA,sha256=uNYbjX1vPPH--hOSI9gVIkgSGhFkYV7SQ_zWR9Nc1Ng,5965
+fraudcrawler-0.4.0.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
+fraudcrawler-0.4.0.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.4.0.dist-info/RECORD,,
{fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/LICENSE: File without changes
{fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/WHEEL: File without changes
{fraudcrawler-0.3.9.dist-info → fraudcrawler-0.4.0.dist-info}/entry_points.txt: File without changes