fraudcrawler 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of fraudcrawler might be problematic.
- fraudcrawler-0.3.3/LICENSE +21 -0
- fraudcrawler-0.3.3/PKG-INFO +163 -0
- fraudcrawler-0.3.3/README.md +140 -0
- fraudcrawler-0.3.3/fraudcrawler/__init__.py +30 -0
- fraudcrawler-0.3.3/fraudcrawler/base/__init__.py +0 -0
- fraudcrawler-0.3.3/fraudcrawler/base/base.py +145 -0
- fraudcrawler-0.3.3/fraudcrawler/base/client.py +134 -0
- fraudcrawler-0.3.3/fraudcrawler/base/google-languages.json +630 -0
- fraudcrawler-0.3.3/fraudcrawler/base/google-locations.json +1 -0
- fraudcrawler-0.3.3/fraudcrawler/base/orchestrator.py +626 -0
- fraudcrawler-0.3.3/fraudcrawler/launch_demo_pipeline.py +100 -0
- fraudcrawler-0.3.3/fraudcrawler/processing/__init__.py +0 -0
- fraudcrawler-0.3.3/fraudcrawler/processing/processor.py +105 -0
- fraudcrawler-0.3.3/fraudcrawler/scraping/__init__.py +0 -0
- fraudcrawler-0.3.3/fraudcrawler/scraping/enrich.py +303 -0
- fraudcrawler-0.3.3/fraudcrawler/scraping/serp.py +251 -0
- fraudcrawler-0.3.3/fraudcrawler/scraping/zyte.py +194 -0
- fraudcrawler-0.3.3/fraudcrawler/settings.py +31 -0
- fraudcrawler-0.3.3/pyproject.toml +46 -0
fraudcrawler-0.3.3/LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 veanu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
fraudcrawler-0.3.3/PKG-INFO
@@ -0,0 +1,163 @@
Metadata-Version: 2.3
Name: fraudcrawler
Version: 0.3.3
Summary: Intelligent Market Monitoring
Home-page: https://github.com/open-veanu/fraudcrawler
License: MIT
Author: Domingo Bertus
Author-email: hello@veanu.ch
Requires-Python: >=3.11,<4.0
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Requires-Dist: aiohttp (>=3.11.14,<4.0.0)
Requires-Dist: openai (>=1.68.2,<2.0.0)
Requires-Dist: pandas (>=2.2.3,<3.0.0)
Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
Requires-Dist: requests (>=2.32.3,<3.0.0)
Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
Description-Content-Type: text/markdown

# open-veanu/fraudcrawler
Intelligent Market Monitoring

The pipeline for monitoring the market has the following main steps:
1. search for a given term using SerpAPI
2. get product information using ZyteAPI
3. assess the relevance of the found products using the OpenAI API

## Installation
```bash
python3.11 -m venv .venv
source .venv/bin/activate
pip install fraudcrawler
```

## Usage
### `.env` file
Make sure to create a `.env` file with the necessary API keys and credentials (cf. the `.env.example` file).
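For illustration, a minimal `.env` sketch; the variable names follow the `Setup` class in `fraudcrawler/base/base.py` (included further down in this diff), and the values are placeholders:
```bash
SERPAPI_KEY=your-serpapi-key
DATAFORSEO_USER=your-dataforseo-user
DATAFORSEO_PWD=your-dataforseo-password
ZYTEAPI_KEY=your-zyte-api-key
OPENAIAPI_KEY=your-openai-api-key
```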

### Run demo pipeline
```bash
python -m fraudcrawler.launch_demo_pipeline
```

### Customize the pipeline
Start by initializing the client:
```python
from fraudcrawler import FraudCrawlerClient

# Initialize the client
client = FraudCrawlerClient()
```

Setting up a search requires five main objects.

#### `search_term: str`
The search term for the query (similar to search terms used within major search providers).

#### `language: Language`
The language used in SerpAPI (the 'hl' parameter), as well as for the optional search term enrichment (e.g. finding similar and related search terms). `language=Language('German')` creates an object having a language name and a language code, as in `Language(name='German', code='de')`.

#### `location: Location`
The location used in SerpAPI (the 'gl' parameter). `location=Location('Switzerland')` creates an object having a location name and a location code, as in `Location(name='Switzerland', code='ch')`.

#### `deepness: Deepness`
Defines the search depth with the number of results to retrieve and optional enrichment parameters.

#### `prompts: List[Prompt]`
The list of prompts used to classify a given product via (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), `allowed_classes` (a list of possible classes) and optionally `default_if_missing` (a default class if anything goes wrong).

```python
from fraudcrawler import Language, Location, Deepness, Prompt

# Setup the search
search_term = "sildenafil"
language = Language(name="German")
location = Location(name="Switzerland")
deepness = Deepness(num_results=50)
prompts = [
    Prompt(
        name="relevance",
        context="This organization is interested in medical products and drugs.",
        system_prompt=(
            "You are a helpful and intelligent assistant. Your task is to classify any given product "
            "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
            "You must consider all aspects of the given context and make a binary decision accordingly. "
            "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
            "Respond only with the number 1 or 0."
        ),
        allowed_classes=[0, 1],
    )
]
```

(Optional) Add search term enrichment. This will find related search terms (in a given language) and search for these as well.
```python
from fraudcrawler import Enrichment

deepness.enrichment = Enrichment(
    additional_terms=5,
    additional_urls_per_term=10,
)
```
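Taking the two fields at face value, the configuration above would add up to 5 related search terms, each contributing up to 10 additional URLs, i.e. at most 5 × 10 = 50 extra URLs beyond the results for the initial search term.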

(Optional) Add marketplaces where we explicitly want to look (this focuses your search, like the `site:` parameter of a Google search)
```python
from fraudcrawler import Host

marketplaces = [
    Host(name="International", domains="zavamed.com,apomeds.com"),
    Host(name="National", domains="netdoktor.ch, nobelpharma.ch"),
]
```
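As a rough illustration only, the first host above should translate into a Google query along the lines of `sildenafil site:zavamed.com OR site:apomeds.com`; the exact query construction lives in `fraudcrawler/scraping/serp.py`, which is part of this release but not excerpted here.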

(Optional) Exclude URLs (domains where you don't want to find products)
```python
excluded_urls = [
    Host(name="Compendium", domains="compendium.ch"),
]
```

(Optional) Exclude previously collected URLs (this is intended to save API credits)
```python
previously_collected_urls = [
    "https://pharmaciedelabateliere.ch/shop/sante/douleurs-inflammations/dafalgan-cpr-eff-500-mg-16-pce/",
    "https://eiche.ch/product/schmerzmittel-52cd81d5d206a/dafalgan-brausetabletten-1336653",
]
```

And finally, run the pipeline:
```python
# Execute the pipeline
client.execute(
    search_term=search_term,
    language=language,
    location=location,
    deepness=deepness,
    prompts=prompts,
    # marketplaces=marketplaces,  # Uncomment this to use marketplaces
    # excluded_urls=excluded_urls,  # Uncomment this to use excluded_urls
    # previously_collected_urls=previously_collected_urls,  # Uncomment this to use previously_collected_urls
)
```
This creates a file with name pattern `<search_term>_<language.code>_<location.code>_<datetime[%Y%m%d%H%M%S]>.csv` inside the folder `data/results/`.
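For the example above, the file would be named something like `sildenafil_de_ch_20250401120000.csv` (illustrative timestamp).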

Once the pipeline has terminated, the results can be loaded and examined as follows:
```python
df = client.load_results()
print(df.head(n=10))
```

If the client has been used to run multiple pipelines, an overview of the available results (for a given instance of `FraudCrawlerClient`) can be obtained with
```python
client.print_available_results()
```
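Judging from `print_available_results` in `fraudcrawler/base/client.py` (shown further down in this diff), each run is listed with a negative index that can be passed to `load_results(index=...)`; an illustrative line of output:
```
index=-1: sildenafil - data/results/sildenafil_de_ch_20250401120000.csv
```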

## Contributing
see `CONTRIBUTING.md`

### Async Setup
The following image provides a schematic representation of the package's async setup.

fraudcrawler-0.3.3/README.md
@@ -0,0 +1,140 @@
# open-veanu/fraudcrawler
Intelligent Market Monitoring

The pipeline for monitoring the market has the following main steps:
1. search for a given term using SerpAPI
2. get product information using ZyteAPI
3. assess the relevance of the found products using the OpenAI API

## Installation
```bash
python3.11 -m venv .venv
source .venv/bin/activate
pip install fraudcrawler
```

## Usage
### `.env` file
Make sure to create a `.env` file with the necessary API keys and credentials (cf. the `.env.example` file).

### Run demo pipeline
```bash
python -m fraudcrawler.launch_demo_pipeline
```

### Customize the pipeline
Start by initializing the client:
```python
from fraudcrawler import FraudCrawlerClient

# Initialize the client
client = FraudCrawlerClient()
```

Setting up a search requires five main objects.

#### `search_term: str`
The search term for the query (similar to search terms used within major search providers).

#### `language: Language`
The language used in SerpAPI (the 'hl' parameter), as well as for the optional search term enrichment (e.g. finding similar and related search terms). `language=Language('German')` creates an object having a language name and a language code, as in `Language(name='German', code='de')`.

#### `location: Location`
The location used in SerpAPI (the 'gl' parameter). `location=Location('Switzerland')` creates an object having a location name and a location code, as in `Location(name='Switzerland', code='ch')`.

#### `deepness: Deepness`
Defines the search depth with the number of results to retrieve and optional enrichment parameters.

#### `prompts: List[Prompt]`
The list of prompts used to classify a given product via (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), `allowed_classes` (a list of possible classes) and optionally `default_if_missing` (a default class if anything goes wrong).

```python
from fraudcrawler import Language, Location, Deepness, Prompt

# Setup the search
search_term = "sildenafil"
language = Language(name="German")
location = Location(name="Switzerland")
deepness = Deepness(num_results=50)
prompts = [
    Prompt(
        name="relevance",
        context="This organization is interested in medical products and drugs.",
        system_prompt=(
            "You are a helpful and intelligent assistant. Your task is to classify any given product "
            "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
            "You must consider all aspects of the given context and make a binary decision accordingly. "
            "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
            "Respond only with the number 1 or 0."
        ),
        allowed_classes=[0, 1],
    )
]
```

(Optional) Add search term enrichment. This will find related search terms (in a given language) and search for these as well.
```python
from fraudcrawler import Enrichment

deepness.enrichment = Enrichment(
    additional_terms=5,
    additional_urls_per_term=10,
)
```

(Optional) Add marketplaces where we explicitly want to look (this focuses your search, like the `site:` parameter of a Google search)
```python
from fraudcrawler import Host

marketplaces = [
    Host(name="International", domains="zavamed.com,apomeds.com"),
    Host(name="National", domains="netdoktor.ch, nobelpharma.ch"),
]
```

(Optional) Exclude URLs (domains where you don't want to find products)
```python
excluded_urls = [
    Host(name="Compendium", domains="compendium.ch"),
]
```

(Optional) Exclude previously collected URLs (this is intended to save API credits)
```python
previously_collected_urls = [
    "https://pharmaciedelabateliere.ch/shop/sante/douleurs-inflammations/dafalgan-cpr-eff-500-mg-16-pce/",
    "https://eiche.ch/product/schmerzmittel-52cd81d5d206a/dafalgan-brausetabletten-1336653",
]
```

And finally, run the pipeline:
```python
# Execute the pipeline
client.execute(
    search_term=search_term,
    language=language,
    location=location,
    deepness=deepness,
    prompts=prompts,
    # marketplaces=marketplaces,  # Uncomment this to use marketplaces
    # excluded_urls=excluded_urls,  # Uncomment this to use excluded_urls
    # previously_collected_urls=previously_collected_urls,  # Uncomment this to use previously_collected_urls
)
```
This creates a file with name pattern `<search_term>_<language.code>_<location.code>_<datetime[%Y%m%d%H%M%S]>.csv` inside the folder `data/results/`.

Once the pipeline has terminated, the results can be loaded and examined as follows:
```python
df = client.load_results()
print(df.head(n=10))
```

If the client has been used to run multiple pipelines, an overview of the available results (for a given instance of `FraudCrawlerClient`) can be obtained with
```python
client.print_available_results()
```

## Contributing
see `CONTRIBUTING.md`

### Async Setup
The following image provides a schematic representation of the package's async setup.

fraudcrawler-0.3.3/fraudcrawler/__init__.py
@@ -0,0 +1,30 @@
from fraudcrawler.scraping.serp import SerpApi
from fraudcrawler.scraping.enrich import Enricher
from fraudcrawler.scraping.zyte import ZyteApi
from fraudcrawler.processing.processor import Processor
from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
from fraudcrawler.base.client import FraudCrawlerClient
from fraudcrawler.base.base import (
    Deepness,
    Enrichment,
    Host,
    Language,
    Location,
    Prompt,
)

__all__ = [
    "SerpApi",
    "Enricher",
    "ZyteApi",
    "Processor",
    "Orchestrator",
    "ProductItem",
    "FraudCrawlerClient",
    "Language",
    "Location",
    "Host",
    "Deepness",
    "Enrichment",
    "Prompt",
]
fraudcrawler-0.3.3/fraudcrawler/base/__init__.py
File without changes
fraudcrawler-0.3.3/fraudcrawler/base/base.py
@@ -0,0 +1,145 @@
import json
import logging

from pydantic import BaseModel, field_validator, model_validator
from pydantic_settings import BaseSettings
from typing import List

import aiohttp

from fraudcrawler.settings import (
    GOOGLE_LANGUAGES_FILENAME,
    GOOGLE_LOCATIONS_FILENAME,
    PROCESSOR_DEFAULT_IF_MISSING,
)

logger = logging.getLogger(__name__)

# Load google locations and languages
with open(GOOGLE_LOCATIONS_FILENAME, "r") as gfile:
    _locs = json.load(gfile)
_LOCATION_CODES = {loc["name"]: loc["country_code"].lower() for loc in _locs}
with open(GOOGLE_LANGUAGES_FILENAME, "r") as gfile:
    _langs = json.load(gfile)
_LANGUAGE_CODES = {lang["language_name"]: lang["language_code"] for lang in _langs}


# Base classes
class Setup(BaseSettings):
    """Class for loading environment variables."""

    # Crawler ENV variables
    serpapi_key: str
    dataforseo_user: str
    dataforseo_pwd: str
    zyteapi_key: str
    openaiapi_key: str

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"


class Host(BaseModel):
    """Model for host details (e.g. `Host(name="Galaxus", domains="galaxus.ch, digitec.ch")`)."""

    name: str
    domains: str | List[str]

    @field_validator("domains", mode="before")
    def split_domains_if_str(cls, val):
        if isinstance(val, str):
            return [dom.strip() for dom in val.split(",")]
        return val


class Location(BaseModel):
    """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""

    name: str
    code: str = ""

    @model_validator(mode="before")
    def set_code(cls, values):
        """Set the location code if not provided and make it lower case."""
        name = values.get("name")
        code = values.get("code")
        if code is None or not len(code):
            code = _LOCATION_CODES.get(name)
            if code is None:
                raise ValueError(f'Location code not found for location name="{name}"')
        code = code.lower()
        return {"name": name, "code": code}


class Language(BaseModel):
    """Model for language details (e.g. `Language(name="German", code="de")`)."""

    name: str
    code: str = ""

    @model_validator(mode="before")
    def set_code(cls, values):
        """Set the language code if not provided and make it lower case."""
        name = values.get("name")
        code = values.get("code")
        if code is None or not len(code):
            code = _LANGUAGE_CODES.get(name)
            if code is None:
                raise ValueError(f'Language code not found for language name="{name}"')
        code = code.lower()
        return {"name": name, "code": code}


class Enrichment(BaseModel):
    """Model for enriching initial search_term with alternative ones."""

    additional_terms: int
    additional_urls_per_term: int


class Deepness(BaseModel):
    """Model for search depth."""

    num_results: int
    enrichment: Enrichment | None = None


class Prompt(BaseModel):
    """Model for prompts."""

    name: str
    context: str
    system_prompt: str
    allowed_classes: List[int]
    default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING


class AsyncClient:
    """Base class for sub-classes using async HTTP requests."""

    @staticmethod
    async def get(
        url: str,
        headers: dict | None = None,
        params: dict | None = None,
    ) -> dict:
        """Async GET request of a given URL returning the data."""
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url=url, params=params) as response:
                response.raise_for_status()
                json_ = await response.json()
                return json_

    @staticmethod
    async def post(
        url: str,
        headers: dict | None = None,
        data: List[dict] | dict | None = None,
        auth: aiohttp.BasicAuth | None = None,
    ) -> dict:
        """Async POST request of a given URL returning the data."""
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.post(url=url, json=data, auth=auth) as response:
                response.raise_for_status()
                json_ = await response.json()
                return json_
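As a quick illustration of how the models above behave (a hedged sketch; the expected values follow from the validators shown above and from the language/location codes quoted in the README):
```python
from fraudcrawler import Host, Language, Location

# Host's field_validator splits a comma-separated string into a list of domains.
host = Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
print(host.domains)  # ['netdoktor.ch', 'nobelpharma.ch']

# Location and Language resolve their lower-cased codes from the bundled
# google-locations.json / google-languages.json files when no code is given.
print(Location(name="Switzerland").code)  # 'ch'
print(Language(name="German").code)       # 'de'
```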
fraudcrawler-0.3.3/fraudcrawler/base/client.py
@@ -0,0 +1,134 @@
import asyncio
import csv
from datetime import datetime
import logging
from pathlib import Path

from pydantic import BaseModel
from typing import List

import pandas as pd

from fraudcrawler.settings import ROOT_DIR
from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
from fraudcrawler.base.orchestrator import Orchestrator, ProductItem

logger = logging.getLogger(__name__)

_RESULTS_DIR = ROOT_DIR / "data" / "results"


class Results(BaseModel):
    """The results of the product search."""

    search_term: str
    filename: Path | None = None


class FraudCrawlerClient(Orchestrator):
    """The main client for FraudCrawler."""

    _filename_template = "{search_term}_{language}_{location}_{timestamp}.csv"

    def __init__(self):
        setup = Setup()
        super().__init__(
            serpapi_key=setup.serpapi_key,
            dataforseo_user=setup.dataforseo_user,
            dataforseo_pwd=setup.dataforseo_pwd,
            zyteapi_key=setup.zyteapi_key,
            openaiapi_key=setup.openaiapi_key,
        )

        self._results_dir = _RESULTS_DIR
        if not self._results_dir.exists():
            self._results_dir.mkdir(parents=True)
        self._results: List[Results] = []

    async def _collect_results(
        self, queue_in: asyncio.Queue[ProductItem | None]
    ) -> None:
        """Collects the results from the given queue_in and saves them as a csv.

        Args:
            queue_in: The input queue containing the results.
        """
        products = []
        while True:
            product = await queue_in.get()
            if product is None:
                queue_in.task_done()
                break

            products.append(product.model_dump())
            queue_in.task_done()

        # Convert the list of products to a DataFrame
        df = pd.json_normalize(products)
        cols = [c.split(".")[-1] for c in df.columns]
        if len(cols) != len(set(cols)):
            logger.error("Duplicate columns after json_normalize.")
        else:
            df.columns = cols

        # Save the DataFrame to a CSV file
        filename = self._results[-1].filename
        df.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
        logger.info(f"Results saved to {filename}")

    def execute(
        self,
        search_term: str,
        language: Language,
        location: Location,
        deepness: Deepness,
        prompts: List[Prompt],
        marketplaces: List[Host] | None = None,
        excluded_urls: List[Host] | None = None,
    ) -> None:
        """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

        Args:
            search_term: The search term for the query.
            language: The language to use for the query.
            location: The location to use for the query.
            deepness: The search depth and enrichment details.
            prompts: The list of prompts to use for classification.
            marketplaces: The marketplaces to include in the search.
            excluded_urls: The URLs to exclude from the search.
        """
        timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
        filename = self._results_dir / self._filename_template.format(
            search_term=search_term,
            language=language.code,
            location=location.code,
            timestamp=timestamp,
        )
        self._results.append(Results(search_term=search_term, filename=filename))

        asyncio.run(
            super().run(
                search_term=search_term,
                language=language,
                location=location,
                deepness=deepness,
                prompts=prompts,
                marketplaces=marketplaces,
                excluded_urls=excluded_urls,
            )
        )

    def load_results(self, index: int = -1) -> pd.DataFrame:
        """Loads the results from the saved .csv files.

        Args:
            index: The index of the results to load (`index=-1` are the results for the most recent run).
        """
        results = self._results[index]
        return pd.read_csv(results.filename)

    def print_available_results(self) -> None:
        """Prints the available results."""
        n_res = len(self._results)
        for i, res in enumerate(self._results):
            print(f"index={-n_res + i}: {res.search_term} - {res.filename}")