fraudcrawler 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of fraudcrawler might be problematic.

@@ -0,0 +1,30 @@
+ from fraudcrawler.scraping.serp import SerpApi
+ from fraudcrawler.scraping.enrich import Enricher
+ from fraudcrawler.scraping.zyte import ZyteApi
+ from fraudcrawler.processing.processor import Processor
+ from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+ from fraudcrawler.base.client import FraudCrawlerClient
+ from fraudcrawler.base.base import (
+     Deepness,
+     Enrichment,
+     Host,
+     Language,
+     Location,
+     Prompt,
+ )
+
+ __all__ = [
+     "SerpApi",
+     "Enricher",
+     "ZyteApi",
+     "Processor",
+     "Orchestrator",
+     "ProductItem",
+     "FraudCrawlerClient",
+     "Language",
+     "Location",
+     "Host",
+     "Deepness",
+     "Enrichment",
+     "Prompt",
+ ]
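
For orientation (this note and sketch are not part of the package): the hunk above appears to be the package-level `__init__.py`, which re-exports the public API listed in `__all__`. A minimal, hypothetical import sketch, assuming the wheel is installed under its package name `fraudcrawler`:

# Hypothetical usage sketch -- the names come from __all__ above; everything else is illustrative.
from fraudcrawler import (
    FraudCrawlerClient,
    Language,
    Location,
    Deepness,
    Prompt,
)

# The client reads its API keys via the Setup settings class shown further down in this diff.
client = FraudCrawlerClient()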
File without changes
@@ -0,0 +1,145 @@
+ import json
+ import logging
+ from pydantic import BaseModel, field_validator, model_validator
+ from pydantic_settings import BaseSettings
+ from typing import List
+
+ import aiohttp
+
+ from fraudcrawler.settings import (
+     GOOGLE_LANGUAGES_FILENAME,
+     GOOGLE_LOCATIONS_FILENAME,
+     PROCESSOR_DEFAULT_IF_MISSING,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ # Load google locations and languages
+ with open(GOOGLE_LOCATIONS_FILENAME, "r") as gfile:
+     _locs = json.load(gfile)
+ _LOCATION_CODES = {loc["name"]: loc["country_code"].lower() for loc in _locs}
+ with open(GOOGLE_LANGUAGES_FILENAME, "r") as gfile:
+     _langs = json.load(gfile)
+ _LANGUAGE_CODES = {lang["language_name"]: lang["language_code"] for lang in _langs}
+
+
+ # Base classes
+ class Setup(BaseSettings):
+     """Class for loading environment variables."""
+
+     # Crawler ENV variables
+     serpapi_key: str
+     dataforseo_user: str
+     dataforseo_pwd: str
+     zyteapi_key: str
+     openaiapi_key: str
+
+     class Config:
+         env_file = ".env"
+         env_file_encoding = "utf-8"
+
+
+ class Host(BaseModel):
+     """Model for host details (e.g. `Host(name="Galaxus", domains="galaxus.ch, digitec.ch")`)."""
+
+     name: str
+     domains: str | List[str]
+
+     @field_validator("domains", mode="before")
+     def split_domains_if_str(cls, val):
+         if isinstance(val, str):
+             return [dom.strip() for dom in val.split(",")]
+         return val
+
+
+ class Location(BaseModel):
+     """Model for location details (e.g. `Location(name="Switzerland", code="ch")`)."""
+
+     name: str
+     code: str = ""
+
+     @model_validator(mode="before")
+     def set_code(cls, values):
+         """Set the location code if not provided and make it lower case."""
+         name = values.get("name")
+         code = values.get("code")
+         if code is None or not len(code):
+             code = _LOCATION_CODES.get(name)
+             if code is None:
+                 raise ValueError(f'Location code not found for location name="{name}"')
+         code = code.lower()
+         return {"name": name, "code": code}
+
+
+ class Language(BaseModel):
+     """Model for language details (e.g. `Language(name="German", code="de")`)."""
+
+     name: str
+     code: str = ""
+
+     @model_validator(mode="before")
+     def set_code(cls, values):
+         """Set the language code if not provided and make it lower case."""
+         name = values.get("name")
+         code = values.get("code")
+         if code is None or not len(code):
+             code = _LANGUAGE_CODES.get(name)
+             if code is None:
+                 raise ValueError(f'Language code not found for language name="{name}"')
+         code = code.lower()
+         return {"name": name, "code": code}
+
+
+ class Enrichment(BaseModel):
+     """Model for enriching initial search_term with alternative ones."""
+
+     additional_terms: int
+     additional_urls_per_term: int
+
+
+ class Deepness(BaseModel):
+     """Model for search depth."""
+
+     num_results: int
+     enrichment: Enrichment | None = None
+
+
+ class Prompt(BaseModel):
+     """Model for prompts."""
+
+     name: str
+     context: str
+     system_prompt: str
+     allowed_classes: List[int]
+     default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING
+
+
+ class AsyncClient:
+     """Base class for sub-classes using async HTTP requests."""
+
+     @staticmethod
+     async def get(
+         url: str,
+         headers: dict | None = None,
+         params: dict | None = None,
+     ) -> dict:
+         """Async GET request of a given URL returning the data."""
+         async with aiohttp.ClientSession(headers=headers) as session:
+             async with session.get(url=url, params=params) as response:
+                 response.raise_for_status()
+                 json_ = await response.json()
+                 return json_
+
+     @staticmethod
+     async def post(
+         url: str,
+         headers: dict | None = None,
+         data: List[dict] | dict | None = None,
+         auth: aiohttp.BasicAuth | None = None,
+     ) -> dict:
+         """Async POST request of a given URL returning the data."""
+         async with aiohttp.ClientSession(headers=headers) as session:
+             async with session.post(url=url, json=data, auth=auth) as response:
+                 response.raise_for_status()
+                 json_ = await response.json()
+                 return json_
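
A hedged sketch of how the models and the `AsyncClient` helper defined above behave; the concrete values are illustrative and the URL is a placeholder, not an endpoint the package actually calls:

import asyncio

from fraudcrawler.base.base import AsyncClient, Deepness, Enrichment, Language, Location

# Codes are resolved from the Google locations/languages files loaded at module import
# when omitted, and lower-cased when given explicitly (per the model validators above).
location = Location(name="Switzerland")        # -> code == "ch"
language = Language(name="German", code="DE")  # -> code == "de"

# Search depth with optional enrichment (field names from the models above; numbers are made up).
deepness = Deepness(
    num_results=20,
    enrichment=Enrichment(additional_terms=3, additional_urls_per_term=5),
)

# AsyncClient opens a fresh aiohttp session per call, raises for error statuses,
# and returns the parsed JSON body.
async def fetch(url: str) -> dict:
    return await AsyncClient.get(url)

print(asyncio.run(fetch("https://example.org/api.json")))  # placeholder URL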
@@ -0,0 +1,134 @@
+ import asyncio
+ import csv
+ from datetime import datetime
+ import logging
+ from pathlib import Path
+ from pydantic import BaseModel
+ from typing import List
+
+ import pandas as pd
+
+ from fraudcrawler.settings import ROOT_DIR
+ from fraudcrawler.base.base import Setup, Language, Location, Deepness, Host, Prompt
+ from fraudcrawler.base.orchestrator import Orchestrator, ProductItem
+
+ logger = logging.getLogger(__name__)
+
+ _RESULTS_DIR = ROOT_DIR / "data" / "results"
+
+
+ class Results(BaseModel):
+     """The results of the product search."""
+
+     search_term: str
+     filename: Path | None = None
+
+
+ class FraudCrawlerClient(Orchestrator):
+     """The main client for FraudCrawler."""
+
+     _filename_template = "{search_term}_{language}_{location}_{timestamp}.csv"
+
+     def __init__(self):
+         setup = Setup()
+         super().__init__(
+             serpapi_key=setup.serpapi_key,
+             dataforseo_user=setup.dataforseo_user,
+             dataforseo_pwd=setup.dataforseo_pwd,
+             zyteapi_key=setup.zyteapi_key,
+             openaiapi_key=setup.openaiapi_key,
+         )
+
+         self._results_dir = _RESULTS_DIR
+         if not self._results_dir.exists():
+             self._results_dir.mkdir(parents=True)
+         self._results: List[Results] = []
+
+     async def _collect_results(
+         self, queue_in: asyncio.Queue[ProductItem | None]
+     ) -> None:
+         """Collects the results from the given queue_in and saves them as a CSV file.
+
+         Args:
+             queue_in: The input queue containing the results.
+         """
+         products = []
+         while True:
+             product = await queue_in.get()
+             if product is None:
+                 queue_in.task_done()
+                 break
+
+             products.append(product.model_dump())
+             queue_in.task_done()
+
+         # Convert the list of products to a DataFrame
+         df = pd.json_normalize(products)
+         cols = [c.split(".")[-1] for c in df.columns]
+         if len(cols) != len(set(cols)):
+             logger.error("Duplicate columns after json_normalize.")
+         else:
+             df.columns = cols
+
+         # Save the DataFrame to a CSV file
+         filename = self._results[-1].filename
+         df.to_csv(filename, index=False, quoting=csv.QUOTE_ALL)
+         logger.info(f"Results saved to {filename}")
+
+     def execute(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         deepness: Deepness,
+         prompts: List[Prompt],
+         marketplaces: List[Host] | None = None,
+         excluded_urls: List[Host] | None = None,
+     ) -> None:
+         """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.
+
+         Args:
+             search_term: The search term for the query.
+             language: The language to use for the query.
+             location: The location to use for the query.
+             deepness: The search depth and enrichment details.
+             prompts: The list of prompts to use for classification.
+             marketplaces: The marketplaces to include in the search.
+             excluded_urls: The URLs to exclude from the search.
+         """
+         timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
+         filename = self._results_dir / self._filename_template.format(
+             search_term=search_term,
+             language=language.code,
+             location=location.code,
+             timestamp=timestamp,
+         )
+         self._results.append(Results(search_term=search_term, filename=filename))
+
+         asyncio.run(
+             super().run(
+                 search_term=search_term,
+                 language=language,
+                 location=location,
+                 deepness=deepness,
+                 prompts=prompts,
+                 marketplaces=marketplaces,
+                 excluded_urls=excluded_urls,
+             )
+         )
+
+     def load_results(self, index: int = -1) -> pd.DataFrame:
+         """Loads the results from the saved .csv files.
+
+         Args:
+             index: The index of the results to load (`index=-1` loads the results of the most recent run).
+         """
+
+         results = self._results[index]
+         return pd.read_csv(results.filename)
+
+     def print_available_results(self) -> None:
+         """Prints the available results."""
+         n_res = len(self._results)
+         for i, res in enumerate(self._results):
+             print(f"index={-n_res + i}: {res.search_term} - {res.filename}")