fraudcrawler-0.3.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic.

@@ -0,0 +1,100 @@
+ import logging
+
+ from fraudcrawler import FraudCrawlerClient, Language, Location, Deepness, Prompt
+
+ LOG_FMT = "%(asctime)s | %(name)s | %(funcName)s | %(levelname)s | %(message)s"
+ LOG_LVL = "INFO"
+ DATE_FMT = "%Y-%m-%d %H:%M:%S"
+ logging.basicConfig(format=LOG_FMT, level=LOG_LVL, datefmt=DATE_FMT)
+
+
+ def main():
+     # Setup the client
+     client = FraudCrawlerClient()
+
+     # Setup the search
+     search_term = "Kühlschrank"
+     language = Language(name="German")
+     location = Location(name="Switzerland")
+     deepness = Deepness(num_results=20)
+     prompts = [
+         Prompt(
+             name="relevance",
+             context="This organization is interested in checking the energy efficiency of certain devices.",
+             system_prompt=(
+                 "You are a helpful and intelligent assistant. Your task is to classify any given product "
+                 "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
+                 "You must consider all aspects of the given context and make a binary decision accordingly. "
+                 "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
+                 "Respond only with the number 1 or 0."
+             ),
+             allowed_classes=[0, 1],
+         ),
+         Prompt(
+             name="seriousness",
+             context="This organization is interested in checking the energy efficiency of certain devices.",
+             system_prompt=(
+                 "You are an intelligent and discerning assistant. Your task is to classify each item as either "
+                 "a product for sale (1) or not a product for sale (0). To make this distinction, consider the following criteria: \n"
+                 " 1 Product for Sale (1): Classify as 1 if the result clearly indicates an item available for purchase, typically found "
+                 "within an online shop or marketplace.\n"
+                 " 2 Not a Product for Sale (0): Classify as 0 if the result is unrelated to a direct purchase of a product. This includes items such as: \n"
+                 " - Books and Videos: These may be available for sale, but if they are about or related to the searched product rather than being the "
+                 "exact product itself, classify as 0.\n"
+                 " - Advertisements: Promotional content that doesn't directly sell a product.\n"
+                 " - Companies and Services: Names and descriptions of companies or services related to the product but not the product itself.\n"
+                 " - Related Topics/Content: Any text or media that discusses or elaborates on the topic without offering a tangible product for sale.\n"
+                 "Make your decision based solely on the context and details provided in the search result. Respond only with the number 1 or 0."
+             ),
+             allowed_classes=[0, 1],
+         ),
+     ]
+     # # Optional: Add search-term ENRICHMENT
+     # from fraudcrawler import Enrichment
+
+     # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)
+
+     # # Optional: Add MARKETPLACES and EXCLUDED_URLS
+     # from fraudcrawler import Host
+
+     # marketplaces = [
+     #     Host(name="International", domains="zavamed.com,apomeds.com"),
+     #     Host(name="National", domains="netdoktor.ch,nobelpharma.ch"),
+     # ]
+     # excluded_urls = [
+     #     Host(name="Digitec", domains="digitec.ch"),
+     #     Host(name="Brack", domains="brack.ch"),
+     # ]
+
+     # Execute the pipeline
+     client.execute(
+         search_term=search_term,
+         language=language,
+         location=location,
+         deepness=deepness,
+         prompts=prompts,
+         # marketplaces=marketplaces,
+         # excluded_urls=excluded_urls,
+     )
+
+     # Show results
+     print()
+     title = "Available results"
+     print(title)
+     print("=" * len(title))
+     client.print_available_results()
+     print()
+     title = f'Results for "{search_term.upper()}"'
+     print(title)
+     print("=" * len(title))
+     df = client.load_results()
+     print(f"Number of products found: {len(df)}")
+     print()
+     n_head = 10
+     print(f"First {n_head} products are:")
+     print(df.head(n=n_head))
+     print()
+
+
+ if __name__ == "__main__":
+     main()
File without changes
@@ -0,0 +1,105 @@
+ import logging
+
+ from openai import AsyncOpenAI
+
+ from fraudcrawler.base.base import Prompt
+ from fraudcrawler.settings import PROCESSOR_USER_PROMPT_TEMPLATE
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Processor:
+     """Processes product data for classification based on a prompt configuration."""
+
+     def __init__(self, api_key: str, model: str):
+         """Initializes the Processor.
+
+         Args:
+             api_key: The OpenAI API key.
+             model: The OpenAI model to use.
+         """
+         self._client = AsyncOpenAI(api_key=api_key)
+         self._model = model
+
+     async def _call_openai_api(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         **kwargs,
+     ) -> str:
+         """Calls the OpenAI API with the given user prompt."""
+         response = await self._client.chat.completions.create(
+             model=self._model,
+             messages=[
+                 {"role": "system", "content": system_prompt},
+                 {"role": "user", "content": user_prompt},
+             ],
+             **kwargs,
+         )
+         content = response.choices[0].message.content
+         if not content:
+             raise ValueError("Empty response from OpenAI API")
+         return content
+
+     async def classify(
+         self, prompt: Prompt, url: str, name: str | None, description: str | None
+     ) -> int:
+         """A generic classification method that classifies a product based on a prompt object.
+
+         Args:
+             prompt: The Prompt object providing the context, system_prompt, allowed_classes, and fallback.
+             url: Product URL (often used in the user_prompt).
+             name: Product name (often used in the user_prompt).
+             description: Product description (often used in the user_prompt).
+
+         Note:
+             This method returns `prompt.default_if_missing` if:
+                 - 'name' or 'description' is None,
+                 - an error occurs during the API call, or
+                 - the response isn't in allowed_classes.
+         """
+         # If required fields are missing, return the prompt's default fallback if provided.
+         if name is None or description is None:
+             logger.warning(
+                 f"Missing required fields for classification: name='{name}', description='{description}'"
+             )
+             return prompt.default_if_missing
+
+         # Substitute placeholders in user_prompt with the relevant arguments
+         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
+             context=prompt.context,
+             url=url,
+             name=name,
+             description=description,
+         )
+
+         # Call the OpenAI API
+         try:
+             logger.debug(
+                 f'Calling OpenAI API for classification (name="{name}", prompt="{prompt.name}")'
+             )
+             content = await self._call_openai_api(
+                 system_prompt=prompt.system_prompt,
+                 user_prompt=user_prompt,
+                 max_tokens=1,
+             )
+             classification = int(content.strip())
+
+             # Enforce that the classification is in the allowed classes
+             if classification not in prompt.allowed_classes:
+                 logger.warning(
+                     f"Classification '{classification}' not in allowed classes {prompt.allowed_classes}"
+                 )
+                 return prompt.default_if_missing
+
+             logger.info(
+                 f'Classification for "{name}" (prompt={prompt.name}): {classification}'
+             )
+             return classification
+
+         except Exception as e:
+             logger.error(
+                 f'Error classifying product "{name}" with prompt "{prompt.name}": {e}'
+             )
+             return prompt.default_if_missing
File without changes
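For orientation, here is a minimal sketch of how the Processor above might be driven. It is an illustration under assumptions, not part of the package: the import path, API key, and model name are placeholders, and default_if_missing is passed explicitly only because classify() reads that attribute; the remaining Prompt fields match the sample script.

import asyncio

from fraudcrawler.base.base import Prompt
from fraudcrawler.processor import Processor  # hypothetical import path

async def demo():
    # Placeholder credentials and model name.
    processor = Processor(api_key="sk-...", model="gpt-4o-mini")
    prompt = Prompt(
        name="relevance",
        context="This organization is interested in checking the energy efficiency of certain devices.",
        system_prompt="Classify the product as relevant (1) or not relevant (0). Respond only with the number 1 or 0.",
        allowed_classes=[0, 1],
        default_if_missing=0,  # assumed field; classify() falls back to it on errors
    )
    label = await processor.classify(
        prompt=prompt,
        url="https://example.com/product/123",
        name="Kühlschrank XYZ",
        description="Energy-efficient refrigerator, 200 l.",
    )
    print(label)  # 1 or 0

asyncio.run(demo())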
@@ -0,0 +1,303 @@
+ from base64 import b64encode
+ from collections import defaultdict
+ import logging
+ from pydantic import BaseModel
+ from typing import Dict, List, Iterator
+
+ from fraudcrawler.settings import ENRICHMENT_DEFAULT_LIMIT
+ from fraudcrawler.base.base import Location, Language, AsyncClient
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class Keyword(BaseModel):
+     """Model for keyword details (e.g. `Keyword(text="sildenafil", volume=100)`)."""
+
+     text: str
+     volume: int
+
+
+ class Enricher(AsyncClient):
+     """A client to interact with the DataForSEO API for enhancing searches (producing alternative search_terms)."""
+
+     _auth_encoding = "ascii"
+     _max_retries = 3
+     _retry_delay = 2
+     _base_endpoint = "https://api.dataforseo.com"
+     _suggestions_endpoint = "/v3/dataforseo_labs/google/keyword_suggestions/live"
+     _keywords_endpoint = "/v3/dataforseo_labs/google/related_keywords/live"
+
+     def __init__(self, user: str, pwd: str):
+         """Initializes the DataForSeoApiClient with the given username and password.
+
+         Args:
+             user: The username for the DataForSEO API.
+             pwd: The password for the DataForSEO API.
+         """
+         self._user = user
+         self._pwd = pwd
+         auth = f"{user}:{pwd}"
+         auth = b64encode(auth.encode(self._auth_encoding)).decode(self._auth_encoding)
+         self._headers = {
+             "Authorization": f"Basic {auth}",
+             "Content-Encoding": "gzip",
+         }
+
+     @staticmethod
+     def _extract_items_from_data(data: dict) -> Iterator[dict]:
+         """Extracts the items from the DataForSEO response.
+
+         Args:
+             data: The response data from DataForSEO.
+         """
+         # `data.get("tasks") or []` also covers the case where data["tasks"] is
+         # explicitly set to None, which `data.get("tasks", [])` would not.
+         tasks = data.get("tasks") or []
+         for task in tasks:
+             results = task.get("result") or []
+             for result in results:
+                 items = result.get("items") or []
+                 yield from items
+
+     @staticmethod
+     def _parse_suggested_keyword(item: dict) -> Keyword:
+         """Parses a keyword from an item in the DataForSEO suggested keyword search response.
+
+         Args:
+             item: An item from the DataForSEO response.
+         """
+         text = item["keyword"]
+         volume = item["keyword_info"]["search_volume"]
+         return Keyword(text=text, volume=volume)
+
+     def _extract_suggested_keywords(self, data: dict) -> List[Keyword]:
+         """Extracts the keywords from the DataForSEO response for suggested keywords.
+
+         Args:
+             data: The response data from DataForSEO.
+
+         The DataForSEO results are of the form
+         (c.f. https://docs.dataforseo.com/v3/dataforseo_labs/google/keyword_suggestions/live/?bash):
+             {
+                 "tasks": [
+                     {
+                         "result": [
+                             {
+                                 "items": [
+                                     {
+                                         "keyword": <suggested-keyword>,
+                                         "keyword_info": {
+                                             "search_volume": <volume>
+                                         }
+                                     }
+                                 ]
+                             }
+                         ]
+                     }
+                 ]
+             }
+         """
+         keywords = []
+         for item in self._extract_items_from_data(data=data):
+             try:
+                 keyword = self._parse_suggested_keyword(item)
+                 keywords.append(keyword)
+             except Exception as e:
+                 logger.warning(f"Ignoring keyword due to error: {e}.")
+         return keywords
+
+     async def _get_suggested_keywords(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         limit: int = ENRICHMENT_DEFAULT_LIMIT,
+     ) -> List[Keyword]:
+         """Get keyword suggestions for a given search_term.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the search.
+             location: The location to use for the search.
+             limit: The upper limit of suggestions to get.
+         """
+
+         # Data must be a list of dictionaries setting a number of search tasks; here we only have one task.
+         data = [
+             {
+                 "keyword": search_term,
+                 "language_name": language.name,
+                 "location_name": location.name,
+                 "limit": limit,
+                 "include_serp_info": True,
+                 "include_seed_keyword": True,
+             }
+         ]
+         logger.debug(
+             f'DataForSEO search for suggested keywords with search_term="{search_term}".'
+         )
+         try:
+             url = f"{self._base_endpoint}{self._suggestions_endpoint}"
+             logger.debug(f'DataForSEO url="{url}" with data="{data}".')
+             sugg_data = await self.post(url=url, headers=self._headers, data=data)
+         except Exception as e:
+             logger.error(f"DataForSEO suggested search failed with error: {e}.")
+             # Bail out early: sugg_data is undefined past this point.
+             return []
+
+         # Extract the keywords from the response
+         try:
+             keywords = self._extract_suggested_keywords(data=sugg_data)
+         except Exception as e:
+             logger.error(
+                 f"Failed to extract suggested keywords from DataForSEO response with error: {e}."
+             )
+             return []
+
+         logger.debug(f"Found {len(keywords)} suggestions from DataForSEO search.")
+         return keywords
+
+     @staticmethod
+     def _parse_related_keyword(item: dict) -> Keyword:
+         """Parses a keyword from an item in the DataForSEO related keyword search response.
+
+         Args:
+             item: An item from the DataForSEO response.
+         """
+         text = item["keyword_data"]["keyword"]
+         volume = item["keyword_data"]["keyword_info"]["search_volume"]
+         return Keyword(text=text, volume=volume)
+
+     def _extract_related_keywords(self, data: dict) -> List[Keyword]:
+         """Extracts the keywords from the DataForSEO response for related keywords.
+
+         Args:
+             data: The response data from DataForSEO.
+
+         The DataForSEO results are of the form
+         (c.f. https://docs.dataforseo.com/v3/dataforseo_labs/google/related_keywords/live/?bash):
+             {
+                 "tasks": [
+                     {
+                         "result": [
+                             {
+                                 "items": [
+                                     {
+                                         "keyword_data": {
+                                             "keyword": <related-keyword>,
+                                             "keyword_info": {
+                                                 "search_volume": <volume>
+                                             }
+                                         }
+                                     }
+                                 ]
+                             }
+                         ]
+                     }
+                 ]
+             }
+         """
+         keywords = []
+         for item in self._extract_items_from_data(data=data):
+             try:
+                 keyword = self._parse_related_keyword(item)
+                 keywords.append(keyword)
+             except Exception as e:
+                 logger.warning(f"Ignoring keyword due to error: {e}.")
+         return keywords
+
+     async def _get_related_keywords(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         limit: int = ENRICHMENT_DEFAULT_LIMIT,
+     ) -> List[Keyword]:
+         """Get related keywords for a given search_term.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the search.
+             location: The location to use for the search.
+             limit: The upper limit of suggestions to get.
+         """
+
+         # Data must be a list of dictionaries setting a number of search tasks; here we only have one task.
+         data = [
+             {
+                 "keyword": search_term,
+                 "language_name": language.name,
+                 "location_name": location.name,
+                 "limit": limit,
+             }
+         ]
+         logger.debug(
+             f'DataForSEO search for related keywords with search_term="{search_term}".'
+         )
+         try:
+             url = f"{self._base_endpoint}{self._keywords_endpoint}"
+             logger.debug(f'DataForSEO url="{url}" with data="{data}".')
+             rel_data = await self.post(url=url, headers=self._headers, data=data)
+         except Exception as e:
+             logger.error(f"DataForSEO related keyword search failed with error: {e}.")
+             # Bail out early: rel_data is undefined past this point.
+             return []
+
+         # Extract the keywords from the response
+         try:
+             keywords = self._extract_related_keywords(data=rel_data)
+         except Exception as e:
+             logger.error(
+                 f"Failed to extract related keywords from DataForSEO response with error: {e}."
+             )
+             return []
+
+         logger.debug(f"Found {len(keywords)} related keywords from DataForSEO search.")
+         return keywords
+
+     async def apply(
+         self,
+         search_term: str,
+         language: Language,
+         location: Location,
+         n_terms: int,
+     ) -> List[str]:
+         """Applies the enrichment to a search_term.
+
+         Args:
+             search_term: The search term to use for the query.
+             language: The language to use for the search.
+             location: The location to use for the search.
+             n_terms: The number of additional terms to produce.
+         """
+         # Get the additional keywords
+         logger.info(
+             f'Applying enrichment for search_term="{search_term}" and n_terms="{n_terms}".'
+         )
+         suggested = await self._get_suggested_keywords(
+             search_term=search_term,
+             language=language,
+             location=location,
+             limit=n_terms,
+         )
+         related = await self._get_related_keywords(
+             search_term=search_term,
+             language=language,
+             location=location,
+             limit=n_terms,
+         )
+
+         # Remove the original keyword and deduplicate, keeping the maximal volume per keyword
+         keywords = [kw for kw in suggested + related if kw.text != search_term]
+         kw_vol: Dict[str, int] = defaultdict(int)
+         for kw in keywords:
+             kw_vol[kw.text] = max(kw.volume, kw_vol[kw.text])
+         keywords = [Keyword(text=k, volume=v) for k, v in kw_vol.items()]
+         logger.debug(f"Found {len(keywords)} additional unique keywords.")
+
+         # Sort the keywords by volume and keep the top n_terms
+         keywords = sorted(keywords, key=lambda kw: kw.volume, reverse=True)
+         terms = [kw.text for kw in keywords[:n_terms]]
+         logger.info(f"Produced {len(terms)} additional search_terms.")
+         return terms
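Similarly, a minimal sketch of how Enricher.apply might be called, assuming valid DataForSEO credentials; the import path is a placeholder, while Language and Location are the same models used in the sample script:

import asyncio

from fraudcrawler.base.base import Language, Location
from fraudcrawler.enricher import Enricher  # hypothetical import path

async def demo():
    # Placeholder DataForSEO credentials.
    enricher = Enricher(user="dfs-login", pwd="dfs-password")
    terms = await enricher.apply(
        search_term="Kühlschrank",
        language=Language(name="German"),
        location=Location(name="Switzerland"),
        n_terms=10,
    )
    print(terms)  # up to 10 alternative search terms, highest search volume first

asyncio.run(demo())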