fraudcrawler 0.5.0__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,521 @@
+ from copy import deepcopy
+ import logging
+ from pydantic import BaseModel
+ from typing import List, Literal
+
+ import httpx
+ from openai import AsyncOpenAI
+ from openai.types.chat import ChatCompletion, ParsedChatCompletion
+ from openai.types.responses import (
+     Response,
+     ParsedResponse,
+     ResponseInputImageParam,
+     ResponseInputParam,
+ )
+
+ from fraudcrawler.base.base import ProductItem
+ from fraudcrawler.base.retry import get_async_retry
+ from fraudcrawler.processing.base import (
+     ClassificationResult,
+     UserInputs,
+     Workflow,
+     Context,
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class OpenAIWorkflow(Workflow):
+     """(Abstract) Workflow using OpenAI API calls."""
+
+     _product_prompt_template = "Product Details:\n{product_details}\n\nRelevance:"
+     _product_details_template = "{field_name}:\n{field_value}"
+     _user_inputs_template = "{key}: {val}"
+
+     def __init__(
+         self,
+         http_client: httpx.AsyncClient,
+         name: str,
+         api_key: str,
+         model: str,
+     ):
+         """(Abstract) OpenAI Workflow.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             name: Name of the node (unique identifier)
+             api_key: The OpenAI API key.
+             model: The OpenAI model to use.
+         """
+         super().__init__(name=name)
+         self._http_client = http_client
+         self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
+         self._model = model
+
+     async def _chat_completions_create(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         context: Context,
+         **kwargs,
+     ) -> ChatCompletion:
+         """Calls the OpenAI chat.completions.create endpoint.
+
+         Args:
+             system_prompt: System prompt for the AI model.
+             user_prompt: User prompt for the AI model.
+             context: Logging context for retry logs.
+         """
+         cntx = deepcopy(context)
+         cntx["endpoint"] = "chat.completions.create"
+
+         # Perform the request and retry if necessary. There is some context-aware logging:
+         # - `before`: before the request is made (or before retrying)
+         # - `before_sleep`: if the request fails before sleeping
+         retry = get_async_retry()
+         retry.before = lambda retry_state: self._log_before(
+             context=cntx,
+             retry_state=retry_state,
+         )
+         retry.before_sleep = lambda retry_state: self._log_before_sleep(
+             context=cntx,
+             retry_state=retry_state,
+         )
+         async for attempt in retry:
+             with attempt:
+                 response = await self._client.chat.completions.create(
+                     model=self._model,
+                     messages=[
+                         {"role": "system", "content": system_prompt},
+                         {"role": "user", "content": user_prompt},
+                     ],
+                     **kwargs,
+                 )
+         return response
+
+     async def _chat_completions_parse(
+         self,
+         system_prompt: str,
+         user_prompt: str,
+         response_format: type[BaseModel],
+         context: Context,
+         **kwargs,
+     ) -> ParsedChatCompletion:
+         """Calls the OpenAI chat.completions.parse endpoint.
+
+         Args:
+             system_prompt: System prompt for the AI model.
+             user_prompt: User prompt for the AI model.
+             response_format: The model into which the response should be parsed.
+             context: Logging context for retry logs.
+         """
+         cntx = deepcopy(context)
+         cntx["endpoint"] = "chat.completions.parse"
+
+         # Perform the request and retry if necessary. There is some context-aware logging:
+         # - `before`: before the request is made (or before retrying)
+         # - `before_sleep`: if the request fails before sleeping
+         retry = get_async_retry()
+         retry.before = lambda retry_state: self._log_before(
+             context=cntx, retry_state=retry_state
+         )
+         retry.before_sleep = lambda retry_state: self._log_before_sleep(
+             context=cntx, retry_state=retry_state
+         )
+         async for attempt in retry:
+             with attempt:
+                 response = await self._client.chat.completions.parse(
+                     model=self._model,
+                     messages=[
+                         {"role": "system", "content": system_prompt},
+                         {"role": "user", "content": user_prompt},
+                     ],
+                     response_format=response_format, # type: ignore[call-arg]
+                     **kwargs,
+                 )
+         return response
+
+     @staticmethod
+     def _get_input_param(
+         image_url: str,
+         system_prompt: str,
+         user_prompt: str,
+         detail: Literal["low", "high", "auto"],
+     ) -> ResponseInputParam:
+         # Prepare openai parameters
+         image_param: ResponseInputImageParam = {
+             "type": "input_image",
+             "image_url": image_url,
+             "detail": detail,
+         }
+         input_param: ResponseInputParam = [
+             {
+                 "role": "system",
+                 "content": system_prompt,
+             },
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "input_text", "text": user_prompt},
+                     image_param,
+                 ],
+             },
+         ]
+         return input_param
+
+     async def _responses_create(
+         self,
+         image_url: str,
+         system_prompt: str,
+         user_prompt: str,
+         context: Context,
+         **kwargs,
+     ) -> Response:
+         """Analyses a base64 encoded image.
+
+         Args:
+             image_url: Raw base64 encoded image with the data URI scheme.
+             system_prompt: System prompt for the AI model.
+             user_prompt: User prompt for the AI model.
+             context: Logging context for retry logs.
+
+         Note:
+             Given the URL of a JPEG image (for example), the image_url is obtained as:
+             ```python
+             import base64
+             import requests
+
+             # Read the image as bytes
+             resp = requests.get(url)
+             resp.raise_for_status()
+             image = resp.content
+
+             # Encode as base64
+             b64 = base64.b64encode(image).decode("utf-8")
+             data_url = f"data:image/jpeg;base64,{b64}"
+             ```
+
+             The extracted text can be obtained via `response.output_text`.
+         """
+         # Prepare variables
+         cntx = deepcopy(context)
+         cntx["endpoint"] = "response.create"
+
+         detail: Literal["low", "high", "auto"] = "high"
+         input_param = self._get_input_param(
+             image_url=image_url,
+             system_prompt=system_prompt,
+             user_prompt=user_prompt,
+             detail=detail,
+         )
+
+         # Extract information from image
+         # Perform the request and retry if necessary. There is some context-aware logging:
+         # - `before`: before the request is made (or before retrying)
+         # - `before_sleep`: if the request fails before sleeping
+         retry = get_async_retry()
+         retry.before = lambda retry_state: self._log_before(
+             context=cntx, retry_state=retry_state
+         )
+         retry.before_sleep = lambda retry_state: self._log_before_sleep(
+             context=cntx, retry_state=retry_state
+         )
+         async for attempt in retry:
+             with attempt:
+                 response = await self._client.responses.create(
+                     model=self._model,
+                     input=input_param,
+                     **kwargs,
+                 )
+         return response
+
+     async def _responses_parse(
+         self,
+         image_url: str,
+         system_prompt: str,
+         user_prompt: str,
+         text_format: type[BaseModel],
+         context: Context,
+         **kwargs,
+     ) -> ParsedResponse:
+         """Analyses a base64 encoded image and parses the output_text into text_format.
+
+         Args:
+             image_url: Raw base64 encoded image with the data URI scheme.
+             system_prompt: System prompt for the AI model.
+             user_prompt: User prompt for the AI model.
+             text_format: The model into which the response should be parsed.
+             context: Logging context for retry logs.
+
+         Note:
+             (cf. :func:`_responses_create`)
+         """
+         # Prepare variables
+         cntx = deepcopy(context)
+         cntx["endpoint"] = "response.parse"
+         detail: Literal["low", "high", "auto"] = "high"
+         input_param = self._get_input_param(
+             image_url=image_url,
+             system_prompt=system_prompt,
+             user_prompt=user_prompt,
+             detail=detail,
+         )
+
+         # Extract information from image
+         # Perform the request and retry if necessary. There is some context-aware logging:
+         # - `before`: before the request is made (or before retrying)
+         # - `before_sleep`: if the request fails before sleeping
+         retry = get_async_retry()
+         retry.before = lambda retry_state: self._log_before(
+             context=cntx, retry_state=retry_state
+         )
+         retry.before_sleep = lambda retry_state: self._log_before_sleep(
+             context=cntx, retry_state=retry_state
+         )
+         async for attempt in retry:
+             with attempt:
+                 response = await self._client.responses.parse(
+                     model=self._model,
+                     input=input_param,
+                     text_format=text_format,
+                     **kwargs,
+                 )
+         return response
+
+     @staticmethod
+     def _product_item_fields_are_valid(product_item_fields: List[str]) -> bool:
+         """Ensure all product_item_fields are valid ProductItem attributes."""
+         return set(product_item_fields).issubset(ProductItem.model_fields.keys())
+
+     def _get_product_details(
+         self, product: ProductItem, product_item_fields: List[str]
+     ) -> str:
+         """Extracts product details based on the configuration.
+
+         Args:
+             product: The product item to extract details from.
+             product_item_fields: The product item fields to use.
+         """
+         if not self._product_item_fields_are_valid(
+             product_item_fields=product_item_fields
+         ):
+             not_valid_fields = set(product_item_fields) - set(
+                 ProductItem.model_fields.keys()
+             )
+             raise ValueError(f"Invalid product_item_fields: {not_valid_fields}.")
+
+         details = []
+         for name in product_item_fields:
+             if value := getattr(product, name, None):
+                 details.append(
+                     self._product_details_template.format(
+                         field_name=name, field_value=value
+                     )
+                 )
+             else:
+                 logger.warning(
+                     f'Field "{name}" is missing in ProductItem with url="{product.url}"'
+                 )
+         return "\n\n".join(details)
+
+     async def _get_prompt_from_product_details(
+         self, product: ProductItem, product_item_fields: List[str]
+     ) -> str:
+         """Forms and returns the product related part for the user_prompt."""
+
+         # Form the product details from the ProductItem
+         product_details = self._get_product_details(
+             product=product, product_item_fields=product_item_fields
+         )
+         if not product_details:
+             raise ValueError(
+                 f"Missing product_details for product_item_fields={product_item_fields}."
+             )
+
+         # Create user prompt
+         product_prompt = self._product_prompt_template.format(
+             product_details=product_details,
+         )
+         return product_prompt
+
+     async def _get_prompt_from_user_inputs(self, user_inputs: UserInputs) -> str:
+         """Forms and returns the user_inputs part for the user_prompt."""
+         user_inputs_strings = [
+             self._user_inputs_template.format(key=k, val=v)
+             for k, v in user_inputs.items()
+         ]
+         user_inputs_joined = "\n".join(user_inputs_strings)
+         return f"User Inputs:\n{user_inputs_joined}"
+
+
+ class OpenAIClassification(OpenAIWorkflow):
+     """OpenAI classification workflow that makes a single API call, using specific product_item fields to set up the context.
+
+     Note:
+         The system prompt sets the classes to be produced. They must be contained in allowed_classes.
+         The fields declared in product_item_fields are concatenated to create the user prompt from
+         which the classification is made.
+     """
+
+     _max_tokens: int = 1
+
+     def __init__(
+         self,
+         http_client: httpx.AsyncClient,
+         name: str,
+         api_key: str,
+         model: str,
+         product_item_fields: List[str],
+         system_prompt: str,
+         allowed_classes: List[int],
+     ):
+         """OpenAI classification workflow.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             name: Name of the workflow (unique identifier)
+             api_key: The OpenAI API key.
+             model: The OpenAI model to use.
+             product_item_fields: Product item fields used to construct the user prompt.
+             system_prompt: System prompt for the AI model.
+             allowed_classes: Allowed classes for model output (must be non-negative).
+         """
+         super().__init__(
+             http_client=http_client,
+             name=name,
+             api_key=api_key,
+             model=model,
+         )
+         self._product_item_fields = product_item_fields
+         self._system_prompt = system_prompt
+
+         if not all(ac >= 0 for ac in allowed_classes):
+             raise ValueError("Values of allowed_classes must be >= 0")
+         self._allowed_classes = allowed_classes
+
+     async def _get_user_prompt(self, product: ProductItem) -> str:
+         """Forms and returns the user_prompt."""
+         product_prompt = await self._get_prompt_from_product_details(
+             product=product,
+             product_item_fields=self._product_item_fields,
+         )
+         return product_prompt
+
+     async def _chat_classification(
+         self,
+         product: ProductItem,
+         system_prompt: str,
+         user_prompt: str,
+         **kwargs,
+     ) -> ClassificationResult:
+         """Calls the OpenAI chat endpoint for a classification."""
+         context = {"product.url": product.url}
+         response = await self._chat_completions_create(
+             system_prompt=system_prompt,
+             user_prompt=user_prompt,
+             context=context,
+             **kwargs,
+         )
+
+         if (
+             not response
+             or not (content := response.choices[0].message.content)
+             or not (usage := response.usage)
+         ):
+             raise ValueError(
+                 f'Error calling OpenAI API: response="{response}", content="{content}", usage="{usage}".'
+             )
+
+         # Convert to ClassificationResult object
+         result = int(content.strip())
+         return ClassificationResult(
+             result=result,
+             input_tokens=usage.prompt_tokens,
+             output_tokens=usage.completion_tokens,
+         )
+
+     async def run(self, product: ProductItem) -> ClassificationResult:
+         """Calls the OpenAI API with the user prompt from the product."""
+
+         # Get user prompt
+         user_prompt = await self._get_user_prompt(product=product)
+
+         # Call the OpenAI API
+         try:
+             clfn = await self._chat_classification(
+                 product=product,
+                 system_prompt=self._system_prompt,
+                 user_prompt=user_prompt,
+                 max_tokens=self._max_tokens,
+             )
+
+             # Enforce that the classification is in the allowed classes
+             if clfn.result not in self._allowed_classes:
+                 raise ValueError(
+                     f"classification result={clfn.result} not in allowed_classes={self._allowed_classes}"
+                 )
+
+         except Exception as e:
+             raise Exception(
+                 f'Error classifying product at url="{product.url}" with workflow="{self.name}": {e}'
+             ) from e
+
+         logger.debug(
+             f'Classification for url="{product.url}" (workflow={self.name}): result={clfn.result}, tokens used={clfn.input_tokens + clfn.output_tokens}'
+         )
+         return clfn
+
+
+ class OpenAIClassificationUserInputs(OpenAIClassification):
+     """OpenAI classification workflow that makes a single API call, using specific product_item fields plus user_inputs to set up the context.
+
+     Note:
+         The system prompt sets the classes to be produced. They must be contained in allowed_classes.
+         The fields declared in product_item_fields, together with the user_inputs, are concatenated to
+         create the user prompt from which the classification is made.
+     """
+
+     def __init__(
+         self,
+         http_client: httpx.AsyncClient,
+         name: str,
+         api_key: str,
+         model: str,
+         product_item_fields: List[str],
+         system_prompt: str,
+         allowed_classes: List[int],
+         user_inputs: UserInputs,
+     ):
+         """OpenAI classification workflow from user inputs.
+
+         Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
+             name: Name of the workflow (unique identifier)
+             api_key: The OpenAI API key.
+             model: The OpenAI model to use.
+             product_item_fields: Product item fields used to construct the user prompt.
+             system_prompt: System prompt for the AI model.
+             allowed_classes: Allowed classes for model output.
+             user_inputs: Inputs provided by the user via the frontend.
+         """
+         super().__init__(
+             http_client=http_client,
+             name=name,
+             api_key=api_key,
+             model=model,
+             product_item_fields=product_item_fields,
+             system_prompt=system_prompt,
+             allowed_classes=allowed_classes,
+         )
+         self._user_inputs = user_inputs
+
+     async def _get_user_prompt(self, product: ProductItem) -> str:
+         """Forms the user_prompt from the product details plus user_inputs."""
+         product_prompt = await self._get_prompt_from_product_details(
+             product=product,
+             product_item_fields=self._product_item_fields,
+         )
+         user_inputs_prompt = await self._get_prompt_from_user_inputs(
+             user_inputs=self._user_inputs,
+         )
+         user_prompt = f"{user_inputs_prompt}\n\n{product_prompt}"
+         return user_prompt
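The retry plumbing above (`get_async_retry()` plus per-request `before` / `before_sleep` callbacks, consumed via `async for attempt in retry:` and `with attempt:`) follows tenacity's `AsyncRetrying` pattern. As a point of reference, here is a minimal sketch of what such a helper could look like; the stop/wait defaults are assumptions, not the actual implementation in `fraudcrawler.base.retry`:

```python
# Hedged sketch only: assumes tenacity; the defaults here are illustrative.
from tenacity import AsyncRetrying, stop_after_attempt, wait_exponential


def get_async_retry() -> AsyncRetrying:
    # `before` and `before_sleep` are left unset: the workflows above assign
    # context-aware logging callbacks to them for each request.
    return AsyncRetrying(
        stop=stop_after_attempt(3),                   # hypothetical attempt limit
        wait=wait_exponential(multiplier=1, max=10),  # hypothetical backoff
        reraise=True,                                 # surface the last exception to the caller
    )
```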
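To show how the new classes fit together, here is a hedged usage sketch of `OpenAIClassification`. The import path, the `ProductItem` constructor fields, the model name, and the prompt text are illustrative assumptions; only the constructor parameters and the `run()` return shape come from the code above:

```python
# Hedged usage sketch; import path and ProductItem fields are assumed.
import asyncio

import httpx

from fraudcrawler.base.base import ProductItem
from fraudcrawler.processing.openai import OpenAIClassification  # import path assumed


async def main() -> None:
    async with httpx.AsyncClient() as http_client:
        workflow = OpenAIClassification(
            http_client=http_client,
            name="relevance-check",
            api_key="sk-...",             # OpenAI API key
            model="gpt-4o-mini",          # any chat-capable model
            product_item_fields=["url"],  # must be valid ProductItem fields
            system_prompt=(
                "Classify the product's relevance. "
                "Answer with a single digit: 0 (irrelevant) or 1 (relevant)."
            ),
            allowed_classes=[0, 1],
        )
        product = ProductItem(url="https://example.com/listing/123")  # other fields omitted
        result = await workflow.run(product)
        print(result.result, result.input_tokens, result.output_tokens)


asyncio.run(main())
```

Because `_max_tokens` is 1, the model is expected to answer with a single token holding the class digit; `run()` parses it as an integer and rejects anything outside `allowed_classes`.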
@@ -324,9 +324,10 @@ class Enricher:
  language=language,
  limit=n_terms,
  )
- except Exception as e:
+ except Exception:
  logger.error(
- f"Error fetching suggested keywords for search_term='{search_term}': {e}"
+ f"Fetching suggested keywords for search_term='{search_term}' failed",
+ exc_info=True,
  )
  suggested = []
 
@@ -338,9 +339,10 @@ class Enricher:
  language=language,
  limit=n_terms,
  )
- except Exception as e:
+ except Exception:
  logger.error(
- f"Error fetching related keywords for search_term='{search_term}': {e}"
+ f"Fetching related keywords for search_term='{search_term}' failed",
+ exc_info=True,
  )
  related = []
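The two Enricher hunks above replace string-interpolated exceptions with `exc_info=True`, which attaches the full traceback to the log record instead of only the exception's message. A minimal standalone sketch of the same pattern (the function and the failure are illustrative, not from the package):

```python
# Minimal sketch of the exc_info=True logging pattern used above.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def fetch_suggested_keywords(search_term: str) -> list:
    try:
        raise TimeoutError("keyword service timed out")  # stand-in for a failing API call
    except Exception:
        # The message stays short; the traceback is appended by exc_info=True.
        logger.error(
            f"Fetching suggested keywords for search_term='{search_term}' failed",
            exc_info=True,
        )
        return []


fetch_suggested_keywords("vitamin c serum")
```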