fraudcrawler 0.7.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic. Click here for more details.

@@ -0,0 +1,520 @@
1
+ import logging
2
+ from pydantic import BaseModel
3
+ from typing import Dict, List, Literal, TypeAlias
4
+
5
+ import httpx
6
+ from openai import AsyncOpenAI
7
+ from openai.types.chat import ChatCompletion, ParsedChatCompletion
8
+ from openai.types.responses import (
9
+ Response,
10
+ ParsedResponse,
11
+ ResponseInputImageParam,
12
+ ResponseInputParam,
13
+ )
14
+ from tenacity import RetryCallState
15
+
16
+ from fraudcrawler.base.base import ProductItem
17
+ from fraudcrawler.base.retry import get_async_retry
18
+ from fraudcrawler.processing.base import (
19
+ ClassificationResult,
20
+ UserInputs,
21
+ Workflow,
22
+ )
23
+
24
# Module-level logger, standard `getLogger(__name__)` convention.
logger = logging.getLogger(__name__)

# Logging context: free-form key/value string pairs that are echoed in the
# retry log messages (e.g. {"product.url": ...}).
Context: TypeAlias = Dict[str, str]
27
+
28
+
29
class OpenAIWorkflow(Workflow):
    """(Abstract) Workflow using OpenAI API calls.

    Provides retry-wrapped helpers around the OpenAI ``chat.completions`` and
    ``responses`` endpoints (both ``create`` and structured ``parse`` variants)
    for concrete subclasses to build on.
    """

    def __init__(
        self,
        http_client: httpx.AsyncClient,
        name: str,
        api_key: str,
        model: str,
    ):
        """(Abstract) OpenAI Workflow.

        Args:
            http_client: An httpx.AsyncClient to use for the async requests.
            name: Name of the node (unique identifier)
            api_key: The OpenAI API key.
            model: The OpenAI model to use.
        """
        super().__init__(name=name)
        self._http_client = http_client
        self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
        self._model = model

    def _log_before(
        self, endpoint: str, context: Context, retry_state: RetryCallState
    ) -> None:
        """Context aware logging before the request is made."""
        if retry_state:
            logger.debug(
                f"Workflow={self.name} calls endpoint={endpoint} within context={context} (Attempt {retry_state.attempt_number})."
            )
        else:
            logger.debug(f"retry_state is {retry_state}; not logging before.")

    def _log_before_sleep(
        self, endpoint: str, context: Context, retry_state: RetryCallState
    ) -> None:
        """Context aware logging before sleeping after a failed request."""
        if retry_state and retry_state.outcome:
            logger.warning(
                f"Attempt {retry_state.attempt_number} of workflow={self.name} "
                f"calling endpoint={endpoint} within context={context} "
                f"failed with error: {retry_state.outcome.exception()}. "
                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
            )

    def _get_retry(self, endpoint: str, context: Context):
        """Builds the async retry controller with context aware logging attached.

        The controller logs
          - `before`: before the request is made (or before retrying)
          - `before_sleep`: if the request fails, before sleeping

        Args:
            endpoint: Endpoint name, used only in the log messages.
            context: Logging context for retry logs.

        Returns:
            The configured async retry controller from :func:`get_async_retry`.
        """
        retry = get_async_retry()
        retry.before = lambda retry_state: self._log_before(
            endpoint=endpoint, context=context, retry_state=retry_state
        )
        retry.before_sleep = lambda retry_state: self._log_before_sleep(
            endpoint=endpoint, context=context, retry_state=retry_state
        )
        return retry

    async def _chat_completions_create(
        self,
        system_prompt: str,
        user_prompt: str,
        context: Context,
        **kwargs,
    ) -> ChatCompletion:
        """Calls the OpenAI chat.completions.create endpoint.

        Args:
            system_prompt: System prompt for the AI model.
            user_prompt: User prompt for the AI model.
            context: Logging context for retry logs.
        """
        endpoint = "chat.completions.create"

        # Perform the request; retries and context aware logging are handled
        # by the shared retry controller.
        retry = self._get_retry(endpoint=endpoint, context=context)
        async for attempt in retry:
            with attempt:
                response = await self._client.chat.completions.create(
                    model=self._model,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt},
                    ],
                    **kwargs,
                )
                return response

    async def _chat_completions_parse(
        self,
        system_prompt: str,
        user_prompt: str,
        response_format: type[BaseModel],
        context: Context,
        **kwargs,
    ) -> ParsedChatCompletion:
        """Calls the OpenAI chat.completions.parse endpoint.

        Args:
            system_prompt: System prompt for the AI model.
            user_prompt: User prompt for the AI model.
            response_format: The model into which the response should be parsed.
            context: Logging context for retry logs.
        """
        endpoint = "chat.completions.parse"

        # Perform the request; retries and context aware logging are handled
        # by the shared retry controller.
        retry = self._get_retry(endpoint=endpoint, context=context)
        async for attempt in retry:
            with attempt:
                response = await self._client.chat.completions.parse(
                    model=self._model,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt},
                    ],
                    response_format=response_format,  # type: ignore[call-arg]
                    **kwargs,
                )
                return response

    @staticmethod
    def _get_input_param(
        image_url: str,
        system_prompt: str,
        user_prompt: str,
        detail: Literal["low", "high", "auto"],
    ) -> ResponseInputParam:
        """Builds the `input` payload (system + user message with an image) for the responses API.

        Args:
            image_url: Base64 data-URL (or plain URL) of the image to analyse.
            system_prompt: System prompt for the AI model.
            user_prompt: User prompt accompanying the image.
            detail: Image detail level requested from the model.
        """
        # Prepare openai parameters
        image_param: ResponseInputImageParam = {
            "type": "input_image",
            "image_url": image_url,
            "detail": detail,
        }
        input_param: ResponseInputParam = [
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": [
                    {"type": "input_text", "text": user_prompt},
                    image_param,
                ],
            },
        ]
        return input_param

    async def _responses_create(
        self,
        image_url: str,
        system_prompt: str,
        user_prompt: str,
        context: Context,
        **kwargs,
    ) -> Response:
        """Analyses a base64 encoded image.

        Args:
            image_url: Raw base64 encoded image with the data URI scheme.
            system_prompt: System prompt for the AI model.
            user_prompt: User prompt for the AI model.
            context: Logging context for retry logs.

        Note:
            Having the url of a jpeg image (for example), the image_url is obtained as:
            ```python
            import requests

            # Read images as bytes
            resp = requests.get(url)
            resp.raise_for_status()
            image = resp.content

            # Encode as base64
            b64 = base64.b64encode(image).decode("utf-8")
            data_url = f"data:image/jpeg;base64,{b64}"
            ```

            The extracted text can be obtained by `response.output_text`
        """
        # Prepare variables
        endpoint = "response.create"
        detail: Literal["low", "high", "auto"] = "high"
        input_param = self._get_input_param(
            image_url=image_url,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            detail=detail,
        )

        # Extract information from image; retries and context aware logging
        # are handled by the shared retry controller.
        retry = self._get_retry(endpoint=endpoint, context=context)
        async for attempt in retry:
            with attempt:
                response = await self._client.responses.create(
                    model=self._model,
                    input=input_param,
                    **kwargs,
                )
                return response

    async def _responses_parse(
        self,
        image_url: str,
        system_prompt: str,
        user_prompt: str,
        text_format: type[BaseModel],
        context: Context,
        **kwargs,
    ) -> ParsedResponse:
        """Analyses a base64 encoded image and parses the output_text into response_format.

        Args:
            image_url: Raw base64 encoded image with the data URI scheme.
            system_prompt: System prompt for the AI model.
            user_prompt: User prompt for the AI model.
            text_format: The model into which the response should be parsed.
            context: Logging context for retry logs.

        Note:
            (c.f. :func:`_responses_create`)
        """
        # Prepare variables
        endpoint = "response.parse"
        detail: Literal["low", "high", "auto"] = "high"
        input_param = self._get_input_param(
            image_url=image_url,
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            detail=detail,
        )

        # Extract information from image; retries and context aware logging
        # are handled by the shared retry controller.
        retry = self._get_retry(endpoint=endpoint, context=context)
        async for attempt in retry:
            with attempt:
                response = await self._client.responses.parse(
                    model=self._model,
                    input=input_param,
                    text_format=text_format,
                    **kwargs,
                )
                return response
296
+
297
+
298
class OpenAIClassification(OpenAIWorkflow):
    """Open AI classification workflow with single API call using specific product_item fields for setting up the context.

    Note:
        The system prompt sets the classes to be produced. They must be contained in allowed classes.
        The fields declared in product_item_fields are concatenated for creating a user prompt from
        which the classification should happen.
    """

    # Templates for assembling the user prompt from the product fields.
    _product_prompt_template = "Product Details:\n{product_details}\n\nRelevance:"
    _product_details_template = "{field_name}:\n{field_value}"
    # The model is expected to answer with a single class token (e.g. "0"/"1").
    _max_tokens: int = 1

    def __init__(
        self,
        http_client: httpx.AsyncClient,
        name: str,
        api_key: str,
        model: str,
        product_item_fields: List[str],
        system_prompt: str,
        allowed_classes: List[int],
    ):
        """Open AI classification workflow.

        Args:
            http_client: An httpx.AsyncClient to use for the async requests.
            name: Name of the workflow (unique identifier)
            api_key: The OpenAI API key.
            model: The OpenAI model to use.
            product_item_fields: Product item fields used to construct the user prompt.
            system_prompt: System prompt for the AI model.
            allowed_classes: Allowed classes for model output (must be non-negative).

        Raises:
            ValueError: If a product_item_field is not a ProductItem attribute,
                or if any allowed class is negative.
        """
        super().__init__(
            http_client=http_client,
            name=name,
            api_key=api_key,
            model=model,
        )

        if not self._product_item_fields_are_valid(
            product_item_fields=product_item_fields
        ):
            not_valid_fields = set(product_item_fields) - set(
                ProductItem.model_fields.keys()
            )
            raise ValueError(
                f"Invalid product_item_fields are given: {not_valid_fields}."
            )
        self._product_item_fields = product_item_fields
        self._system_prompt = system_prompt

        if not all(ac >= 0 for ac in allowed_classes):
            raise ValueError("Values of allowed_classes must be >= 0")
        self._allowed_classes = allowed_classes

    @staticmethod
    def _product_item_fields_are_valid(product_item_fields: List[str]) -> bool:
        """Ensure all product_item_fields are valid ProductItem attributes."""
        return set(product_item_fields).issubset(ProductItem.model_fields.keys())

    def _get_product_details(self, product: ProductItem) -> str:
        """Extracts product details based on the configuration.

        Args:
            product: The product item to extract details from.
        """
        details = []
        for name in self._product_item_fields:
            # NOTE(review): the walrus test also skips falsy values ("" or 0),
            # not only missing attributes — presumably intentional for string
            # fields; confirm if numeric fields are ever configured.
            if value := getattr(product, name, None):
                details.append(
                    self._product_details_template.format(
                        field_name=name, field_value=value
                    )
                )
            else:
                logger.warning(
                    f'Field "{name}" is missing in ProductItem with url="{product.url}"'
                )
        return "\n\n".join(details)

    async def _get_product_prompt(self, product: ProductItem) -> str:
        """Forms and returns the product related part for the user_prompt.

        Raises:
            ValueError: If none of the configured fields yields any details.
        """

        # Form the product details from the ProductItem
        product_details = self._get_product_details(product=product)
        if not product_details:
            raise ValueError(
                f"Missing product_details for product_item_fields={self._product_item_fields}."
            )

        # Create user prompt
        product_prompt = self._product_prompt_template.format(
            product_details=product_details,
        )
        return product_prompt

    async def _get_user_prompt(self, product: ProductItem) -> str:
        """Forms and returns the user_prompt."""
        product_prompt = await self._get_product_prompt(product=product)
        return product_prompt

    async def _chat_classification(
        self,
        product: ProductItem,
        system_prompt: str,
        user_prompt: str,
        **kwargs,
    ) -> ClassificationResult:
        """Calls the OpenAI Chat endpoint for a classification.

        Raises:
            ValueError: If the API response is empty or lacks content/usage,
                or if the returned content is not an integer.
        """
        context = {"product.url": product.url}
        response = await self._chat_completions_create(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            context=context,
            **kwargs,
        )

        if (
            not response
            or not (content := response.choices[0].message.content)
            or not (usage := response.usage)
        ):
            raise ValueError(
                f'Error calling OpenAI API: response="{response}", content="{content}", usage="{usage}".'
            )

        # Convert to ClassificationResult object
        result = int(content.strip())
        return ClassificationResult(
            result=result,
            input_tokens=usage.prompt_tokens,
            output_tokens=usage.completion_tokens,
        )

    async def run(self, product: ProductItem) -> ClassificationResult:
        """Calls the OpenAI API with the user prompt from the product.

        Raises:
            Exception: If the API call fails or the result is not an allowed class.
        """

        # Get user prompt
        user_prompt = await self._get_user_prompt(product=product)

        # Call the OpenAI API
        try:
            clfn = await self._chat_classification(
                product=product,
                system_prompt=self._system_prompt,
                user_prompt=user_prompt,
                max_tokens=self._max_tokens,
            )

            # Enforce that the classification is in the allowed classes
            if clfn.result not in self._allowed_classes:
                raise ValueError(
                    f"classification result={clfn.result} not in allowed_classes={self._allowed_classes}"
                )

        except Exception as e:
            # Chain the cause explicitly so the original error is preserved.
            raise Exception(
                f'Error classifying product at url="{product.url}" with workflow="{self.name}": {e}'
            ) from e

        logger.debug(
            f'Classification for url="{product.url}" (workflow={self.name}): result={clfn.result}, tokens used={clfn.input_tokens + clfn.output_tokens}'
        )
        return clfn
464
+
465
+
466
class OpenAIClassificationUserInputs(OpenAIClassification):
    """Open AI classification workflow with single API call using specific product_item fields plus user_inputs for setting up the context.

    Note:
        The system prompt sets the classes to be produced. They must be contained in allowed classes.
        The fields declared in product_item_fields together with the user_inputs are concatenated for
        creating a user prompt from which the classification should happen.
    """

    _user_inputs_template = "{key}: {val}"

    def __init__(
        self,
        http_client: httpx.AsyncClient,
        name: str,
        api_key: str,
        model: str,
        product_item_fields: List[str],
        system_prompt: str,
        allowed_classes: List[int],
        user_inputs: UserInputs,
    ):
        """Open AI classification workflow from user input.

        Args:
            http_client: An httpx.AsyncClient to use for the async requests.
            name: Name of the workflow (unique identifier)
            api_key: The OpenAI API key.
            model: The OpenAI model to use.
            product_item_fields: Product item fields used to construct the user prompt.
            system_prompt: System prompt for the AI model.
            allowed_classes: Allowed classes for model output.
            user_inputs: Inputs from the frontend by the user.
        """
        super().__init__(
            http_client=http_client,
            name=name,
            api_key=api_key,
            model=model,
            product_item_fields=product_item_fields,
            system_prompt=system_prompt,
            allowed_classes=allowed_classes,
        )
        # Render each user input as "key: value" and cache the combined
        # prompt section once at construction time.
        rendered = "\n".join(
            self._user_inputs_template.format(key=key, val=value)
            for key, value in user_inputs.items()
        )
        self._user_inputs_prompt = f"User Inputs:\n{rendered}"

    async def _get_user_prompt(self, product: ProductItem) -> str:
        """Forms the user_prompt from the product details plus user_inputs."""
        details_section = await super()._get_product_prompt(product=product)
        return f"{self._user_inputs_prompt}\n\n{details_section}"
File without changes