fraudcrawler 0.7.22__py3-none-any.whl → 0.7.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

fraudcrawler/processing/base.py

@@ -3,12 +3,15 @@ import logging
 from pydantic import BaseModel
 from typing import Any, Dict, List, Sequence, TypeAlias
 
+from tenacity import RetryCallState
+
 from fraudcrawler.base.base import ProductItem
 
 logger = logging.getLogger(__name__)
 
 
-UserInputs: TypeAlias = Dict[str, List[str]]
+Context: TypeAlias = Dict[str, str]
+UserInputs: TypeAlias = Dict[str, str | List[str]]
 
 
 class ClassificationResult(BaseModel):
@@ -44,6 +47,25 @@ class Workflow(ABC):
         """
         self.name = name
 
+    def _log_before(self, context: Context, retry_state: RetryCallState) -> None:
+        """Context aware logging before the request is made."""
+        if retry_state:
+            logger.debug(
+                f"Workflow={self.name} retry-call within context={context} (Attempt {retry_state.attempt_number})."
+            )
+        else:
+            logger.debug(f"retry_state is {retry_state}; not logging before.")
+
+    def _log_before_sleep(self, context: Context, retry_state: RetryCallState) -> None:
+        """Context aware logging before sleeping after a failed request."""
+        if retry_state and retry_state.outcome:
+            logger.warning(
+                f"Attempt {retry_state.attempt_number} of workflow={self.name} "
+                f"retry-call within context={context} "
+                f"failed with error: {retry_state.outcome.exception()}. "
+                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
+            )
+
     @abstractmethod
     async def run(self, product: ProductItem) -> WorkflowResult:
         """Runs the workflow."""
fraudcrawler/processing/openai.py

@@ -1,6 +1,7 @@
+from copy import deepcopy
 import logging
 from pydantic import BaseModel
-from typing import Dict, List, Literal, TypeAlias
+from typing import List, Literal
 
 import httpx
 from openai import AsyncOpenAI
@@ -11,7 +12,6 @@ from openai.types.responses import (
     ResponseInputImageParam,
     ResponseInputParam,
 )
-from tenacity import RetryCallState
 
 from fraudcrawler.base.base import ProductItem
 from fraudcrawler.base.retry import get_async_retry
@@ -19,16 +19,19 @@ from fraudcrawler.processing.base import (
     ClassificationResult,
     UserInputs,
     Workflow,
+    Context,
 )
 
 logger = logging.getLogger(__name__)
 
-Context: TypeAlias = Dict[str, str]
-
 
 class OpenAIWorkflow(Workflow):
     """(Abstract) Workflow using OpenAI API calls."""
 
+    _product_prompt_template = "Product Details:\n{product_details}\n\nRelevance:"
+    _product_details_template = "{field_name}:\n{field_value}"
+    _user_inputs_template = "{key}: {val}"
+
     def __init__(
         self,
         http_client: httpx.AsyncClient,
@@ -49,29 +52,6 @@ class OpenAIWorkflow(Workflow):
         self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
         self._model = model
 
-    def _log_before(
-        self, endpoint: str, context: Context, retry_state: RetryCallState
-    ) -> None:
-        """Context aware logging before the request is made."""
-        if retry_state:
-            logger.debug(
-                f"Workflow={self.name} calls endpoint={endpoint} within context={context} (Attempt {retry_state.attempt_number})."
-            )
-        else:
-            logger.debug(f"retry_state is {retry_state}; not logging before.")
-
-    def _log_before_sleep(
-        self, endpoint: str, context: Context, retry_state: RetryCallState
-    ) -> None:
-        """Context aware logging before sleeping after a failed request."""
-        if retry_state and retry_state.outcome:
-            logger.warning(
-                f"Attempt {retry_state.attempt_number} of workflow={self.name} "
-                f"calling endpoint={endpoint} within context={context} "
-                f"failed with error: {retry_state.outcome.exception()}. "
-                f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
-            )
-
     async def _chat_completions_create(
         self,
         system_prompt: str,
@@ -86,17 +66,20 @@ class OpenAIWorkflow(Workflow):
             system_prompt: System prompt for the AI model.
             user_prompt: User prompt for the AI model.
         """
-        endpoint = "chat.completions.create"
+        cntx = deepcopy(context)
+        cntx["endpoint"] = "chat.completions.create"
 
         # Perform the request and retry if necessary. There is some context aware logging
         # - `before`: before the request is made (or before retrying)
         # - `before_sleep`: if the request fails before sleeping
         retry = get_async_retry()
         retry.before = lambda retry_state: self._log_before(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx,
+            retry_state=retry_state,
         )
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx,
+            retry_state=retry_state,
         )
         async for attempt in retry:
             with attempt:
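
Each endpoint helper now copies the caller's context and tags the copy with the endpoint name, so the retry logs identify the failing endpoint without mutating the dict the caller passed in. A tiny standalone illustration of that pattern (the context values below are made up):

    # Sketch of the per-endpoint context tagging used above; values are illustrative.
    from copy import deepcopy
    from typing import Dict

    Context = Dict[str, str]


    def tag_endpoint(context: Context, endpoint: str) -> Context:
        # Copy first so the caller's dict stays untouched across multiple endpoint calls.
        cntx = deepcopy(context)
        cntx["endpoint"] = endpoint
        return cntx


    caller_context: Context = {"workflow": "classification", "url": "https://example.com/item"}
    cntx = tag_endpoint(caller_context, "chat.completions.create")
    print(cntx["endpoint"])               # chat.completions.create
    print("endpoint" in caller_context)   # False -> original context unchanged
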
@@ -126,17 +109,18 @@ class OpenAIWorkflow(Workflow):
             response_format: The model into which the response should be parsed.
             context: Logging context for retry logs.
         """
-        endpoint = "chat.completions.parse"
+        cntx = deepcopy(context)
+        cntx["endpoint"] = "chat.completions.parse"
 
         # Perform the request and retry if necessary. There is some context aware logging
         # - `before`: before the request is made (or before retrying)
         # - `before_sleep`: if the request fails before sleeping
         retry = get_async_retry()
         retry.before = lambda retry_state: self._log_before(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         async for attempt in retry:
             with attempt:
@@ -213,7 +197,9 @@ class OpenAIWorkflow(Workflow):
        The extracted text can be obtained by `response.output_text`
         """
         # Prepare variables
-        endpoint = "response.create"
+        cntx = deepcopy(context)
+        cntx["endpoint"] = "response.create"
+
         detail: Literal["low", "high", "auto"] = "high"
         input_param = self._get_input_param(
             image_url=image_url,
@@ -228,10 +214,10 @@ class OpenAIWorkflow(Workflow):
         # - `before_sleep`: if the request fails before sleeping
         retry = get_async_retry()
         retry.before = lambda retry_state: self._log_before(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         async for attempt in retry:
             with attempt:
@@ -264,7 +250,8 @@ class OpenAIWorkflow(Workflow):
        (c.f. :func:`_responses_create`)
         """
         # Prepare variables
-        endpoint = "response.parse"
+        cntx = deepcopy(context)
+        cntx["endpoint"] = "response.parse"
         detail: Literal["low", "high", "auto"] = "high"
         input_param = self._get_input_param(
             image_url=image_url,
@@ -279,10 +266,10 @@ class OpenAIWorkflow(Workflow):
         # - `before_sleep`: if the request fails before sleeping
         retry = get_async_retry()
         retry.before = lambda retry_state: self._log_before(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
         )
         retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            endpoint=endpoint, context=context, retry_state=retry_state
+            context=cntx, retry_state=retry_state
        )
         async for attempt in retry:
             with attempt:
@@ -294,6 +281,71 @@ class OpenAIWorkflow(Workflow):
                )
         return response
 
+    @staticmethod
+    def _product_item_fields_are_valid(product_item_fields: List[str]) -> bool:
+        """Ensure all product_item_fields are valid ProductItem attributes."""
+        return set(product_item_fields).issubset(ProductItem.model_fields.keys())
+
+    def _get_product_details(
+        self, product: ProductItem, product_item_fields: List[str]
+    ) -> str:
+        """Extracts product details based on the configuration.
+
+        Args:
+            product: The product item to extract details from.
+            product_item_fields: The product item fields to use.
+        """
+        if not self._product_item_fields_are_valid(
+            product_item_fields=product_item_fields
+        ):
+            not_valid_fields = set(product_item_fields) - set(
+                ProductItem.model_fields.keys()
+            )
+            raise ValueError(f"Invalid product_item_fields: {not_valid_fields}.")
+
+        details = []
+        for name in product_item_fields:
+            if value := getattr(product, name, None):
+                details.append(
+                    self._product_details_template.format(
+                        field_name=name, field_value=value
+                    )
+                )
+            else:
+                logger.warning(
+                    f'Field "{name}" is missing in ProductItem with url="{product.url}"'
+                )
+        return "\n\n".join(details)
+
+    async def _get_prompt_from_product_details(
+        self, product: ProductItem, product_item_fields: List[str]
+    ) -> str:
+        """Forms and returns the product related part for the user_prompt."""
+
+        # Form the product details from the ProductItem
+        product_details = self._get_product_details(
+            product=product, product_item_fields=product_item_fields
+        )
+        if not product_details:
+            raise ValueError(
+                f"Missing product_details for product_item_fields={product_item_fields}."
+            )
+
+        # Create user prompt
+        product_prompt = self._product_prompt_template.format(
+            product_details=product_details,
+        )
+        return product_prompt
+
+    async def _get_prompt_from_user_inputs(self, user_inputs: UserInputs) -> str:
+        """Forms and returns the user_inputs part for the user_prompt."""
+        user_inputs_strings = [
+            self._user_inputs_template.format(key=k, val=v)
+            for k, v in user_inputs.items()
+        ]
+        user_inputs_joined = "\n".join(user_inputs_strings)
+        return f"User Inputs:\n{user_inputs_joined}"
+
 
 class OpenAIClassification(OpenAIWorkflow):
     """Open AI classification workflow with single API call using specific product_item fields for setting up the context.
@@ -304,8 +356,6 @@ class OpenAIClassification(OpenAIWorkflow):
    which the classification should happen.
    """

-    _product_prompt_template = "Product Details:\n{product_details}\n\nRelevance:"
-    _product_details_template = "{field_name}:\n{field_value}"
     _max_tokens: int = 1
 
     def __init__(
@@ -335,16 +385,6 @@ class OpenAIClassification(OpenAIWorkflow):
             api_key=api_key,
             model=model,
         )
-
-        if not self._product_item_fields_are_valid(
-            product_item_fields=product_item_fields
-        ):
-            not_valid_fields = set(product_item_fields) - set(
-                ProductItem.model_fields.keys()
-            )
-            raise ValueError(
-                f"Invalid product_item_fields are given: {not_valid_fields}."
-            )
         self._product_item_fields = product_item_fields
         self._system_prompt = system_prompt
 
@@ -352,50 +392,12 @@ class OpenAIClassification(OpenAIWorkflow):
            raise ValueError("Values of allowed_classes must be >= 0")
         self._allowed_classes = allowed_classes
 
-    @staticmethod
-    def _product_item_fields_are_valid(product_item_fields: List[str]) -> bool:
-        """Ensure all product_item_fields are valid ProductItem attributes."""
-        return set(product_item_fields).issubset(ProductItem.model_fields.keys())
-
-    def _get_product_details(self, product: ProductItem) -> str:
-        """Extracts product details based on the configuration.
-
-        Args:
-            product: The product item to extract details from.
-        """
-        details = []
-        for name in self._product_item_fields:
-            if value := getattr(product, name, None):
-                details.append(
-                    self._product_details_template.format(
-                        field_name=name, field_value=value
-                    )
-                )
-            else:
-                logger.warning(
-                    f'Field "{name}" is missing in ProductItem with url="{product.url}"'
-                )
-        return "\n\n".join(details)
-
-    async def _get_product_prompt(self, product: ProductItem) -> str:
-        """Forms and returns the product related part for the user_prompt."""
-
-        # Form the product details from the ProductItem
-        product_details = self._get_product_details(product=product)
-        if not product_details:
-            raise ValueError(
-                f"Missing product_details for product_item_fields={self._product_item_fields}."
-            )
-
-        # Create user prompt
-        product_prompt = self._product_prompt_template.format(
-            product_details=product_details,
-        )
-        return product_prompt
-
     async def _get_user_prompt(self, product: ProductItem) -> str:
         """Forms and returns the user_prompt."""
-        product_prompt = await self._get_product_prompt(product=product)
+        product_prompt = await self._get_prompt_from_product_details(
+            product=product,
+            product_item_fields=self._product_item_fields,
+        )
         return product_prompt
 
     async def _chat_classification(
@@ -472,8 +474,6 @@ class OpenAIClassificationUserInputs(OpenAIClassification):
    creating a user prompt from which the classification should happen.
    """

-    _user_inputs_template = "{key}: {val}"
-
     def __init__(
         self,
         http_client: httpx.AsyncClient,
@@ -506,15 +506,16 @@ class OpenAIClassificationUserInputs(OpenAIClassification):
            system_prompt=system_prompt,
            allowed_classes=allowed_classes,
         )
-        user_inputs_strings = [
-            self._user_inputs_template.format(key=k, val=v)
-            for k, v in user_inputs.items()
-        ]
-        user_inputs_joined = "\n".join(user_inputs_strings)
-        self._user_inputs_prompt = f"User Inputs:\n{user_inputs_joined}"
+        self._user_inputs = user_inputs
 
     async def _get_user_prompt(self, product: ProductItem) -> str:
         """Forms the user_prompt from the product details plus user_inputs."""
-        product_prompt = await super()._get_product_prompt(product=product)
-        user_prompt = f"{self._user_inputs_prompt}\n\n{product_prompt}"
+        product_prompt = await self._get_prompt_from_product_details(
+            product=product,
+            product_item_fields=self._product_item_fields,
+        )
+        user_inputs_prompt = await self._get_prompt_from_user_inputs(
+            user_inputs=self._user_inputs,
+        )
+        user_prompt = f"{user_inputs_prompt}\n\n{product_prompt}"
         return user_prompt
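
After the refactor, both classification workflows assemble their user prompt from the same two helpers: one renders the selected product fields, the other renders the constructor-supplied user_inputs. A standalone sketch of the resulting prompt text, reusing the templates from the diff with made-up product data and user inputs:

    # Illustrative prompt assembly; the templates are copied from the diff,
    # the product and user_inputs values are invented.
    _product_prompt_template = "Product Details:\n{product_details}\n\nRelevance:"
    _product_details_template = "{field_name}:\n{field_value}"
    _user_inputs_template = "{key}: {val}"

    product = {"title": "Vitamin X 500mg", "description": "Dietary supplement."}
    user_inputs = {"keywords": ["vitamin", "supplement"], "language": "en"}

    product_details = "\n\n".join(
        _product_details_template.format(field_name=k, field_value=v)
        for k, v in product.items()
    )
    product_prompt = _product_prompt_template.format(product_details=product_details)
    user_inputs_prompt = "User Inputs:\n" + "\n".join(
        _user_inputs_template.format(key=k, val=v) for k, v in user_inputs.items()
    )

    # Same composition order as _get_user_prompt: user inputs first, then product details.
    user_prompt = f"{user_inputs_prompt}\n\n{product_prompt}"
    print(user_prompt)
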
fraudcrawler/scraping/zyte.py

@@ -150,8 +150,8 @@ class ZyteAPI(DomainUtils):
         """
         product = details.get("product", {})
         gtin_list = product.get("gtin", [])
-
-        if gtin_list and len(gtin_list) > 0:
+
+        if len(gtin_list) > 0:
             # Extract the first GTIN value
             gtin_value = gtin_list[0].get("value")
             if gtin_value:
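
The simplified guard behaves like the old one as long as the Zyte payload either omits "gtin" or supplies a list. A small illustration with a made-up payload (the GTIN value is an arbitrary example):

    # Sketch of the GTIN extraction shown above; the payload shape and value are illustrative.
    details = {"product": {"gtin": [{"type": "ean13", "value": "4006381333931"}]}}

    product = details.get("product", {})
    gtin_list = product.get("gtin", [])  # [] when the "gtin" key is missing

    if len(gtin_list) > 0:
        gtin_value = gtin_list[0].get("value")
        print(gtin_value)  # 4006381333931
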
fraudcrawler-0.7.22.dist-info/METADATA → fraudcrawler-0.7.26.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: fraudcrawler
-Version: 0.7.22
+Version: 0.7.26
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
fraudcrawler-0.7.22.dist-info/RECORD → fraudcrawler-0.7.26.dist-info/RECORD

@@ -8,16 +8,16 @@ fraudcrawler/base/orchestrator.py,sha256=BklS4DNzxbp7yvE2NvBWrDDqnvT4YO7Xh_WXstY
 fraudcrawler/base/retry.py,sha256=bCDd44XO2-lHO8MGvPblD5152-lHt1dOfMAQSmymLO4,1462
 fraudcrawler/launch_demo_pipeline.py,sha256=oZWodtNzA5mhmLNYMS6lglry88NutvH4IxnEWOUtL8M,6179
 fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/processing/base.py,sha256=UkoYxFNZ3BQkXmgJnTtruz8-eIFCtWiquRN_IoEXfM4,4091
-fraudcrawler/processing/openai.py,sha256=7sbFg2NPsn627VDzsfIkKantE2KahGmVkSZ1R10OrzQ,19050
+fraudcrawler/processing/base.py,sha256=vNwbwdaN2WANuo73ZFNqo-FMdN7OMCQ08K5bVUtedtc,5129
+fraudcrawler/processing/openai.py,sha256=iwzJEpbMVluhSABEoA_RCMuCC81_2ujonQExJpR6d_o,18627
 fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 fraudcrawler/scraping/enrich.py,sha256=dGWi9p0JStQYSGscCnsQPHNlAeqjoL2rXZnHFNmPhaQ,13158
 fraudcrawler/scraping/search.py,sha256=qHeUpzv1IpRhdFvaycGtL3FLOwT8rOiF0PfiOH6BmUA,34561
 fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
-fraudcrawler/scraping/zyte.py,sha256=xSHGKo09sX2dgQBrPI7oeoHsVL4qZ8voQLBXRU1XBqM,11102
+fraudcrawler/scraping/zyte.py,sha256=RbZxmRWaDk3AgcB2EPFVzbqwo_RvzAm5TWT9OTpX2gs,11080
 fraudcrawler/settings.py,sha256=q3je0r_jd30x2dzlgfm8GyKcigFdgteOLa8HX188bho,3768
-fraudcrawler-0.7.22.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
-fraudcrawler-0.7.22.dist-info/METADATA,sha256=D749e0ZWDZSn8pjxvHj7RUf5m0D1_qHzRlZPRFqTE9A,5303
-fraudcrawler-0.7.22.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-fraudcrawler-0.7.22.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
-fraudcrawler-0.7.22.dist-info/RECORD,,
+fraudcrawler-0.7.26.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.7.26.dist-info/METADATA,sha256=xkNvjmbS9MVld2gSMJDMGzLf9yJWzlA83cTGgvUuTZg,5303
+fraudcrawler-0.7.26.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+fraudcrawler-0.7.26.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.7.26.dist-info/RECORD,,