fraudcrawler 0.4.7__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fraudcrawler/__init__.py CHANGED
@@ -1,7 +1,7 @@
- from fraudcrawler.scraping.serp import SerpApi, SearchEngine
+ from fraudcrawler.scraping.search import Search, SearchEngineName
  from fraudcrawler.scraping.enrich import Enricher
  from fraudcrawler.scraping.url import URLCollector
- from fraudcrawler.scraping.zyte import ZyteApi
+ from fraudcrawler.scraping.zyte import ZyteAPI
  from fraudcrawler.processing.processor import Processor
  from fraudcrawler.base.orchestrator import Orchestrator
  from fraudcrawler.base.client import FraudCrawlerClient
@@ -13,14 +13,15 @@ from fraudcrawler.base.base import (
      Location,
      Prompt,
      ProductItem,
+     HttpxAsyncClient,
  )

  __all__ = [
-     "SerpApi",
-     "SearchEngine",
+     "Search",
+     "SearchEngineName",
      "Enricher",
      "URLCollector",
-     "ZyteApi",
+     "ZyteAPI",
      "Processor",
      "Orchestrator",
      "ProductItem",
@@ -31,4 +32,5 @@ __all__ = [
      "Deepness",
      "Enrichment",
      "Prompt",
+     "HttpxAsyncClient",
  ]
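
The renames above (SerpApi → Search, SearchEngine → SearchEngineName, ZyteApi → ZyteAPI) plus the newly exported HttpxAsyncClient change what downstream code imports from the package. A minimal sketch of an updated import site, assuming nothing beyond the renamed exports needs to change:

# fraudcrawler 0.4.7:
# from fraudcrawler import SerpApi, SearchEngine, ZyteApi

# fraudcrawler 0.5.0:
from fraudcrawler import Search, SearchEngineName, ZyteAPI, HttpxAsyncClient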
fraudcrawler/base/base.py CHANGED
@@ -7,15 +7,21 @@ from pydantic import (
      model_validator,
  )
  from pydantic_settings import BaseSettings
+ from urllib.parse import urlparse
  import re
- from typing import List, Dict
+ from typing import Any, Dict, List

- import aiohttp
+ import httpx

  from fraudcrawler.settings import (
      GOOGLE_LANGUAGES_FILENAME,
      GOOGLE_LOCATIONS_FILENAME,
  )
+ from fraudcrawler.settings import (
+     DEFAULT_HTTPX_TIMEOUT,
+     DEFAULT_HTTPX_LIMITS,
+     DEFAULT_HTTPX_REDIRECTS,
+ )

  logger = logging.getLogger(__name__)

@@ -130,7 +136,8 @@ class ProductItem(BaseModel):
      search_term: str
      search_term_type: str
      url: str
-     marketplace_name: str
+     url_resolved: str
+     search_engine_name: str
      domain: str

      # Zyte parameters
@@ -180,32 +187,57 @@ class Prompt(BaseModel):
          return val


- class AsyncClient:
-     """Base class for sub-classes using async HTTP requests."""
-
-     @staticmethod
-     async def get(
-         url: str,
-         headers: dict | None = None,
-         params: dict | None = None,
-     ) -> dict:
-         """Async GET request of a given URL returning the data."""
-         async with aiohttp.ClientSession(headers=headers) as session:
-             async with session.get(url=url, params=params) as response:
-                 response.raise_for_status()
-                 json_ = await response.json()
-                 return json_
-
-     @staticmethod
-     async def post(
-         url: str,
-         headers: dict | None = None,
-         data: List[dict] | dict | None = None,
-         auth: aiohttp.BasicAuth | None = None,
-     ) -> dict:
-         """Async POST request of a given URL returning the data."""
-         async with aiohttp.ClientSession(headers=headers) as session:
-             async with session.post(url=url, json=data, auth=auth) as response:
-                 response.raise_for_status()
-                 json_ = await response.json()
-                 return json_
+ class HttpxAsyncClient(httpx.AsyncClient):
+     """Httpx async client that can be used to retain the default settings."""
+
+     def __init__(
+         self,
+         timeout: httpx.Timeout | Dict[str, Any] = DEFAULT_HTTPX_TIMEOUT,
+         limits: httpx.Limits | Dict[str, Any] = DEFAULT_HTTPX_LIMITS,
+         follow_redirects: bool = DEFAULT_HTTPX_REDIRECTS,
+         **kwargs: Any,
+     ) -> None:
+         if isinstance(timeout, dict):
+             timeout = httpx.Timeout(**timeout)
+         if isinstance(limits, dict):
+             limits = httpx.Limits(**limits)
+
+         kwargs.setdefault("timeout", timeout)
+         kwargs.setdefault("limits", limits)
+         kwargs.setdefault("follow_redirects", follow_redirects)
+         super().__init__(**kwargs)
+
+
+ class DomainUtils:
+     """Utility class for domain extraction and normalization.
+
+     Handles domain parsing from URLs, removes common prefixes (www, http/https),
+     and provides consistent domain formatting for search and scraping operations.
+     """
+
+     _hostname_pattern = r"^(?:https?:\/\/)?([^\/:?#]+)"
+
+     def _get_domain(self, url: str) -> str:
+         """Extracts the second-level domain together with the top-level domain (e.g. `google.com`).
+
+         Args:
+             url: The URL to be processed.
+         """
+         # Add scheme; urlparse requires it
+         if not url.startswith(("http://", "https://")):
+             url = "http://" + url
+
+         # Get the hostname
+         hostname = urlparse(url).hostname
+         if hostname is None and (match := re.search(self._hostname_pattern, url)):
+             hostname = match.group(1)
+         if hostname is None:
+             logger.warning(
+                 f'Failed to extract domain from url="{url}"; full url is returned'
+             )
+             return url.lower()
+
+         # Remove www. prefix
+         if hostname and hostname.startswith("www."):
+             hostname = hostname[4:]
+         return hostname.lower()
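
The new HttpxAsyncClient and DomainUtils classes above are small enough to exercise directly. A hedged usage sketch; the timeout values are illustrative (the real defaults come from the DEFAULT_HTTPX_* settings imported above), and DomainUtils is not re-exported in __all__, so it is imported from fraudcrawler.base.base:

import asyncio

from fraudcrawler import HttpxAsyncClient
from fraudcrawler.base.base import DomainUtils


async def main() -> None:
    # Dict values are converted to httpx.Timeout/httpx.Limits in __init__;
    # anything not overridden keeps the package defaults via kwargs.setdefault.
    async with HttpxAsyncClient(
        timeout={"connect": 5.0, "read": 30.0, "write": 30.0, "pool": 5.0},  # illustrative values
    ) as client:
        print(client.timeout, client.follow_redirects)

    # _get_domain adds a missing scheme, strips "www." and lowercases the host.
    utils = DomainUtils()
    print(utils._get_domain("https://www.Google.com/search?q=x"))  # google.com
    print(utils._get_domain("shop.example.ch/product/1"))  # shop.example.ch


asyncio.run(main())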
@@ -4,7 +4,7 @@ from datetime import datetime
  import logging
  from pathlib import Path
  from pydantic import BaseModel
- from typing import List
+ from typing import List, Self

  import pandas as pd

@@ -19,7 +19,7 @@ from fraudcrawler.base.base import (
      ProductItem,
  )
  from fraudcrawler.base.orchestrator import Orchestrator
- from fraudcrawler.scraping.serp import SearchEngine
+ from fraudcrawler.scraping.search import SearchEngineName

  logger = logging.getLogger(__name__)

@@ -53,6 +53,13 @@ class FraudCrawlerClient(Orchestrator):
          self._results_dir.mkdir(parents=True)
          self._results: List[Results] = []

+     async def __aenter__(self) -> Self:
+         await super().__aenter__()  # let base set itself up
+         return self  # so `async with FraudCrawlerClient()` gives you this instance
+
+     async def __aexit__(self, *args, **kwargs) -> None:
+         await super().__aexit__(*args, **kwargs)
+
      async def _collect_results(
          self, queue_in: asyncio.Queue[ProductItem | None]
      ) -> None:
@@ -93,7 +100,8 @@ class FraudCrawlerClient(Orchestrator):
          prompts: List[Prompt],
          marketplaces: List[Host] | None = None,
          excluded_urls: List[Host] | None = None,
-         search_engines: List[SearchEngine | str] | None = None,
+         search_engines: List[SearchEngineName | str] | None = None,
+         previously_collected_urls: List[str] | None = None,
      ) -> None:
          """Runs the pipeline steps: serp, enrich, zyte, process, and collect the results.

@@ -103,8 +111,10 @@ class FraudCrawlerClient(Orchestrator):
              location: The location to use for the query.
              deepness: The search depth and enrichment details.
              prompts: The list of prompts to use for classification.
-             marketplaces: The marketplaces to include in the search.
-             excluded_urls: The URLs to exclude from the search.
+             marketplaces: The marketplaces to include in the search (optional).
+             excluded_urls: The URLs to exclude from the search (optional).
+             search_engines: The list of search engines to use for the search (optional).
+             previously_collected_urls: The urls that have been collected previously and are ignored (optional).
          """
          # Handle results files
          timestamp = datetime.today().strftime("%Y%m%d%H%M%S")
@@ -116,24 +126,30 @@ class FraudCrawlerClient(Orchestrator):
          )
          self._results.append(Results(search_term=search_term, filename=filename))

-         # Normalize inputs
-         nrm_se: List[SearchEngine] = list(SearchEngine)
+         # Normalize inputs - convert strings to SearchEngineName enum values
+         nrm_search_engines = list(SearchEngineName)
          if search_engines:
-             nrm_se = [
-                 SearchEngine(se) if isinstance(se, str) else se for se in search_engines
+             nrm_search_engines = [
+                 SearchEngineName(se) if isinstance(se, str) else se
+                 for se in search_engines
              ]

          # Run the pipeline by calling the orchestrator's run method
+         async def _run(*args, **kwargs):
+             async with self:
+                 return await super(FraudCrawlerClient, self).run(*args, **kwargs)
+
          asyncio.run(
-             super().run(
+             _run(
                  search_term=search_term,
-                 search_engines=nrm_se,
+                 search_engines=nrm_search_engines,
                  language=language,
                  location=location,
                  deepness=deepness,
                  prompts=prompts,
                  marketplaces=marketplaces,
                  excluded_urls=excluded_urls,
+                 previously_collected_urls=previously_collected_urls,
              )
          )

@@ -1,9 +1,10 @@
  from abc import ABC, abstractmethod
  import asyncio
  import logging
- from typing import Dict, List, cast
+ from typing import cast, Dict, List, Self

  from bs4 import BeautifulSoup
+ import httpx

  from fraudcrawler.settings import (
      PROCESSOR_DEFAULT_MODEL,
@@ -20,13 +21,14 @@ from fraudcrawler.base.base import (
      Location,
      Prompt,
      ProductItem,
+     HttpxAsyncClient,
  )
  from fraudcrawler import (
-     SerpApi,
-     SearchEngine,
+     Search,
+     SearchEngineName,
      Enricher,
      URLCollector,
-     ZyteApi,
+     ZyteAPI,
      Processor,
  )

@@ -60,9 +62,18 @@ class Orchestrator(ABC):
          n_serp_wkrs: int = DEFAULT_N_SERP_WKRS,
          n_zyte_wkrs: int = DEFAULT_N_ZYTE_WKRS,
          n_proc_wkrs: int = DEFAULT_N_PROC_WKRS,
+         # Configure a custom httpx client.
+         # We provide a `HttpxAsyncClient` class that you can pass
+         # to retain the default values we use for `limits`, `timeout` & `follow_redirects`.
+         http_client: httpx.AsyncClient | None = None,
      ):
          """Initializes the orchestrator with the given settings.

+         NOTE:
+             The class:`Orchestrator` must be used as context manager as follows:
+                 async with Orchestrator(...) as orchestrator:
+                     await orchestrator.run()
+
          Args:
              serpapi_key: The API key for SERP API.
              dataforseo_user: The user for DataForSEO.
@@ -73,16 +84,16 @@ class Orchestrator(ABC):
              n_serp_wkrs: Number of async workers for serp (optional).
              n_zyte_wkrs: Number of async workers for zyte (optional).
              n_proc_wkrs: Number of async workers for the processor (optional).
+             http_client: An httpx.AsyncClient to use for the async requests (optional).
          """
-         # Setup the clients
-         self._serpapi = SerpApi(api_key=serpapi_key)
-         self._enricher = Enricher(user=dataforseo_user, pwd=dataforseo_pwd)
-         self._url_collector = URLCollector()
-         self._zyteapi = ZyteApi(api_key=zyteapi_key)
-         self._processor = Processor(
-             api_key=openaiapi_key,
-             model=openai_model,
-         )
+
+         # Store the variables for setting up the clients
+         self._serpapi_key = serpapi_key
+         self._dataforseo_user = dataforseo_user
+         self._dataforseo_pwd = dataforseo_pwd
+         self._zyteapi_key = zyteapi_key
+         self._openaiapi_key = openaiapi_key
+         self._openai_model = openai_model

          # Setup the async framework
          self._n_serp_wkrs = n_serp_wkrs
@@ -91,12 +102,50 @@ class Orchestrator(ABC):
          self._queues: Dict[str, asyncio.Queue] | None = None
          self._workers: Dict[str, List[asyncio.Task] | asyncio.Task] | None = None

+         # Setup the httpx client
+         self._http_client = http_client
+         self._owns_http_client = http_client is None
+
+     async def __aenter__(self) -> Self:
+         """Creates and starts an httpx.AsyncClient if not provided."""
+         if self._http_client is None:
+             logger.debug("Creating a new httpx.AsyncClient owned by the orchestrator")
+             self._http_client = HttpxAsyncClient()
+             self._owns_http_client = True
+
+         # Setup the clients
+         self._search = Search(
+             http_client=self._http_client, serpapi_key=self._serpapi_key
+         )
+         self._enricher = Enricher(
+             http_client=self._http_client,
+             user=self._dataforseo_user,
+             pwd=self._dataforseo_pwd,
+         )
+         self._url_collector = URLCollector()
+         self._zyteapi = ZyteAPI(
+             http_client=self._http_client, api_key=self._zyteapi_key
+         )
+         self._processor = Processor(
+             http_client=self._http_client,
+             api_key=self._openaiapi_key,
+             model=self._openai_model,
+         )
+         return self
+
+     async def __aexit__(self, *args, **kwargs) -> None:
+         """Closes the httpx.AsyncClient if it was created by this orchestrator."""
+         if self._owns_http_client and self._http_client is not None:
+             logger.debug("Closing the httpx.AsyncClient owned by the orchestrator")
+             await self._http_client.aclose()
+             self._http_client = None
+
      async def _serp_execute(
          self,
          queue_in: asyncio.Queue[dict | None],
          queue_out: asyncio.Queue[ProductItem | None],
      ) -> None:
-         """Collects the SerpApi search setups from the queue_in, executes the search, filters the results (country_code) and puts them into queue_out.
+         """Collects the search setups from the queue_in, executes the search, filters the results and puts them into queue_out.

          Args:
              queue_in: The input queue containing the search parameters.
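
The __aenter__/__aexit__ pair above establishes a simple ownership rule: a client passed by the caller is only borrowed, while a missing client is created as an HttpxAsyncClient and closed by the orchestrator itself. A toy stand-in that reproduces just that rule so the behaviour can be checked without API keys; OwnsOrBorrowsClient is ours, not part of the package:

import asyncio
import httpx

from fraudcrawler import HttpxAsyncClient


class OwnsOrBorrowsClient:
    """Toy reproduction of the ownership logic in Orchestrator.__aenter__/__aexit__."""

    def __init__(self, http_client: httpx.AsyncClient | None = None) -> None:
        self._http_client = http_client
        self._owns_http_client = http_client is None

    async def __aenter__(self):
        if self._http_client is None:
            self._http_client = HttpxAsyncClient()
            self._owns_http_client = True
        return self

    async def __aexit__(self, *args) -> None:
        if self._owns_http_client and self._http_client is not None:
            await self._http_client.aclose()
            self._http_client = None


async def main() -> None:
    # Borrowed client: the caller stays responsible for closing it.
    external = HttpxAsyncClient()
    async with OwnsOrBorrowsClient(http_client=external):
        pass
    assert not external.is_closed
    await external.aclose()

    # Owned client: created in __aenter__, closed again in __aexit__.
    async with OwnsOrBorrowsClient() as ctx:
        assert ctx._http_client is not None
    assert ctx._http_client is None


asyncio.run(main())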
@@ -110,23 +159,30 @@

          try:
              search_term_type = item.pop("search_term_type")
-             results = await self._serpapi.apply(**item)
+             # The search_engines are already SearchEngineName enum values
+             search_engines = item.pop("search_engines")
+
+             results = await self._search.apply(
+                 **item, search_engines=search_engines
+             )
+
              logger.debug(
-                 f"SERP API search for {item['search_term']} returned {len(results)} results"
+                 f"Search for {item['search_term']} returned {len(results)} results"
              )
              for res in results:
                  product = ProductItem(
                      search_term=item["search_term"],
                      search_term_type=search_term_type,
                      url=res.url,
-                     marketplace_name=res.marketplace_name,
+                     url_resolved=res.url,  # Set initial value, will be updated by Zyte
+                     search_engine_name=res.search_engine_name,
                      domain=res.domain,
                      filtered=res.filtered,
                      filtered_at_stage=res.filtered_at_stage,
                  )
                  await queue_out.put(product)
          except Exception as e:
-             logger.error(f"Error executing SERP API search: {e}")
+             logger.error(f"Error executing search: {e}")
          queue_in.task_done()

      async def _collect_url(
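
The hunk above only shows the middle of the serp worker; the surrounding loop follows the usual asyncio queue pattern (get an item, process it, call task_done, stop on a None sentinel). A self-contained sketch of that pattern with the real Search.apply call replaced by a stub:

import asyncio


async def serp_worker(queue_in: asyncio.Queue, queue_out: asyncio.Queue) -> None:
    # Same queue_in/queue_out shape as _serp_execute; the "search" here is a stub.
    while True:
        item = await queue_in.get()
        if item is None:
            queue_in.task_done()
            break
        try:
            results = [f"result-for-{item['search_term']}"]  # stand-in for await self._search.apply(...)
            for res in results:
                await queue_out.put(res)
        except Exception as e:
            print(f"Error executing search: {e}")
        queue_in.task_done()


async def main() -> None:
    q_in: asyncio.Queue = asyncio.Queue()
    q_out: asyncio.Queue = asyncio.Queue()
    await q_in.put({"search_term": "vitamin d"})
    await q_in.put(None)  # sentinel
    await serp_worker(q_in, q_out)
    print(q_out.get_nowait())


asyncio.run(main())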
@@ -191,10 +247,22 @@
              if not product.filtered:
                  try:
                      # Fetch the product details from Zyte API
-                     details = await self._zyteapi.get_details(url=product.url)
+                     details = await self._zyteapi.details(url=product.url)
+                     url_resolved = self._zyteapi.extract_url_resolved(details=details)
+                     if url_resolved:
+                         product.url_resolved = url_resolved
                      product.product_name = self._zyteapi.extract_product_name(
                          details=details
                      )
+
+                     # If the resolved URL is different from the original URL, we also need to update the domain as
+                     # otherwise the unresolved domain will be shown, for example for unresolved domain toppreis.ch but resolved digitec.ch
+                     if url_resolved and url_resolved != product.url:
+                         logger.debug(
+                             f"URL resolved for {product.url} is {url_resolved}"
+                         )
+                         product.domain = self._search._get_domain(url_resolved)
+
                      product.product_price = self._zyteapi.extract_product_price(
                          details=details
                      )
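
The resolution step above keeps domain consistent with the page Zyte actually landed on (the diff's own example: a toppreis.ch link resolving to digitec.ch). A hedged sketch of that rule in isolation, using DomainUtils directly because the Search class behind self._search._get_domain is not shown in this diff; the URLs are illustrative:

from fraudcrawler.base.base import DomainUtils

url = "https://www.toppreis.ch/p/12345"                 # URL as returned by the search
url_resolved = "https://www.digitec.ch/de/s1/p/67890"   # URL reported by Zyte after redirects

domain = DomainUtils()._get_domain(url)
if url_resolved and url_resolved != url:
    # Same rule as in the orchestrator: recompute the domain from the resolved URL,
    # otherwise the unresolved domain (toppreis.ch) would be shown instead of digitec.ch.
    domain = DomainUtils()._get_domain(url_resolved)

print(domain)  # digitec.ch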
@@ -362,7 +430,7 @@
          queue: asyncio.Queue[dict | None],
          search_term: str,
          search_term_type: str,
-         search_engines: List[SearchEngine],
+         search_engines: List[SearchEngineName],
          language: Language,
          location: Location,
          num_results: int,
@@ -387,7 +455,7 @@
          self,
          queue: asyncio.Queue[dict | None],
          search_term: str,
-         search_engines: List[SearchEngine],
+         search_engines: List[SearchEngineName],
          language: Language,
          location: Location,
          deepness: Deepness,
@@ -417,7 +485,7 @@
          if enrichment:
              # Call DataForSEO to get additional terms
              n_terms = enrichment.additional_terms
-             terms = await self._enricher.apply(
+             terms = await self._enricher.enrich(
                  search_term=search_term,
                  language=language,
                  location=location,
@@ -437,7 +505,7 @@
      async def run(
          self,
          search_term: str,
-         search_engines: List[SearchEngine],
+         search_engines: List[SearchEngineName],
          language: Language,
          location: Location,
          deepness: Deepness,
@@ -450,7 +518,7 @@

          Args:
              search_term: The search term for the query.
-             search_engines: The list of search engines to use for the SerpAPI query.
+             search_engines: The list of search engines to use for the search query.
              language: The language to use for the query.
              location: The location to use for the query.
              deepness: The search depth and enrichment details.
@@ -459,10 +527,17 @@
              excluded_urls: The URLs to exclude from the search.
              previously_collected_urls: The urls that have been collected previously and are ignored.
          """
-
          # ---------------------------
          # INITIAL SETUP
          # ---------------------------
+         # Ensure we have at least one search engine
+         if not search_engines:
+             logger.warning(
+                 "No search engines specified, using all available search engines"
+             )
+             search_engines = list(SearchEngineName)
+
+         # Handle previously collected URLs
          if previously_collected_urls:
              self._url_collector.collected_previously = set(previously_collected_urls)

@@ -614,4 +689,7 @@
          finally:
              await res_queue.join()

+         # ---------------------------
+         # CLOSING PIPELINE
+         # ---------------------------
          logger.info("Pipeline concluded; async framework is closed")
@@ -1,4 +1,4 @@
- from aiohttp.web_exceptions import HTTPException
+ from httpx import HTTPStatusError
  from tenacity import (
      AsyncRetrying,
      retry_if_exception,
@@ -17,7 +17,10 @@ from fraudcrawler.settings import (


  def _is_retryable_exception(err: BaseException) -> bool:
-     if isinstance(err, HTTPException) and err.status_code in RETRY_SKIP_IF_CODE:
+     if (
+         isinstance(err, HTTPStatusError)
+         and err.response.status_code in RETRY_SKIP_IF_CODE
+     ):
          return False
      return True
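
With the switch from aiohttp to httpx, a non-retryable failure is now recognized as an httpx.HTTPStatusError (raised by response.raise_for_status()), whose status code lives on err.response. A hedged sketch of how such a predicate plugs into tenacity; SKIP_CODES stands in for the package's RETRY_SKIP_IF_CODE setting, whose actual values this diff does not show:

import httpx
from tenacity import AsyncRetrying, retry_if_exception, stop_after_attempt

SKIP_CODES = {401, 403, 404}  # placeholder for RETRY_SKIP_IF_CODE


def is_retryable(err: BaseException) -> bool:
    # Same shape as _is_retryable_exception above: a final HTTP status is not retried.
    if isinstance(err, httpx.HTTPStatusError) and err.response.status_code in SKIP_CODES:
        return False
    return True


async def fetch(client: httpx.AsyncClient, url: str) -> httpx.Response:
    async for attempt in AsyncRetrying(
        retry=retry_if_exception(is_retryable),
        stop=stop_after_attempt(3),
        reraise=True,
    ):
        with attempt:
            resp = await client.get(url)
            resp.raise_for_status()  # raises httpx.HTTPStatusError on 4xx/5xx
            return resp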
 
@@ -54,17 +54,17 @@ def search(search_term: str):

      # deepness.enrichment = Enrichment(additional_terms=10, additional_urls_per_term=20)

-     # # Optional: Add MARKETPLACES and EXCLUDED_URLS
-     # from fraudcrawler import Host
+     # Optional: Add MARKETPLACES and EXCLUDED_URLS
+     from fraudcrawler import Host

      # marketplaces = [
      # Host(name="International", domains="zavamed.com,apomeds.com"),
-     # Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
-     # ]
-     # excluded_urls = [
-     # Host(name="Digitec", domains="digitec.ch"),
-     # Host(name="Brack", domains="brack.ch"),
+     # # Host(name="National", domains="netdoktor.ch, nobelpharma.ch")
      # ]
+     excluded_urls = [
+         Host(name="Digitec", domains="digitec.ch"),
+         Host(name="Brack", domains="brack.ch"),
+     ]

      # Execute the pipeline
      client.execute(
@@ -74,7 +74,7 @@ def search(search_term: str):
          deepness=deepness,
          prompts=prompts,
          # marketplaces=marketplaces,
-         # excluded_urls=excluded_urls,
+         excluded_urls=excluded_urls,
      )

      # Show results
@@ -97,4 +97,4 @@ def search(search_term: str):


  if __name__ == "__main__":
-     search(search_term = "Medion Kühlbox MD 37454")
+     search(search_term='Liebherr "TP1410"')
@@ -1,5 +1,6 @@
  import logging

+ import httpx
  from openai import AsyncOpenAI
  from tenacity import RetryCallState

@@ -21,6 +22,7 @@ class Processor:

      def __init__(
          self,
+         http_client: httpx.AsyncClient,
          api_key: str,
          model: str,
          default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
@@ -29,12 +31,13 @@
          """Initializes the Processor.

          Args:
+             http_client: An httpx.AsyncClient to use for the async requests.
              api_key: The OpenAI API key.
              model: The OpenAI model to use.
              default_if_missing: The default classification to return if error occurs.
              empty_token_count: The default value to return as tokensif the classification is empty.
          """
-         self._client = AsyncOpenAI(api_key=api_key)
+         self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
          self._model = model
          self._error_response = ClassificationResult(
              result=default_if_missing,
@@ -59,7 +62,7 @@
                      )
                  )
              else:
-                 logger.error(
+                 logger.warning(
                      f'Field "{field}" is missing in ProductItem with url="{product.url}"'
                  )
          return "\n\n".join(details)
@@ -101,9 +104,10 @@
              ],
              **kwargs,
          )
-         content = response.choices[0].message.content
-         if not content:
-             raise ValueError("Empty response from OpenAI API")
+         if not response or not (content := response.choices[0].message.content):
+             raise ValueError(
+                 f'Error calling OpenAI API or empty response="{response}".'
+             )

          # Convert the content to an integer
          try:
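
The Processor now receives the shared httpx client and forwards it to AsyncOpenAI(http_client=...), so classification calls reuse the same connection pool, limits and timeouts as the rest of the pipeline. A hedged construction sketch; the API key and model name are illustrative, and in the pipeline this wiring is done by Orchestrator.__aenter__ rather than by hand:

import asyncio

from fraudcrawler import HttpxAsyncClient, Processor


async def main() -> None:
    async with HttpxAsyncClient() as http_client:
        processor = Processor(
            http_client=http_client,  # forwarded to AsyncOpenAI(http_client=...)
            api_key="sk-...",         # illustrative; normally supplied via settings/env
            model="gpt-4o-mini",      # illustrative model name
        )
        # Classification calls would go here; the shared client is closed when
        # the async with block exits.
        assert processor is not None


asyncio.run(main())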