fraudcrawler 0.5.0__py3-none-any.whl → 0.7.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fraudcrawler/processing/processor.py
@@ -1,199 +0,0 @@
- import logging
-
- import httpx
- from openai import AsyncOpenAI
- from tenacity import RetryCallState
-
- from fraudcrawler.base.base import ProductItem, Prompt, ClassificationResult
- from fraudcrawler.base.retry import get_async_retry
- from fraudcrawler.settings import (
-     PROCESSOR_PRODUCT_DETAILS_TEMPLATE,
-     PROCESSOR_USER_PROMPT_TEMPLATE,
-     PROCESSOR_DEFAULT_IF_MISSING,
-     PROCESSOR_EMPTY_TOKEN_COUNT,
- )
-
-
- logger = logging.getLogger(__name__)
-
-
- class Processor:
-     """Processes product data for classification based on a prompt configuration."""
-
-     def __init__(
-         self,
-         http_client: httpx.AsyncClient,
-         api_key: str,
-         model: str,
-         default_if_missing: int = PROCESSOR_DEFAULT_IF_MISSING,
-         empty_token_count: int = PROCESSOR_EMPTY_TOKEN_COUNT,
-     ):
-         """Initializes the Processor.
-
-         Args:
-             http_client: An httpx.AsyncClient to use for the async requests.
-             api_key: The OpenAI API key.
-             model: The OpenAI model to use.
-             default_if_missing: The default classification to return if an error occurs.
-             empty_token_count: The default value to return as tokens if the classification is empty.
-         """
-         self._client = AsyncOpenAI(http_client=http_client, api_key=api_key)
-         self._model = model
-         self._error_response = ClassificationResult(
-             result=default_if_missing,
-             input_tokens=empty_token_count,
-             output_tokens=empty_token_count,
-         )
-
-     @staticmethod
-     def _get_product_details(product: ProductItem, prompt: Prompt) -> str:
-         """Extracts product details based on the prompt configuration.
-
-         Args:
-             product: The product item to extract details from.
-             prompt: The prompt configuration containing field names.
-         """
-         details = []
-         for field in prompt.product_item_fields:
-             if value := getattr(product, field, None):
-                 details.append(
-                     PROCESSOR_PRODUCT_DETAILS_TEMPLATE.format(
-                         field_name=field, field_value=value
-                     )
-                 )
-             else:
-                 logger.warning(
-                     f'Field "{field}" is missing in ProductItem with url="{product.url}"'
-                 )
-         return "\n\n".join(details)
-
-     @staticmethod
-     def _log_before(url: str, prompt: Prompt, retry_state: RetryCallState) -> None:
-         """Context-aware logging before the request is made."""
-         if retry_state:
-             logger.debug(
-                 f"Classifying product with url={url} using prompt={prompt} (Attempt {retry_state.attempt_number})."
-             )
-         else:
-             logger.debug(f"retry_state is {retry_state}; not logging before.")
-
-     @staticmethod
-     def _log_before_sleep(
-         url: str, prompt: Prompt, retry_state: RetryCallState
-     ) -> None:
-         """Context-aware logging before sleeping after a failed request."""
-         if retry_state and retry_state.outcome:
-             logger.warning(
-                 f"Attempt {retry_state.attempt_number} of classifying product with url={url} using prompt={prompt} "
-                 f"failed with error: {retry_state.outcome.exception()}. "
-                 f"Retrying in {retry_state.upcoming_sleep:.0f} seconds."
-             )
-
-     async def _call_openai_api(
-         self,
-         system_prompt: str,
-         user_prompt: str,
-         **kwargs,
-     ) -> ClassificationResult:
-         """Calls the OpenAI API with the given user prompt."""
-         response = await self._client.chat.completions.create(
-             model=self._model,
-             messages=[
-                 {"role": "system", "content": system_prompt},
-                 {"role": "user", "content": user_prompt},
-             ],
-             **kwargs,
-         )
-         if not response or not (content := response.choices[0].message.content):
-             raise ValueError(
-                 f'Error calling OpenAI API or empty response="{response}".'
-             )
-
-         # Convert the content to an integer
-         try:
-             content = int(content.strip())
-         except Exception as e:
-             msg = f"Failed to convert OpenAI response '{content}' to integer: {e}"
-             logger.error(msg)
-             raise ValueError(msg)
-
-         # For tracking consumption we also return the tokens used
-         classification = ClassificationResult(
-             result=content,
-             input_tokens=response.usage.prompt_tokens,
-             output_tokens=response.usage.completion_tokens,
-         )
-
-         return classification
-
-     async def classify(
-         self,
-         product: ProductItem,
-         prompt: Prompt,
-     ) -> ClassificationResult:
-         """A generic classification method that classifies a product based on a prompt object and returns
-         the classification, input tokens, and output tokens.
-
-         Args:
-             product: The product item to classify.
-             prompt: The prompt to use for classification.
-
-         Note:
-             This method returns `PROCESSOR_DEFAULT_IF_MISSING` if:
-             - product_details is empty
-             - an error occurs during the API call
-             - the response isn't in allowed_classes.
-         """
-         url = product.url
-
-         # Form the product details from the ProductItem
-         product_details = self._get_product_details(product=product, prompt=prompt)
-         if not product_details:
-             logger.warning("Missing required product_details for classification.")
-             return self._error_response
-
-         # Prepare the user prompt
-         user_prompt = PROCESSOR_USER_PROMPT_TEMPLATE.format(
-             product_details=product_details,
-         )
-
-         # Call the OpenAI API
-         try:
-             logger.debug(
-                 f"Classifying product with url={url} using prompt={prompt.name} and user_prompt={user_prompt}."
-             )
-             # Perform the request and retry if necessary. There is some context-aware logging:
-             # - `before`: before the request is made (or before retrying)
-             # - `before_sleep`: after a failed request, before sleeping
-             retry = get_async_retry()
-             retry.before = lambda retry_state: self._log_before(
-                 url=url, prompt=prompt, retry_state=retry_state
-             )
-             retry.before_sleep = lambda retry_state: self._log_before_sleep(
-                 url=url, prompt=prompt, retry_state=retry_state
-             )
-             async for attempt in retry:
-                 with attempt:
-                     classification = await self._call_openai_api(
-                         system_prompt=prompt.system_prompt,
-                         user_prompt=user_prompt,
-                         max_tokens=1,
-                     )
-
-             # Enforce that the classification is in the allowed classes
-             if classification.result not in prompt.allowed_classes:
-                 logger.warning(
-                     f"Classification '{classification.result}' not in allowed classes {prompt.allowed_classes}"
-                 )
-                 return self._error_response
-
-             logger.info(
-                 f'Classification for url="{url}" (prompt={prompt.name}): {classification.result} and total tokens used: {classification.input_tokens + classification.output_tokens}'
-             )
-             return classification
-
-         except Exception as e:
-             logger.error(
-                 f'Error classifying product at url="{url}" with prompt "{prompt.name}": {e}'
-             )
-             return self._error_response
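
For orientation (not part of the package contents): a minimal sketch of how the `Processor` above is driven, assuming `ProductItem` and `Prompt` accept the fields referenced in the code (`url`, `product_item_fields`, `system_prompt`, `allowed_classes`). Their actual definitions live in `fraudcrawler/base/base.py` and are not shown in this diff, so the constructor arguments and the model name here are illustrative.

```python
import asyncio

import httpx

from fraudcrawler.base.base import ProductItem, Prompt
from fraudcrawler.processing.processor import Processor


async def main() -> None:
    # One AsyncClient per application, shared with the Processor.
    async with httpx.AsyncClient() as http_client:
        processor = Processor(
            http_client=http_client,
            api_key="sk-...",  # your OpenAI API key
            model="gpt-4o-mini",  # illustrative model name
        )
        # Hypothetical instances; the real constructor fields live in base.py.
        product = ProductItem(url="https://example.com/some-product")
        prompt = Prompt(
            name="relevance",
            system_prompt="Classify the product as 1 (relevant) or 0 (not relevant). Respond only with 1 or 0.",
            allowed_classes=[0, 1],
            product_item_fields=["title", "description"],  # assumed field names
        )
        result = await processor.classify(product=product, prompt=prompt)
        print(result.result, result.input_tokens, result.output_tokens)


asyncio.run(main())
```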
fraudcrawler-0.5.0.dist-info/METADATA
@@ -1,167 +0,0 @@
- Metadata-Version: 2.1
- Name: fraudcrawler
- Version: 0.5.0
- Summary: Intelligent Market Monitoring
- Home-page: https://github.com/open-veanu/fraudcrawler
- License: MIT
- Author: Domingo Bertus
- Author-email: hello@veanu.ch
- Requires-Python: >=3.11,<4.0
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.11
- Classifier: Programming Language :: Python :: 3.12
- Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
- Requires-Dist: httpx (>=0.28.1,<0.29.0)
- Requires-Dist: openai (>=1.68.2,<2.0.0)
- Requires-Dist: pandas (>=2.2.3,<3.0.0)
- Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
- Requires-Dist: requests (>=2.32.3,<3.0.0)
- Requires-Dist: tenacity (>=9.1.2,<10.0.0)
- Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
- Description-Content-Type: text/markdown
-
- # open-veanu/fraudcrawler
- Intelligent Market Monitoring
-
- The pipeline for monitoring the market has the following main steps:
- 1. search for a given term using SerpAPI
- 2. get product information using ZyteAPI
- 3. assess the relevance of the found products using the OpenAI API
-
- ## Installation
- ```bash
- python3.11 -m venv .venv
- source .venv/bin/activate
- pip install fraudcrawler
- ```
-
- ## Usage
- ### `.env` file
- Make sure to create a `.env` file with the necessary API keys and credentials (cf. the `.env.example` file).
-
- ### Run demo pipeline
- ```bash
- python -m fraudcrawler.launch_demo_pipeline
- ```
-
- ### Customize the pipeline
- Start by initializing the client:
- ```python
- from fraudcrawler import FraudCrawlerClient
-
- # Initialize the client
- client = FraudCrawlerClient()
- ```
-
- For setting up the search, we need five main objects.
-
- #### `search_term: str`
- The search term for the query (similar to search terms used within major search providers).
-
- #### `language: Language`
- The language used in SerpAPI (the 'hl' parameter), as well as for the optional search term enrichment (e.g. finding similar and related search terms). `language=Language('German')` creates an object with a language name and a language code: `Language(name='German', code='de')`.
-
- #### `location: Location`
- The location used in SerpAPI (the 'gl' parameter). `location=Location('Switzerland')` creates an object with a location name and a location code: `Location(name='Switzerland', code='ch')`.
-
- #### `deepness: Deepness`
- Defines the search depth with the number of results to retrieve and optional enrichment parameters.
-
- #### `prompts: List[Prompt]`
- The list of prompts used to classify a given product with (multiple) LLM calls. Each prompt object has a `name`, a `context` (used for defining the user prompt), a `system_prompt` (for defining the classification task), and `allowed_classes` (a list of possible classes).
-
- ```python
- from fraudcrawler import Language, Location, Deepness, Prompt
-
- # Set up the search
- search_term = "sildenafil"
- language = Language(name="German")
- location = Location(name="Switzerland")
- deepness = Deepness(num_results=50)
- prompts = [
-     Prompt(
-         name="relevance",
-         system_prompt=(
-             "You are a helpful and intelligent assistant. Your task is to classify any given product "
-             "as either relevant (1) or not relevant (0), strictly based on the context and product details provided by the user. "
-             "You must consider all aspects of the given context and make a binary decision accordingly. "
-             "If the product aligns with the user's needs, classify it as 1 (relevant); otherwise, classify it as 0 (not relevant). "
-             "Respond only with the number 1 or 0."
-         ),
-         allowed_classes=[0, 1],
-     )
- ]
- ```
-
- (Optional) Add search term enrichment. This will find related search terms (in a given language) and search for these as well.
- ```python
- from fraudcrawler import Enrichment
-
- deepness.enrichment = Enrichment(
-     additional_terms=5,
-     additional_urls_per_term=10,
- )
- ```
-
- (Optional) Add marketplaces where you explicitly want to look (this focuses your search, like the `site:` parameter in a Google search):
- ```python
- from fraudcrawler import Host
-
- marketplaces = [
-     Host(name="International", domains="zavamed.com,apomeds.com"),
-     Host(name="National", domains="netdoktor.ch,nobelpharma.ch"),
- ]
- ```
-
- (Optional) Exclude URLs where you don't want to find products:
- ```python
- excluded_urls = [
-     Host(name="Compendium", domains="compendium.ch"),
- ]
- ```
-
- (Optional) Exclude previously collected URLs (to save credits):
- ```python
- previously_collected_urls = [
-     "https://pharmaciedelabateliere.ch/shop/sante/douleurs-inflammations/dafalgan-cpr-eff-500-mg-16-pce/",
-     "https://eiche.ch/product/schmerzmittel-52cd81d5d206a/dafalgan-brausetabletten-1336653",
- ]
- ```
-
- And finally, run the pipeline:
- ```python
- # Execute the pipeline
- client.execute(
-     search_term=search_term,
-     language=language,
-     location=location,
-     deepness=deepness,
-     prompts=prompts,
-     # marketplaces=marketplaces,  # Uncomment this to use marketplaces
-     # excluded_urls=excluded_urls,  # Uncomment this to use excluded_urls
-     # previously_collected_urls=previously_collected_urls,  # Uncomment this to use previously_collected_urls
- )
- ```
- This creates a file with the name pattern `<search_term>_<language.code>_<location.code>_<datetime[%Y%m%d%H%M%S]>.csv` inside the folder `data/results/`.
-
- Once the pipeline has terminated, the results can be loaded and examined as follows:
- ```python
- df = client.load_results()
- print(df.head(n=10))
- ```
-
- If the client has been used to run multiple pipelines, an overview of the available results (for a given instance of `FraudCrawlerClient`) can be obtained with
- ```python
- client.print_available_results()
- ```
-
- ## Contributing
- See `CONTRIBUTING.md`.
-
- ### Async Setup
- The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
-
- This behavior is enabled through an asynchronous pipeline setup. The three main steps, `SerpAPI`, `ZyteAPI`, and `Processor`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency; we provide a `HttpxAsyncClient` class that you can pass to the pipeline components for this purpose. For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
-
- The following image provides a schematic representation of the package's async setup.
- ![Async Setup](https://github.com/open-veanu/fraudcrawler/raw/master/docs/assets/images/Fraudcrawler_Async_Setup.svg)
-
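
The Async Setup section above recommends sharing a single `httpx.AsyncClient` across all pipeline stages. Here is a minimal sketch of that pattern (not part of the package contents), assuming the search and scraping wrappers accept an `http_client` the same way the `Processor` shown earlier does; their constructor signatures are not part of this diff.

```python
import asyncio

import httpx

from fraudcrawler.processing.processor import Processor


async def run() -> None:
    # One AsyncClient per application: every pipeline stage reuses the
    # same connection pool instead of opening its own client.
    async with httpx.AsyncClient(timeout=30.0) as http_client:
        processor = Processor(
            http_client=http_client,
            api_key="sk-...",
            model="gpt-4o-mini",  # illustrative model name
        )
        # The SerpAPI/ZyteAPI wrappers (names assumed) would receive the same
        # http_client, letting the Orchestrator interleave products across
        # stages concurrently.
        ...


asyncio.run(run())
```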
fraudcrawler-0.5.0.dist-info/RECORD
@@ -1,22 +0,0 @@
- fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
- fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/base/base.py,sha256=suQMnvLIsZO_R0eHZKDWS4u9qnd1ryzPhjGlwcaMD5A,7295
- fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
- fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
- fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
- fraudcrawler/base/orchestrator.py,sha256=AKEETrYwKbMy_6YgTdgc6L-VA1iHYOtj3wIqEN3ngO4,26990
- fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
- fraudcrawler/launch_demo_pipeline.py,sha256=j5lu8lLl8QrkVU1MJH25uKtyYk_6lBSeoouCo30aRXg,4634
- fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
- fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
- fraudcrawler/scraping/search.py,sha256=nHMYaSkq9o6Hr4yUDEPguj8IHVcOpws3_XWiAbCVgLg,24062
- fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
- fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
- fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
- fraudcrawler-0.5.0.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
- fraudcrawler-0.5.0.dist-info/METADATA,sha256=H9aq_euzQMD8Ag3gbo3GIrfC4eVl-gGahD_DieQ1oow,6642
- fraudcrawler-0.5.0.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- fraudcrawler-0.5.0.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
- fraudcrawler-0.5.0.dist-info/RECORD,,
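
For readers unfamiliar with the RECORD format above: each row has the form `path,sha256=<digest>,size`, where `<digest>` is the unpadded URL-safe base64 encoding of the file's SHA-256 hash (the RECORD file itself is listed with an empty hash and size). A short sketch for verifying a row against an unpacked wheel:

```python
import base64
import hashlib
from pathlib import Path


def record_digest(path: Path) -> str:
    """Compute the RECORD-style digest: unpadded URL-safe base64 of SHA-256."""
    digest = hashlib.sha256(path.read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Example: compare against the row for fraudcrawler/settings.py above.
print(record_digest(Path("fraudcrawler/settings.py")))
```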