fraudcrawler 0.5.0-py3-none-any.whl → 0.7.22-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fraudcrawler/scraping/url.py CHANGED
@@ -3,6 +3,7 @@ from typing import List, Set, Tuple
 from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
 
 from fraudcrawler.settings import KNOWN_TRACKERS
+from fraudcrawler.base.base import ProductItem
 
 logger = logging.getLogger(__name__)
 
@@ -11,11 +12,19 @@ class URLCollector:
     """A class to collect and de-duplicate URLs."""
 
     def __init__(self):
-        self.collected_currently: Set[str] = set()
-        self.collected_previously: Set[str] = set()
+        self._collected_currently: Set[str] = set()
+        self._collected_previously: Set[str] = set()
+
+    def add_previously_collected_urls(self, urls: List[str]) -> None:
+        """Add a set of previously collected URLs to the internal state.
+
+        Args:
+            urls: A set of URLs that have been collected in previous runs.
+        """
+        self._collected_previously.update(urls)
 
     @staticmethod
-    def remove_tracking_parameters(url: str) -> str:
+    def _remove_tracking_parameters(url: str) -> str:
         """Remove tracking parameters from URLs.
 
         Args:
@@ -55,3 +64,33 @@ class URLCollector:
             fragment=parsed_url.fragment,
         )
         return urlunparse(clean_url)
+
+    async def apply(self, product: ProductItem) -> ProductItem:
+        """Manages the collection and deduplication of ProductItems.
+
+        Args:
+            product: The product item to process.
+        """
+        logger.debug(f'Processing product with url="{product.url}"')
+
+        # Remove tracking parameters from the URL
+        url = self._remove_tracking_parameters(product.url)
+        product.url = url
+
+        # Deduplicate on current run
+        if url in self._collected_currently:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (current run deduplication)"
+            logger.debug(f"URL {url} already collected in current run")
+
+        # Deduplicate on previous runs coming from a db
+        elif url in self._collected_previously:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (previous run deduplication)"
+            logger.debug(f"URL {url} was already collected in previous run")
+
+        # Add to currently collected URLs
+        else:
+            self._collected_currently.add(url)
+
+        return product
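The new `URLCollector.apply()` method strips tracking parameters and flags duplicates instead of dropping them. A minimal usage sketch (illustrative only, not part of the diff; it assumes the module paths listed in the RECORD below, that `ProductItem` can be constructed with a `url` keyword, and that `utm_source` appears in `KNOWN_TRACKERS`):

```python
import asyncio

from fraudcrawler.base.base import ProductItem
from fraudcrawler.scraping.url import URLCollector


async def main() -> None:
    collector = URLCollector()
    # Seed URLs already stored from earlier runs (e.g. loaded from a database).
    collector.add_previously_collected_urls(["https://example.com/item?id=1"])

    # Hypothetical construction; the real ProductItem may require more fields.
    product = ProductItem(url="https://example.com/item?id=1&utm_source=newsletter")
    product = await collector.apply(product)

    # The tracker is stripped, the URL matches a previous run, and the item is
    # flagged rather than processed again.
    print(product.filtered, product.filtered_at_stage)


asyncio.run(main())
```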
fraudcrawler/scraping/zyte.py CHANGED
@@ -1,12 +1,13 @@
+from base64 import b64decode
 import logging
 from typing import List
-from base64 import b64decode
 
+from bs4 import BeautifulSoup
 import httpx
 from tenacity import RetryCallState
 
 from fraudcrawler.settings import ZYTE_DEFALUT_PROBABILITY_THRESHOLD
-from fraudcrawler.base.base import DomainUtils
+from fraudcrawler.base.base import DomainUtils, ProductItem
 from fraudcrawler.base.retry import get_async_retry
 
 logger = logging.getLogger(__name__)
@@ -61,77 +62,8 @@ class ZyteAPI(DomainUtils):
         else:
             logger.debug(f"retry_state is {retry_state}; not logging before_sleep.")
 
-    async def details(self, url: str) -> dict:
-        """Fetches product details for a single URL.
-
-        Args:
-            url: The URL to fetch product details from.
-
-        Returns:
-            A dictionary containing the product details, fields include:
-            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
-            {
-                "url": str,
-                "statusCode": str,
-                "product": {
-                    "name": str,
-                    "price": str,
-                    "mainImage": {"url": str},
-                    "images": [{"url": str}],
-                    "description": str,
-                    "metadata": {
-                        "probability": float,
-                    },
-                },
-                "httpResponseBody": base64
-            }
-        """
-        logger.info(f"Fetching product details by Zyte for URL {url}.")
-
-        # Perform the request and retry if necessary. There is some context aware logging:
-        # - `before`: before the request is made (and before retrying)
-        # - `before_sleep`: if the request fails before sleeping
-        retry = get_async_retry()
-        retry.before = lambda retry_state: self._log_before(
-            url=url, retry_state=retry_state
-        )
-        retry.before_sleep = lambda retry_state: self._log_before_sleep(
-            url=url, retry_state=retry_state
-        )
-        async for attempt in retry:
-            with attempt:
-                response = await self._http_client.post(
-                    url=self._endpoint,
-                    json={"url": url, **self._config},
-                    auth=(self._api_key, ""),  # API key as username, empty password
-                )
-                response.raise_for_status()
-
-        details = response.json()
-        return details
-
-    @staticmethod
-    def keep_product(
-        details: dict,
-        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
-    ) -> bool:
-        """Determines whether to keep the product based on the probability threshold.
-
-        Args:
-            details: A product details data dictionary.
-            threshold: The probability threshold used to filter the products.
-        """
-        try:
-            prob = float(details["product"]["metadata"]["probability"])
-        except KeyError:
-            logger.warning(
-                f"Product with url={details.get('url')} has no probability value - product is ignored"
-            )
-            return False
-        return prob > threshold
-
     @staticmethod
-    def extract_product_name(details: dict) -> str | None:
+    def _extract_product_name(details: dict) -> str | None:
         """Extracts the product name from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -144,7 +76,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("name")
 
     @staticmethod
-    def extract_url_resolved(details: dict) -> str | None:
+    def _extract_url_resolved(details: dict) -> str | None:
         """Extracts the resolved URL from the product data - this is automatically resolved by Zyte.
 
         The input argument is a dictionary of the following structure:
@@ -157,7 +89,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("url")
 
     @staticmethod
-    def extract_product_price(details: dict) -> str | None:
+    def _extract_product_price(details: dict) -> str | None:
         """Extracts the product price from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -170,7 +102,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("price")
 
     @staticmethod
-    def extract_product_description(details: dict) -> str | None:
+    def _extract_product_description(details: dict) -> str | None:
         """Extracts the product description from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -183,7 +115,7 @@ class ZyteAPI(DomainUtils):
         return details.get("product", {}).get("description")
 
     @staticmethod
-    def extract_image_urls(details: dict) -> List[str]:
+    def _extract_image_urls(details: dict) -> List[str]:
         """Extracts the images from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -206,7 +138,28 @@ class ZyteAPI(DomainUtils):
         return images
 
     @staticmethod
-    def extract_probability(details: dict) -> float:
+    def _extract_gtin(details: dict) -> str | None:
+        """Extracts the GTIN from the product data.
+
+        The input argument is a dictionary of the following structure:
+        {
+            "product": {
+                "gtin": [{"type": str, "value": str}],
+            }
+        }
+        """
+        product = details.get("product", {})
+        gtin_list = product.get("gtin", [])
+
+        if gtin_list and len(gtin_list) > 0:
+            # Extract the first GTIN value
+            gtin_value = gtin_list[0].get("value")
+            if gtin_value:
+                return gtin_value
+        return None
+
+    @staticmethod
+    def _extract_probability(details: dict) -> float:
         """Extracts the probability from the product data.
 
         The input argument is a dictionary of the following structure:
@@ -223,7 +176,7 @@ class ZyteAPI(DomainUtils):
         )
 
     @staticmethod
-    def extract_html(details: dict) -> str | None:
+    def _extract_html(details: dict) -> str | None:
         """Extracts the HTML from the Zyte API response.
 
         The input argument is a dictionary of the following structure:
@@ -238,7 +191,120 @@ class ZyteAPI(DomainUtils):
         if isinstance(encoded, str):
             decoded_bytes = b64decode(encoded)
 
-            # Convert bytes to string (assuming UTF-8 encoding)
-            decoded_string = decoded_bytes.decode("utf-8")
+            # Convert bytes to string
+            try:
+                decoded_string = decoded_bytes.decode("utf-8")
+            except UnicodeDecodeError:
+                decoded_string = decoded_bytes.decode("iso-8859-1")
             return decoded_string
         return None
+
+    def enrich_context(self, product: ProductItem, details: dict) -> ProductItem:
+        product.product_name = self._extract_product_name(details=details)
+
+        url_resolved = self._extract_url_resolved(details=details)
+        if url_resolved:
+            product.url_resolved = url_resolved
+
+        # If the resolved URL is different from the original URL, we also need to update the domain as
+        # otherwise the unresolved domain will be shown.
+        # For example: unresolved domain "toppreise.ch" vs. resolved domain "digitec.ch".
+        if url_resolved and url_resolved != product.url:
+            logger.debug(f"URL resolved for {product.url} is {url_resolved}")
+            product.domain = self._get_domain(url=url_resolved)
+
+        product.product_price = self._extract_product_price(details=details)
+        product.product_description = self._extract_product_description(details=details)
+        product.product_images = self._extract_image_urls(details=details)
+        product.product_gtin = self._extract_gtin(details=details)
+        product.probability = self._extract_probability(details=details)
+        product.html = self._extract_html(details=details)
+        if product.html:
+            soup = BeautifulSoup(product.html, "html.parser")
+            product.html_clean = soup.get_text(separator=" ", strip=True)
+
+        return product
+
+    @staticmethod
+    def keep_product(
+        details: dict,
+        threshold: float = ZYTE_DEFALUT_PROBABILITY_THRESHOLD,
+    ) -> bool:
+        """Determines whether to keep the product based on the probability threshold.
+
+        Args:
+            details: A product details data dictionary.
+            threshold: The probability threshold used to filter the products.
+        """
+        try:
+            prob = float(details["product"]["metadata"]["probability"])
+        except KeyError:
+            logger.warning(
+                f"Product with url={details.get('url')} has no probability value - product is ignored"
+            )
+            return False
+        return prob > threshold
+
+    async def unblock_url_content(self, url: str) -> bytes:
+        """Unblock the content of a URL using Zyte proxy mode.
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+        """
+        logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
+        details = await self.details(url)
+
+        if not details or "httpResponseBody" not in details:
+            raise httpx.HTTPError("No httpResponseBody in Zyte response")
+
+        return b64decode(details["httpResponseBody"])
+
+    async def details(self, url: str) -> dict:
+        """Fetches product details for a single URL.
+
+        Args:
+            url: The URL to fetch product details from.
+
+        Returns:
+            A dictionary containing the product details, fields include:
+            (c.f. https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/response/200/product)
+            {
+                "url": str,
+                "statusCode": str,
+                "product": {
+                    "name": str,
+                    "price": str,
+                    "mainImage": {"url": str},
+                    "images": [{"url": str}],
+                    "description": str,
+                    "gtin": [{"type": str, "value": str}],
+                    "metadata": {
+                        "probability": float,
+                    },
+                },
+                "httpResponseBody": base64
+            }
+        """
+        logger.info(f"Fetching product details by Zyte for URL {url}.")
+
+        # Perform the request and retry if necessary. There is some context aware logging:
+        # - `before`: before the request is made (and before retrying)
+        # - `before_sleep`: if the request fails before sleeping
+        retry = get_async_retry()
+        retry.before = lambda retry_state: self._log_before(
+            url=url, retry_state=retry_state
+        )
+        retry.before_sleep = lambda retry_state: self._log_before_sleep(
+            url=url, retry_state=retry_state
+        )
+        async for attempt in retry:
+            with attempt:
+                response = await self._http_client.post(
+                    url=self._endpoint,
+                    json={"url": url, **self._config},
+                    auth=(self._api_key, ""),  # API key as username, empty password
+                )
+                response.raise_for_status()
+
+        details = response.json()
+        return details
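Taken together, the relocated `details()` and `keep_product()` plus the new `enrich_context()` form the context-extraction step. A minimal sketch of how they might be chained (illustrative only, not part of the diff; `zyte` is an already constructed `ZyteAPI` instance whose constructor arguments are not shown here, and the module path follows the RECORD below):

```python
from fraudcrawler.base.base import ProductItem
from fraudcrawler.scraping.zyte import ZyteAPI


async def fetch_context(zyte: ZyteAPI, product: ProductItem) -> ProductItem | None:
    # Fetch the raw Zyte extraction for the product's URL (with retries).
    details = await zyte.details(product.url)

    # Drop items whose extraction probability does not exceed the default threshold.
    if not ZyteAPI.keep_product(details):
        return None

    # Copy name, resolved URL, price, description, images, GTIN, probability
    # and HTML onto the product, as enrich_context() does above.
    return zyte.enrich_context(product=product, details=details)
```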
fraudcrawler/settings.py CHANGED
@@ -14,12 +14,22 @@ RETRY_EXP_BASE = 4
 RETRY_JITTER = 1
 RETRY_SKIP_IF_CODE = [400, 401, 403]  # Skip retrying on these HTTP status codes
 
-# Serp settings
+# Search settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
 SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
+TOPPREISE_SEARCH_PATHS = {
+    "de": "produktsuche",
+    "fr": "chercher",
+    "default": "browse",
+}
+TOPPREISE_COMPARISON_PATHS = [
+    "preisvergleich",
+    "comparison-prix",
+    "price-comparison",
+]
 
 # URL De-duplication settings
 KNOWN_TRACKERS = [
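The new `TOPPREISE_SEARCH_PATHS` setting maps a language code to the toppreise.ch search path, with a `"default"` fallback. A hypothetical illustration of how the mapping could be used (not package code; the host and query parameter are assumptions, only the path mapping comes from the hunk above):

```python
from urllib.parse import quote

from fraudcrawler.settings import TOPPREISE_SEARCH_PATHS


def toppreise_search_url(term: str, language: str) -> str:
    # Fall back to the "default" path for languages without a dedicated entry.
    path = TOPPREISE_SEARCH_PATHS.get(language, TOPPREISE_SEARCH_PATHS["default"])
    return f"https://www.toppreise.ch/{path}?q={quote(term)}"


print(toppreise_search_url("sildenafil", "de"))  # .../produktsuche?q=sildenafil
print(toppreise_search_url("sildenafil", "it"))  # falls back to .../browse?q=sildenafil
```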
@@ -68,17 +78,19 @@ ENRICHMENT_DEFAULT_LIMIT = 10
 # Zyte settings
 ZYTE_DEFALUT_PROBABILITY_THRESHOLD = 0.1
 
-# Processor settings
-PROCESSOR_DEFAULT_MODEL = "gpt-4o"
-PROCESSOR_DEFAULT_IF_MISSING = -1
-PROCESSOR_EMPTY_TOKEN_COUNT = -1
-PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevance:"
-PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
+# Exact match settings
+EXACT_MATCH_PRODUCT_FIELDS = {
+    "url_resolved",
+    "product_name",
+    "product_description",
+    "html",
+}
+EXACT_MATCH_FIELD_SEPARATOR = "\n"
 
 # Async workers settings
-DEFAULT_N_SERP_WKRS = 10
-DEFAULT_N_ZYTE_WKRS = 10
-DEFAULT_N_PROC_WKRS = 10
+DEFAULT_N_SRCH_WKRS = 2
+DEFAULT_N_CNTX_WKRS = 23
+DEFAULT_N_PROC_WKRS = 5
 
 # HTTPX client settings
 DEFAULT_HTTPX_TIMEOUT = {
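The `EXACT_MATCH_PRODUCT_FIELDS` / `EXACT_MATCH_FIELD_SEPARATOR` pair suggests content-based deduplication on top of the URL-based one. A hypothetical helper (not package code) showing how the two settings could be combined into a stable fingerprint:

```python
import hashlib

from fraudcrawler.settings import (
    EXACT_MATCH_FIELD_SEPARATOR,
    EXACT_MATCH_PRODUCT_FIELDS,
)


def exact_match_key(product) -> str:
    # Sort the field names so the set's iteration order cannot change the key.
    parts = [
        str(getattr(product, field, "") or "")
        for field in sorted(EXACT_MATCH_PRODUCT_FIELDS)
    ]
    joined = EXACT_MATCH_FIELD_SEPARATOR.join(parts)
    return hashlib.sha256(joined.encode("utf-8")).hexdigest()
```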
fraudcrawler-0.7.22.dist-info/METADATA ADDED
@@ -0,0 +1,173 @@
+Metadata-Version: 2.1
+Name: fraudcrawler
+Version: 0.7.22
+Summary: Intelligent Market Monitoring
+Home-page: https://github.com/open-veanu/fraudcrawler
+License: MIT
+Author: Domingo Bertus
+Author-email: hello@veanu.ch
+Requires-Python: >=3.11,<4.0
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
+Requires-Dist: httpx (>=0.28.1,<0.29.0)
+Requires-Dist: openai (>=1.68.2,<2.0.0)
+Requires-Dist: pandas (>=2.2.3,<3.0.0)
+Requires-Dist: pydantic-settings (>=2.8.1,<3.0.0)
+Requires-Dist: requests (>=2.32.3,<3.0.0)
+Requires-Dist: tenacity (>=9.1.2,<10.0.0)
+Project-URL: Repository, https://github.com/open-veanu/fraudcrawler
+Description-Content-Type: text/markdown
+
+# fraudcrawler
+
+![CI Status](https://github.com/open-veanu/fraudcrawler/workflows/CI/badge.svg)
+![Python Version](https://img.shields.io/badge/python-3.11+-blue.svg)
+![License](https://img.shields.io/badge/license-MIT-green.svg)
+![PyPI](https://img.shields.io/pypi/v/fraudcrawler.svg)
+
+Fraudcrawler is an intelligent **market monitoring** tool that searches the web for products, extracts product details, and classifies them using LLMs. It combines search APIs, web scraping, and AI to automate product discovery and relevance assessment.
+
+## Features
+
+- **Asynchronous pipeline** - Products move through search, extraction, and classification stages independently
+- **Multiple search engines** - Google Search, Google Shopping, and more...
+- **Search term enrichment** - Automatically find related terms and expand your search
+- **Product extraction** - Get structured product data via Zyte API
+- **LLM classification** - Assess product relevance using OpenAI API with custom prompts
+- **Marketplace filtering** - Focus searches on specific domains
+- **Deduplication** - Avoid reprocessing previously collected URLs
+- **CSV export** - Results saved with timestamps for easy tracking
+
+## Prerequisites
+
+- Python 3.11 or higher
+- API keys for:
+  - **SerpAPI** - Google search results
+  - **Zyte API** - Product data extraction
+  - **OpenAI API** - Product classification
+  - **DataForSEO** (optional) - Search term enrichment
+
+## Installation
+
+```bash
+python3.11 -m venv .venv
+source .venv/bin/activate
+pip install fraudcrawler
+```
+
+**Using Poetry:**
+```bash
+poetry install
+```
+
+## Configuration
+
+Create a `.env` file with your API credentials (see `.env.example` for template):
+
+```bash
+SERPAPI_KEY=your_serpapi_key
+ZYTEAPI_KEY=your_zyte_key
+OPENAIAPI_KEY=your_openai_key
+DATAFORSEO_USER=your_user  # optional
+DATAFORSEO_PWD=your_pwd  # optional
+```
+
+## Usage
+
+### Basic Configuration
+For a complete working example, see `fraudcrawler/launch_demo_pipeline.py`. After setting up the necessary parameters, you can launch and analyse the results with:
+```python
+# Run pipeline
+await client.run(
+    search_term=search_term,
+    search_engines=search_engines,
+    language=language,
+    location=location,
+    deepness=deepness,
+    excluded_urls=excluded_urls,
+)
+
+# Load results
+df = client.load_results()
+print(df.head())
+```
+
+### Advanced Configuration
+
+**Search term enrichment** - Find and search related terms:
+```python
+from fraudcrawler import Enrichment

+deepness.enrichment = Enrichment(
+    additional_terms=5,
+    additional_urls_per_term=10
+)
+```
+
+**Marketplace filtering** - Focus on specific domains:
+```python
+from fraudcrawler import Host
+
+marketplaces = [
+    Host(name="International", domains="zavamed.com,apomeds.com"),
+    Host(name="National", domains="netdoktor.ch,nobelpharma.ch"),
+]
+
+await client.run(..., marketplaces=marketplaces)
+```
+
+**Exclude domains** - Exclude specific domains from your results:
+```python
+excluded_urls = [
+    Host(name="Compendium", domains="compendium.ch"),
+]
+
+await client.run(..., excluded_urls=excluded_urls)
+```
+
+**Skip previously collected URLs**:
+```python
+previously_collected_urls = [
+    "https://example.com/product1",
+    "https://example.com/product2",
+]
+
+await client.run(..., previously_collected_urls=previously_collected_urls)
+```
+
+**View all results** from a client instance:
+```python
+client.print_available_results()
+```
+
+## Output
+
+Results are saved as CSV files in `data/results/` with the naming pattern:
+
+```
+<search_term>_<language_code>_<location_code>_<timestamp>.csv
+```
+
+Example: `sildenafil_de_ch_20250115143022.csv`
+
+The CSV includes product details, URLs, and classification scores from your workflows.
+
+## Development
+
+For detailed contribution guidelines, see [CONTRIBUTING.md](CONTRIBUTING.md).
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
+
+## Architecture
+
+Fraudcrawler uses an asynchronous pipeline where products can be at different processing stages simultaneously. Product A might be in classification while Product B is still being scraped. This is enabled by async workers for each stage (Search, Context Extraction, Processing) using `httpx.AsyncClient`.
+
+![Async Setup](https://github.com/open-veanu/fraudcrawler/raw/master/docs/assets/images/Fraudcrawler_Async_Setup.svg)
+
+For more details on the async design, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
+
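The Architecture section of the README describes stage workers connected by async queues. A minimal sketch of that pattern (illustrative only, not code from the package; it uses plain asyncio queues and httpx, and the worker counts and placeholder URLs are arbitrary):

```python
import asyncio

import httpx


async def search_worker(client: httpx.AsyncClient, out_q: asyncio.Queue) -> None:
    # Stands in for the search stage: emit a couple of placeholder URLs.
    for url in ["https://example.com/a", "https://example.com/b"]:
        await out_q.put(url)


async def context_worker(client: httpx.AsyncClient, in_q: asyncio.Queue, out_q: asyncio.Queue) -> None:
    # Stands in for context extraction (the real pipeline calls Zyte here).
    while True:
        url = await in_q.get()
        response = await client.get(url)
        await out_q.put((url, response.status_code))
        in_q.task_done()


async def processing_worker(in_q: asyncio.Queue) -> None:
    # Stands in for LLM classification.
    while True:
        url, status = await in_q.get()
        print(f"classified {url} (status={status})")
        in_q.task_done()


async def main() -> None:
    urls: asyncio.Queue = asyncio.Queue()
    contexts: asyncio.Queue = asyncio.Queue()
    async with httpx.AsyncClient() as client:
        await search_worker(client, urls)
        workers = [
            *(asyncio.create_task(context_worker(client, urls, contexts)) for _ in range(2)),
            *(asyncio.create_task(processing_worker(contexts)) for _ in range(2)),
        ]
        await urls.join()      # wait until every URL has been extracted
        await contexts.join()  # ...and every extracted item has been processed
        for task in workers:
            task.cancel()


asyncio.run(main())
```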
fraudcrawler-0.7.22.dist-info/RECORD ADDED
@@ -0,0 +1,23 @@
+fraudcrawler/__init__.py,sha256=4Xzhj6aS9zmjs8KZS9nhFg9YAWOpCX-TAtJ4s32A5Jk,1191
+fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/base/base.py,sha256=QchdTXgrabQKnjP28n_QKjpxHChVpOGQA2cfKFaGLAc,6821
+fraudcrawler/base/client.py,sha256=uPaC1uMpHlMR1ThcTkzsoliwmGuTGH3jyTHLyVNYSHk,5608
+fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+fraudcrawler/base/orchestrator.py,sha256=BklS4DNzxbp7yvE2NvBWrDDqnvT4YO7Xh_WXstYNWYA,26050
+fraudcrawler/base/retry.py,sha256=bCDd44XO2-lHO8MGvPblD5152-lHt1dOfMAQSmymLO4,1462
+fraudcrawler/launch_demo_pipeline.py,sha256=oZWodtNzA5mhmLNYMS6lglry88NutvH4IxnEWOUtL8M,6179
+fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/processing/base.py,sha256=UkoYxFNZ3BQkXmgJnTtruz8-eIFCtWiquRN_IoEXfM4,4091
+fraudcrawler/processing/openai.py,sha256=7sbFg2NPsn627VDzsfIkKantE2KahGmVkSZ1R10OrzQ,19050
+fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/scraping/enrich.py,sha256=dGWi9p0JStQYSGscCnsQPHNlAeqjoL2rXZnHFNmPhaQ,13158
+fraudcrawler/scraping/search.py,sha256=qHeUpzv1IpRhdFvaycGtL3FLOwT8rOiF0PfiOH6BmUA,34561
+fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
+fraudcrawler/scraping/zyte.py,sha256=xSHGKo09sX2dgQBrPI7oeoHsVL4qZ8voQLBXRU1XBqM,11102
+fraudcrawler/settings.py,sha256=q3je0r_jd30x2dzlgfm8GyKcigFdgteOLa8HX188bho,3768
+fraudcrawler-0.7.22.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.7.22.dist-info/METADATA,sha256=D749e0ZWDZSn8pjxvHj7RUf5m0D1_qHzRlZPRFqTE9A,5303
+fraudcrawler-0.7.22.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+fraudcrawler-0.7.22.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.7.22.dist-info/RECORD,,