fraudcrawler 0.5.9__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of fraudcrawler might be problematic. Click here for more details.

@@ -3,6 +3,7 @@ from typing import List, Set, Tuple
3
3
  from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
4
4
 
5
5
  from fraudcrawler.settings import KNOWN_TRACKERS
6
+ from fraudcrawler.base.base import ProductItem
6
7
 
7
8
  logger = logging.getLogger(__name__)
8
9
 
@@ -11,11 +12,19 @@ class URLCollector:
11
12
  """A class to collect and de-duplicate URLs."""
12
13
 
13
14
  def __init__(self):
14
- self.collected_currently: Set[str] = set()
15
- self.collected_previously: Set[str] = set()
15
+ self._collected_currently: Set[str] = set()
16
+ self._collected_previously: Set[str] = set()
17
+
18
+ def add_previously_collected_urls(self, urls: List[str]) -> None:
19
+ """Add a set of previously collected URLs to the internal state.
20
+
21
+ Args:
22
+ urls: A list of URLs that have been collected in previous runs.
23
+ """
24
+ self._collected_previously.update(urls)
16
25
 
17
26
  @staticmethod
18
- def remove_tracking_parameters(url: str) -> str:
27
+ def _remove_tracking_parameters(url: str) -> str:
19
28
  """Remove tracking parameters from URLs.
20
29
 
21
30
  Args:
@@ -55,3 +64,33 @@ class URLCollector:
55
64
  fragment=parsed_url.fragment,
56
65
  )
57
66
  return urlunparse(clean_url)
67
+
68
+ async def apply(self, product: ProductItem) -> ProductItem:
69
+ """Manages the collection and deduplication of ProductItems.
70
+
71
+ Args:
72
+ product: The product item to process.
73
+ """
74
+ logger.debug(f'Processing product with url="{product.url}"')
75
+
76
+ # Remove tracking parameters from the URL
77
+ url = self._remove_tracking_parameters(product.url)
78
+ product.url = url
79
+
80
+ # deduplicate on current run
81
+ if url in self._collected_currently:
82
+ product.filtered = True
83
+ product.filtered_at_stage = "URL collection (current run deduplication)"
84
+ logger.debug(f"URL {url} already collected in current run")
85
+
86
+ # deduplicate on previous runs coming from a db
87
+ elif url in self._collected_previously:
88
+ product.filtered = True
89
+ product.filtered_at_stage = "URL collection (previous run deduplication)"
90
+ logger.debug(f"URL {url} was already collected in previous run")
91
+
92
+ # Add to currently collected URLs
93
+ else:
94
+ self._collected_currently.add(url)
95
+
96
+ return product
@@ -1,6 +1,6 @@
1
+ from base64 import b64decode
1
2
  import logging
2
3
  from typing import List
3
- from base64 import b64decode
4
4
 
5
5
  import httpx
6
6
  from tenacity import RetryCallState
@@ -242,3 +242,17 @@ class ZyteAPI(DomainUtils):
242
242
  decoded_string = decoded_bytes.decode("utf-8")
243
243
  return decoded_string
244
244
  return None
245
+
246
+ async def unblock_url_content(self, url: str) -> bytes:
247
+ """Unblock the content of a URL using Zyte proxy mode.
248
+
249
+ Args:
250
+ url: The URL to fetch using Zyte proxy mode.
251
+ """
252
+ logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
253
+ details = await self.details(url)
254
+
255
+ if not details or "httpResponseBody" not in details:
256
+ raise httpx.HTTPError("No httpResponseBody in Zyte response")
257
+
258
+ return b64decode(details["httpResponseBody"])
fraudcrawler/settings.py CHANGED
@@ -14,12 +14,22 @@ RETRY_EXP_BASE = 4
14
14
  RETRY_JITTER = 1
15
15
  RETRY_SKIP_IF_CODE = [400, 401, 403] # Skip retrying on these HTTP status codes
16
16
 
17
- # Serp settings
17
+ # Search settings
18
18
  GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
19
19
  GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
20
20
  SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
21
21
  # ".com",
22
22
  ]
23
+ TOPPREISE_SEARCH_PATHS = {
24
+ "de": "produktsuche",
25
+ "fr": "chercher",
26
+ "default": "browse",
27
+ }
28
+ TOPPREISE_COMPARISON_PATHS = [
29
+ "preisvergleich",
30
+ "comparison-prix",
31
+ "price-comparison",
32
+ ]
23
33
 
24
34
  # URL De-duplication settings
25
35
  KNOWN_TRACKERS = [
@@ -76,8 +86,8 @@ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevan
76
86
  PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
77
87
 
78
88
  # Async workers settings
79
- DEFAULT_N_SERP_WKRS = 10
80
- DEFAULT_N_ZYTE_WKRS = 10
89
+ DEFAULT_N_SRCH_WKRS = 5
90
+ DEFAULT_N_CNTX_WKRS = 15
81
91
  DEFAULT_N_PROC_WKRS = 10
82
92
 
83
93
  # HTTPX client settings
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.3
2
2
  Name: fraudcrawler
3
- Version: 0.5.9
3
+ Version: 0.6.1
4
4
  Summary: Intelligent Market Monitoring
5
5
  Home-page: https://github.com/open-veanu/fraudcrawler
6
6
  License: MIT
@@ -11,6 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
14
15
  Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
15
16
  Requires-Dist: httpx (>=0.28.1,<0.29.0)
16
17
  Requires-Dist: openai (>=1.68.2,<2.0.0)
@@ -160,7 +161,7 @@ see `CONTRIBUTING.md`
160
161
  ### Async Setup
161
162
  The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
162
163
 
163
- This behavior is enabled through an asynchronous pipeline setup. The three main steps, `SerpAPI`, `ZyteAPI`, and `Processor`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
164
+ This behavior is enabled through an asynchronous pipeline setup. The three main steps, `Search`, `Context Extraction`, and `Processing`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass. For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
164
165
 
165
166
  The following image provides a schematic representation of the package's async setup.
166
167
  ![Async Setup](https://github.com/open-veanu/fraudcrawler/raw/master/docs/assets/images/Fraudcrawler_Async_Setup.svg)
@@ -0,0 +1,22 @@
1
+ fraudcrawler/__init__.py,sha256=oSwuiyVBBk_HZfeZxXJR0ELtA4mc-upsBMVHSwuokEo,846
2
+ fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ fraudcrawler/base/base.py,sha256=74qwevU8sZBvXAladam0rmjcdn3AiT39MScpxZtD95I,7727
4
+ fraudcrawler/base/client.py,sha256=obxrd65pYja--XQbgpIMsMO6erMNdRG68SzNUs_YvLM,5856
5
+ fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
6
+ fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
7
+ fraudcrawler/base/orchestrator.py,sha256=n0xrMJ9a3g3cRAMmhKEgyrwwrbgsaMno9DeyE93jB5U,27006
8
+ fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
9
+ fraudcrawler/launch_demo_pipeline.py,sha256=TqlQrs8raT9jIJ3TJK3BOQMLm2qNn2dKaMGL-MyhC70,4635
10
+ fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
12
+ fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
14
+ fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
15
+ fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
16
+ fraudcrawler/scraping/zyte.py,sha256=SxucVH_wtVhPNImIXvijM528IwL6zl6I3ndf0OdVXY0,8860
17
+ fraudcrawler/settings.py,sha256=Bp9_9w_RRr_-PtZXcy30EKbT9YiOc8OLjEMaNZh06vc,3875
18
+ fraudcrawler-0.6.1.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
19
+ fraudcrawler-0.6.1.dist-info/METADATA,sha256=_LcfOKayMQjAXoCxlJfqYtiSfitegUuQgFUD5XEGFog,6704
20
+ fraudcrawler-0.6.1.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
21
+ fraudcrawler-0.6.1.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
22
+ fraudcrawler-0.6.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: poetry-core 2.0.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,22 +0,0 @@
1
- fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
2
- fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- fraudcrawler/base/base.py,sha256=S5VP1PD2tymZMmVcVmP4FI0VlCQxDCB2gp0vVD6eN8g,8543
4
- fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
5
- fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
6
- fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
7
- fraudcrawler/base/orchestrator.py,sha256=28X45XLPlJe2hvff8HTLo-V08LNeS0zMWBHe5W3hk4c,27039
8
- fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
9
- fraudcrawler/launch_demo_pipeline.py,sha256=hTzGFQDEwchDSwUx0HgG_TW5h9J7BXM7jn_iB8iI838,4636
10
- fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
12
- fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
- fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
14
- fraudcrawler/scraping/search.py,sha256=5fTRo7Tkkz9REU-u50oZu4kbynjIcIigp9HZ0cS_UCg,25510
15
- fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
16
- fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
17
- fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
18
- fraudcrawler-0.5.9.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
19
- fraudcrawler-0.5.9.dist-info/METADATA,sha256=sMszTwcTVxuMGfRhJJZKLAPShnY4zDU6gShAa6k3tPg,6642
20
- fraudcrawler-0.5.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
21
- fraudcrawler-0.5.9.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
22
- fraudcrawler-0.5.9.dist-info/RECORD,,