fraudcrawler 0.5.9__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of fraudcrawler might be problematic.
- fraudcrawler/__init__.py +2 -2
- fraudcrawler/base/base.py +11 -32
- fraudcrawler/base/client.py +1 -1
- fraudcrawler/base/orchestrator.py +135 -135
- fraudcrawler/base/retry.py +12 -6
- fraudcrawler/launch_demo_pipeline.py +1 -1
- fraudcrawler/processing/processor.py +3 -3
- fraudcrawler/scraping/search.py +352 -125
- fraudcrawler/scraping/url.py +42 -3
- fraudcrawler/scraping/zyte.py +15 -1
- fraudcrawler/settings.py +13 -3
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/METADATA +4 -3
- fraudcrawler-0.6.1.dist-info/RECORD +22 -0
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/WHEEL +1 -1
- fraudcrawler-0.5.9.dist-info/RECORD +0 -22
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/LICENSE +0 -0
- {fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/entry_points.txt +0 -0
fraudcrawler/scraping/url.py
CHANGED

@@ -3,6 +3,7 @@ from typing import List, Set, Tuple
 from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
 
 from fraudcrawler.settings import KNOWN_TRACKERS
+from fraudcrawler.base.base import ProductItem
 
 logger = logging.getLogger(__name__)
 
@@ -11,11 +12,19 @@ class URLCollector:
     """A class to collect and de-duplicate URLs."""
 
     def __init__(self):
-        self.
-        self.
+        self._collected_currently: Set[str] = set()
+        self._collected_previously: Set[str] = set()
+
+    def add_previously_collected_urls(self, urls: List[str]) -> None:
+        """Add a set of previously collected URLs to the internal state.
+
+        Args:
+            urls: A set of URLs that have been collected in previous runs.
+        """
+        self._collected_previously.update(urls)
 
     @staticmethod
-    def
+    def _remove_tracking_parameters(url: str) -> str:
         """Remove tracking parameters from URLs.
 
         Args:
@@ -55,3 +64,33 @@ class URLCollector:
             fragment=parsed_url.fragment,
         )
         return urlunparse(clean_url)
+
+    async def apply(self, product: ProductItem) -> ProductItem:
+        """Manages the collection and deduplication of ProductItems.
+
+        Args:
+            product: The product item to process.
+        """
+        logger.debug(f'Processing product with url="{product.url}"')
+
+        # Remove tracking parameters from the URL
+        url = self._remove_tracking_parameters(product.url)
+        product.url = url
+
+        # deduplicate on current run
+        if url in self._collected_currently:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (current run deduplication)"
+            logger.debug(f"URL {url} already collected in current run")
+
+        # deduplicate on previous runs coming from a db
+        elif url in self._collected_previously:
+            product.filtered = True
+            product.filtered_at_stage = "URL collection (previous run deduplication)"
+            logger.debug(f"URL {url} as already collected in previous run")
+
+        # Add to currently collected URLs
+        else:
+            self._collected_currently.add(url)
+
+        return product
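A minimal usage sketch of the new deduplication flow, based only on the signatures visible in this diff. The ProductItem constructor arguments and default field values are assumptions for illustration; check fraudcrawler/base/base.py for the real model.

import asyncio

from fraudcrawler.base.base import ProductItem
from fraudcrawler.scraping.url import URLCollector


async def main() -> None:
    collector = URLCollector()

    # Seed the collector with URLs already stored from earlier runs (e.g. loaded from a DB).
    collector.add_previously_collected_urls(["https://example.com/product/1"])

    # Hypothetical ProductItem construction; the real required fields may differ.
    item = ProductItem(url="https://example.com/product/1?utm_source=newsletter")

    # apply() strips tracking parameters; if the cleaned URL matches a previously
    # collected one, the item is marked as filtered with the matching stage label.
    item = await collector.apply(item)
    print(item.filtered, item.filtered_at_stage)


asyncio.run(main())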
fraudcrawler/scraping/zyte.py
CHANGED

@@ -1,6 +1,6 @@
+from base64 import b64decode
 import logging
 from typing import List
-from base64 import b64decode
 
 import httpx
 from tenacity import RetryCallState
@@ -242,3 +242,17 @@ class ZyteAPI(DomainUtils):
             decoded_string = decoded_bytes.decode("utf-8")
             return decoded_string
         return None
+
+    async def unblock_url_content(self, url: str) -> bytes:
+        """Unblock the content of an URL using Zyte proxy mode.
+
+        Args:
+            url: The URL to fetch using Zyte proxy mode.
+        """
+        logger.debug(f'Unblock URL content using Zyte proxy for url="{url}"')
+        details = await self.details(url)
+
+        if not details or "httpResponseBody" not in details:
+            raise httpx.HTTPError("No httpResponseBody in Zyte response")
+
+        return b64decode(details["httpResponseBody"])
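A short sketch of how the new unblock_url_content helper could be called. The ZyteAPI constructor arguments are not part of this diff and are assumed here; only the method itself and its behaviour come from the diff above.

import asyncio

from fraudcrawler.scraping.zyte import ZyteAPI


async def fetch_raw(url: str) -> bytes:
    # Constructor arguments are assumed; check the ZyteAPI class for the real signature.
    zyte = ZyteAPI(api_key="...")

    # unblock_url_content() calls self.details(url) and base64-decodes the
    # httpResponseBody field, raising httpx.HTTPError when it is missing.
    return await zyte.unblock_url_content(url)


raw_html = asyncio.run(fetch_raw("https://example.com/listing"))
print(len(raw_html), "bytes fetched via Zyte proxy mode")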
fraudcrawler/settings.py
CHANGED

@@ -14,12 +14,22 @@ RETRY_EXP_BASE = 4
 RETRY_JITTER = 1
 RETRY_SKIP_IF_CODE = [400, 401, 403] # Skip retrying on these HTTP status codes
 
-#
+# Search settings
 GOOGLE_LOCATIONS_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-locations.json"
 GOOGLE_LANGUAGES_FILENAME = ROOT_DIR / "fraudcrawler" / "base" / "google-languages.json"
 SEARCH_DEFAULT_COUNTRY_CODES: List[str] = [
     # ".com",
 ]
+TOPPREISE_SEARCH_PATHS = {
+    "de": "produktsuche",
+    "fr": "chercher",
+    "default": "browse",
+}
+TOPPREISE_COMPARISON_PATHS = [
+    "preisvergleich",
+    "comparison-prix",
+    "price-comparison",
+]
 
 # URL De-duplication settings
 KNOWN_TRACKERS = [
@@ -76,8 +86,8 @@ PROCESSOR_USER_PROMPT_TEMPLATE = "Product Details:\n{product_details}\n\nRelevan
 PROCESSOR_PRODUCT_DETAILS_TEMPLATE = "{field_name}:\n{field_value}"
 
 # Async workers settings
-
-
+DEFAULT_N_SRCH_WKRS = 5
+DEFAULT_N_CNTX_WKRS = 15
 DEFAULT_N_PROC_WKRS = 10
 
 # HTTPX client settings
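The new TOPPREISE_* settings suggest that search.py now builds language-specific Toppreise URLs and recognises price-comparison pages by their localised path segments. The sketch below only illustrates how these constants could be consumed; the toppreise.ch domain, the "q" query parameter, and the exact matching logic in search.py are assumptions, not code from this release.

from urllib.parse import quote

from fraudcrawler.settings import TOPPREISE_SEARCH_PATHS, TOPPREISE_COMPARISON_PATHS


def toppreise_search_url(query: str, language: str) -> str:
    # Pick the language-specific search path, falling back to the "default" entry.
    path = TOPPREISE_SEARCH_PATHS.get(language, TOPPREISE_SEARCH_PATHS["default"])
    # Base URL and query parameter are assumed for illustration only.
    return f"https://www.toppreise.ch/{path}?q={quote(query)}"


def is_comparison_page(url: str) -> bool:
    # Comparison pages can be recognised by the localised path segments.
    return any(segment in url for segment in TOPPREISE_COMPARISON_PATHS)


print(toppreise_search_url("insulin pen", "de"))
print(is_comparison_page("https://www.toppreise.ch/preisvergleich/12345"))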
{fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: fraudcrawler
-Version: 0.5.9
+Version: 0.6.1
 Summary: Intelligent Market Monitoring
 Home-page: https://github.com/open-veanu/fraudcrawler
 License: MIT
@@ -11,6 +11,7 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: beautifulsoup4 (>=4.13.4,<5.0.0)
 Requires-Dist: httpx (>=0.28.1,<0.29.0)
 Requires-Dist: openai (>=1.68.2,<2.0.0)
@@ -160,7 +161,7 @@ see `CONTRIBUTING.md`
 ### Async Setup
 The `Orchestrator` class in `src/base/orchestrator.py` is designed to coordinate multiple services that may have interdependencies, allowing them to run in a semi-iterative manner. This means, for example, that product A can be at stage III of the pipeline while product B is still at stage I.
 
-This behavior is enabled through an asynchronous pipeline setup. The three main steps, `
+This behavior is enabled through an asynchronous pipeline setup. The three main steps, `Search`, `Context Extraction`, and `Processing`, all utilize `httpx.AsyncClient`. It is both possible and highly recommended to manage a single AsyncClient instance per application for efficiency. We provide a `HttpxAsyncClient` class that you can pass For more details, see the [httpx documentation](https://www.python-httpx.org/api/#asyncclient).
 
 The following image provides a schematic representation of the package's async setup.
 
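The README excerpt above recommends one shared httpx.AsyncClient for all three pipeline stages. A minimal sketch of that pattern follows; the fraudcrawler-specific HttpxAsyncClient wrapper and the Orchestrator wiring are not shown in this diff, so plain httpx and example URLs are used instead.

import asyncio

import httpx


async def main() -> None:
    # One AsyncClient for the whole application, as the README recommends.
    async with httpx.AsyncClient(timeout=30) as client:
        search_task = client.get("https://example.com/search?q=demo")
        detail_task = client.get("https://example.com/product/1")
        # Requests from different pipeline stages can be in flight concurrently,
        # which is what lets product A reach a later stage while product B lags behind.
        responses = await asyncio.gather(search_task, detail_task)
        print([r.status_code for r in responses])


asyncio.run(main())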
fraudcrawler-0.6.1.dist-info/RECORD
ADDED

@@ -0,0 +1,22 @@
+fraudcrawler/__init__.py,sha256=oSwuiyVBBk_HZfeZxXJR0ELtA4mc-upsBMVHSwuokEo,846
+fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/base/base.py,sha256=74qwevU8sZBvXAladam0rmjcdn3AiT39MScpxZtD95I,7727
+fraudcrawler/base/client.py,sha256=obxrd65pYja--XQbgpIMsMO6erMNdRG68SzNUs_YvLM,5856
+fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
+fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
+fraudcrawler/base/orchestrator.py,sha256=n0xrMJ9a3g3cRAMmhKEgyrwwrbgsaMno9DeyE93jB5U,27006
+fraudcrawler/base/retry.py,sha256=1Ox7RsnnF62dP53rkidRHetA5mr2HS1R-7FskCVbwug,1178
+fraudcrawler/launch_demo_pipeline.py,sha256=TqlQrs8raT9jIJ3TJK3BOQMLm2qNn2dKaMGL-MyhC70,4635
+fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/processing/processor.py,sha256=zetp_G5g4z8sBUq-5qOxVRF2W2h9FIwolVxvMqhTmXs,7619
+fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
+fraudcrawler/scraping/search.py,sha256=Anm8ymjCH3BVttogHY-_03YRc64yJswJ8OP8DW56O48,34546
+fraudcrawler/scraping/url.py,sha256=unUoZ-bThU99ZlLdDUILdPx1kbtwMWPZVPCDqPscqHw,3217
+fraudcrawler/scraping/zyte.py,sha256=SxucVH_wtVhPNImIXvijM528IwL6zl6I3ndf0OdVXY0,8860
+fraudcrawler/settings.py,sha256=Bp9_9w_RRr_-PtZXcy30EKbT9YiOc8OLjEMaNZh06vc,3875
+fraudcrawler-0.6.1.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
+fraudcrawler-0.6.1.dist-info/METADATA,sha256=_LcfOKayMQjAXoCxlJfqYtiSfitegUuQgFUD5XEGFog,6704
+fraudcrawler-0.6.1.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
+fraudcrawler-0.6.1.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
+fraudcrawler-0.6.1.dist-info/RECORD,,
fraudcrawler-0.5.9.dist-info/RECORD
DELETED

@@ -1,22 +0,0 @@
-fraudcrawler/__init__.py,sha256=Kr19jWhtbC1shVoB9fHvBSeoG1IyQB9re1kCZ4YIAi0,842
-fraudcrawler/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/base/base.py,sha256=S5VP1PD2tymZMmVcVmP4FI0VlCQxDCB2gp0vVD6eN8g,8543
-fraudcrawler/base/client.py,sha256=yhkNrhL2SuJXTknLf-8P81fv01FnFMahREZgem-Z-f0,5832
-fraudcrawler/base/google-languages.json,sha256=z0VtjMCsCcZq11OkCIb9jMDD1p9Ty4lhV7bq4ddYvec,10748
-fraudcrawler/base/google-locations.json,sha256=UtNu0iSStllvFRTQXMobWKmZR1hKmtgtHftLNgaJTT0,9204345
-fraudcrawler/base/orchestrator.py,sha256=28X45XLPlJe2hvff8HTLo-V08LNeS0zMWBHe5W3hk4c,27039
-fraudcrawler/base/retry.py,sha256=9VyVrbYR_0YnfxFhUrvcM3aWCYR6oR4iZE4A3zzVZUs,1015
-fraudcrawler/launch_demo_pipeline.py,sha256=hTzGFQDEwchDSwUx0HgG_TW5h9J7BXM7jn_iB8iI838,4636
-fraudcrawler/processing/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/processing/processor.py,sha256=Qq8QcTlqfnzFi1t-1KkncXxaIszUO7pGK3LXTdHkDnM,7638
-fraudcrawler/scraping/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-fraudcrawler/scraping/enrich.py,sha256=M4ErLF7q-5EKfEd-dIDS73mZc-aPFp5pJbgdRKCO3f8,13112
-fraudcrawler/scraping/search.py,sha256=5fTRo7Tkkz9REU-u50oZu4kbynjIcIigp9HZ0cS_UCg,25510
-fraudcrawler/scraping/url.py,sha256=5Z3hPW73E-TLhM-Zha8OTcUOumc_rcx64R0fT9z2Hi8,1748
-fraudcrawler/scraping/zyte.py,sha256=GqvVWA1AWVoClAwd-hQ9iynsT0dOb7R0ntaLK5XVivM,8340
-fraudcrawler/settings.py,sha256=uwXMOQpuwyWkuMU0asYGtBlL_qJj8F-Xkg4dUaCmDxE,3670
-fraudcrawler-0.5.9.dist-info/LICENSE,sha256=B-3FuHfe3S0fWAlKlceskPcRhzXq81g-rJ-ddUYb4O8,1062
-fraudcrawler-0.5.9.dist-info/METADATA,sha256=sMszTwcTVxuMGfRhJJZKLAPShnY4zDU6gShAa6k3tPg,6642
-fraudcrawler-0.5.9.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
-fraudcrawler-0.5.9.dist-info/entry_points.txt,sha256=1Befm7cM6945y2AA1z9V4gZV63mtSWcAs7ypvgux_Xg,79
-fraudcrawler-0.5.9.dist-info/RECORD,,
{fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/LICENSE
File without changes

{fraudcrawler-0.5.9.dist-info → fraudcrawler-0.6.1.dist-info}/entry_points.txt
File without changes