webtoolkit 0.0.192__tar.gz → 0.0.194__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/PKG-INFO +3 -2
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/pyproject.toml +1 -1
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/baseurl.py +6 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/remoteserver.py +13 -7
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/remoteurl.py +5 -2
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/LICENSE +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/README.md +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/__init__.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/contentinterface.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/contentlinkparser.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/contentmoderation.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/contenttext.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/crawlers/__init__.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/crawlers/crawlerinterface.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/crawlers/requestscrawler.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/domaincache.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/__init__.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/defaulturlhandler.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/handlerchannelodysee.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/handlerchannelyoutube.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/handlerhttppage.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/handlerinterface.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/handlers.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/handlervideoodysee.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/handlers/handlervideoyoutube.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/pages.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/request.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/response.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/statuses.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/__init__.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/__init__.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/codeproject.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/firebog.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/geekwirecom.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/githubcom.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/hackernews.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/instance.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/opmlfile.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/reddit.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/returndislike.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/robotstxtcom.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/thehill.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/warhammercommunity.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fake/youtube.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fakeinternet.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fakeinternetcontents.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/fakeresponse.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/tests/mocks.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/urllocation.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/utils/dateutils.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/utils/logger.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/webconfig.py +0 -0
- {webtoolkit-0.0.192 → webtoolkit-0.0.194}/webtoolkit/webtools.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
2
|
Name: webtoolkit
|
|
3
|
-
Version: 0.0.192
|
|
3
|
+
Version: 0.0.194
|
|
4
4
|
Summary: Web tools and interfaces for Internet data processing.
|
|
5
5
|
License: GPL3
|
|
6
6
|
Author: Iwan Grozny
|
|
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3.9
|
|
|
12
12
|
Classifier: Programming Language :: Python :: 3.10
|
|
13
13
|
Classifier: Programming Language :: Python :: 3.11
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
16
|
Requires-Dist: beautifulsoup4 (>=4.13.5,<5.0.0)
|
|
16
17
|
Requires-Dist: brutefeedparser (>=0.10.5,<0.11.0)
|
|
17
18
|
Requires-Dist: lxml (>=5.4.0,<6.0.0)
|
|
@@ -94,12 +94,18 @@ class BaseUrl(ContentInterface):
|
|
|
94
94
|
request.crawler_name = "RequestsCrawler"
|
|
95
95
|
request.crawler_type = RequestsCrawler(url)
|
|
96
96
|
|
|
97
|
+
if request.timeout_s is None or request.timeout_s == 0:
|
|
98
|
+
request.timeout_s = 60 * 5
|
|
99
|
+
|
|
97
100
|
return request
|
|
98
101
|
|
|
99
102
|
def get_request_for_request(self, request):
|
|
100
103
|
request.crawler_name = "RequestsCrawler"
|
|
101
104
|
request.crawler_type = RequestsCrawler(request.url)
|
|
102
105
|
|
|
106
|
+
if request.timeout_s is None or request.timeout_s == 0:
|
|
107
|
+
request.timeout_s = 60 * 5
|
|
108
|
+
|
|
103
109
|
return request
|
|
104
110
|
|
|
105
111
|
def get_handlers(self):
|
|
@@ -25,15 +25,16 @@ class RemoteServer(object):
|
|
|
25
25
|
def __init__(self, remote_server=None, timeout_s=30):
|
|
26
26
|
self.remote_server = remote_server
|
|
27
27
|
if not self.remote_server:
|
|
28
|
-
|
|
29
|
-
CRAWLER_BUDDY_PORT = os.environ.get("CRAWLER_BUDDY_PORT")
|
|
30
|
-
if CRAWLER_BUDDY_SERVER and CRAWLER_BUDDY_PORT:
|
|
31
|
-
self.remote_server = (
|
|
32
|
-
f"http://{CRAWLER_BUDDY_SERVER}:{CRAWLER_BUDDY_PORT}"
|
|
33
|
-
)
|
|
28
|
+
self.remote_server = RemoteServer.get_remote_server_location()
|
|
34
29
|
|
|
35
30
|
self.timeout_s = timeout_s
|
|
36
31
|
|
|
32
|
+
def get_remote_server_location():
|
|
33
|
+
CRAWLER_BUDDY_SERVER = os.environ.get("CRAWLER_BUDDY_SERVER")
|
|
34
|
+
CRAWLER_BUDDY_PORT = os.environ.get("CRAWLER_BUDDY_PORT")
|
|
35
|
+
if CRAWLER_BUDDY_SERVER and CRAWLER_BUDDY_PORT:
|
|
36
|
+
return f"http://{CRAWLER_BUDDY_SERVER}:{CRAWLER_BUDDY_PORT}"
|
|
37
|
+
|
|
37
38
|
def get_getj(self, request=None, url=None):
|
|
38
39
|
"""
|
|
39
40
|
@returns None in case of error
|
|
@@ -145,6 +146,9 @@ class RemoteServer(object):
|
|
|
145
146
|
"""
|
|
146
147
|
@param link_call Remote server endpoint
|
|
147
148
|
@param url Url for which we call Remote server
|
|
149
|
+
|
|
150
|
+
Note: there should always be a timeout. Server might stop responding,
|
|
151
|
+
it could have hanged, etc.
|
|
148
152
|
"""
|
|
149
153
|
url = request.url
|
|
150
154
|
|
|
@@ -154,8 +158,10 @@ class RemoteServer(object):
|
|
|
154
158
|
|
|
155
159
|
text = None
|
|
156
160
|
|
|
157
|
-
|
|
161
|
+
# it is hard to think of a good deafult value
|
|
162
|
+
timeout_s = 60
|
|
158
163
|
if request.timeout_s is not None:
|
|
164
|
+
# remote server will have timeout_s we add some wiggle room for transmission
|
|
159
165
|
timeout_s = request.timeout_s
|
|
160
166
|
timeout_s += 5
|
|
161
167
|
|
|
@@ -43,8 +43,8 @@ class RemoteUrl(ContentInterface):
|
|
|
43
43
|
"""
|
|
44
44
|
super().__init__(url=url, contents=None)
|
|
45
45
|
self.request = request
|
|
46
|
-
self.remote_server_location
|
|
47
|
-
self.server = RemoteServer(remote_server_location)
|
|
46
|
+
self.remote_server_location=remote_server_location
|
|
47
|
+
self.server = RemoteServer(remote_server=self.remote_server_location)
|
|
48
48
|
self.all_properties = all_properties
|
|
49
49
|
self.social_properties = social_properties
|
|
50
50
|
|
|
@@ -52,6 +52,9 @@ class RemoteUrl(ContentInterface):
|
|
|
52
52
|
if self.all_properties:
|
|
53
53
|
self.get_responses()
|
|
54
54
|
|
|
55
|
+
def get_remote_server_location():
|
|
56
|
+
return RemoteServer.get_remote_server_location()
|
|
57
|
+
|
|
55
58
|
def get_responses(self):
|
|
56
59
|
"""Provides URL responses"""
|
|
57
60
|
if self.all_properties is None:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|