cobweb-launcher 1.2.41__py3-none-any.whl → 1.2.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/crawlers/crawler.py +26 -22
- {cobweb_launcher-1.2.41.dist-info → cobweb_launcher-1.2.43.dist-info}/METADATA +1 -1
- {cobweb_launcher-1.2.41.dist-info → cobweb_launcher-1.2.43.dist-info}/RECORD +6 -6
- {cobweb_launcher-1.2.41.dist-info → cobweb_launcher-1.2.43.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.41.dist-info → cobweb_launcher-1.2.43.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.2.41.dist-info → cobweb_launcher-1.2.43.dist-info}/top_level.txt +0 -0
cobweb/crawlers/crawler.py
CHANGED
@@ -6,7 +6,9 @@ from inspect import isgenerator
|
|
6
6
|
from typing import Union, Callable, Mapping
|
7
7
|
from urllib.parse import urlparse
|
8
8
|
|
9
|
+
import urllib3
|
9
10
|
from requests import HTTPError, Response as Res
|
11
|
+
from requests.exceptions import ChunkedEncodingError
|
10
12
|
|
11
13
|
from cobweb.constant import DealModel, LogTemplate
|
12
14
|
from cobweb.base import (
|
@@ -164,29 +166,19 @@ class Crawler(threading.Thread):
|
|
164
166
|
|
165
167
|
if not iterator_status:
|
166
168
|
raise ValueError("request/download/parse function yield value error!")
|
167
|
-
except
|
168
|
-
|
169
|
+
except Exception as e:
|
170
|
+
exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
171
|
+
url = seed.url
|
172
|
+
status = str(e)
|
173
|
+
if getattr(e, "response", None) and isinstance(e.response, Res):
|
169
174
|
url = e.response.request.url
|
170
175
|
status = e.response.status_code
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
)
|
178
|
-
logger.info(LogTemplate.download_exception.format(
|
179
|
-
detail=seed_detail_log_info,
|
180
|
-
retry=seed.params.retry,
|
181
|
-
priority=seed.params.priority,
|
182
|
-
seed_version=seed.params.seed_version,
|
183
|
-
identifier=seed.identifier or "",
|
184
|
-
exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
185
|
-
))
|
186
|
-
seed.params.retry += 1
|
187
|
-
self._set_seed(seed)
|
188
|
-
time.sleep(self.time_sleep * seed.params.retry)
|
189
|
-
except Exception as e:
|
176
|
+
self.loghub_dot.build(
|
177
|
+
topic=urlparse(url).netloc,
|
178
|
+
data_size=-1, cost_time=-1,
|
179
|
+
status=status, url=url,
|
180
|
+
msg=exception_msg
|
181
|
+
)
|
190
182
|
logger.info(LogTemplate.download_exception.format(
|
191
183
|
detail=seed_detail_log_info,
|
192
184
|
retry=seed.params.retry,
|
@@ -196,9 +188,21 @@ class Crawler(threading.Thread):
|
|
196
188
|
exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
197
189
|
))
|
198
190
|
seed.params.retry += 1
|
199
|
-
# self._todo.push(seed)
|
200
191
|
self._set_seed(seed)
|
201
192
|
# time.sleep(self.time_sleep * seed.params.retry)
|
193
|
+
# except Exception as e:
|
194
|
+
# logger.info(LogTemplate.download_exception.format(
|
195
|
+
# detail=seed_detail_log_info,
|
196
|
+
# retry=seed.params.retry,
|
197
|
+
# priority=seed.params.priority,
|
198
|
+
# seed_version=seed.params.seed_version,
|
199
|
+
# identifier=seed.identifier or "",
|
200
|
+
# exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
201
|
+
# ))
|
202
|
+
# seed.params.retry += 1
|
203
|
+
# # self._todo.push(seed)
|
204
|
+
# self._set_seed(seed)
|
205
|
+
# # time.sleep(self.time_sleep * seed.params.retry)
|
202
206
|
finally:
|
203
207
|
time.sleep(0.1)
|
204
208
|
logger.info("spider thread close")
|
@@ -13,7 +13,7 @@ cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
|
|
13
13
|
cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
|
14
14
|
cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
|
15
15
|
cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
|
16
|
-
cobweb/crawlers/crawler.py,sha256=
|
16
|
+
cobweb/crawlers/crawler.py,sha256=pEukp5tC-axkzmcagPIpWPgmpxP0NHC1eu8iyJDFegA,8537
|
17
17
|
cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
|
18
18
|
cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
|
19
19
|
cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
|
@@ -37,8 +37,8 @@ cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
|
37
37
|
cobweb/utils/dotting.py,sha256=PgsWdM-724Jy-MZWUsaygNWV-huqLMmdLgop7gaBxlo,872
|
38
38
|
cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
|
39
39
|
cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
|
40
|
-
cobweb_launcher-1.2.41.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
41
|
-
cobweb_launcher-1.2.
|
42
|
-
cobweb_launcher-1.2.41.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
43
|
-
cobweb_launcher-1.2.41.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
44
|
-
cobweb_launcher-1.2.41.dist-info/RECORD,,
|
40
|
+
cobweb_launcher-1.2.43.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
41
|
+
cobweb_launcher-1.2.43.dist-info/METADATA,sha256=GcKa3nUwsRKVxxoe2lKqHylsTYtXHxbveUMAizWtdJc,6510
|
42
|
+
cobweb_launcher-1.2.43.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
43
|
+
cobweb_launcher-1.2.43.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
44
|
+
cobweb_launcher-1.2.43.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|