cobweb-launcher 1.2.43__py3-none-any.whl → 1.2.45__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- cobweb/crawlers/crawler.py +8 -6
- cobweb/utils/dotting.py +38 -10
- {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/METADATA +1 -1
- {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/RECORD +7 -7
- {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/top_level.txt +0 -0
cobweb/crawlers/crawler.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import json
|
2
|
+
import os
|
2
3
|
import threading
|
3
4
|
import time
|
4
5
|
import traceback
|
@@ -6,9 +7,7 @@ from inspect import isgenerator
|
|
6
7
|
from typing import Union, Callable, Mapping
|
7
8
|
from urllib.parse import urlparse
|
8
9
|
|
9
|
-
import
|
10
|
-
from requests import HTTPError, Response as Res
|
11
|
-
from requests.exceptions import ChunkedEncodingError
|
10
|
+
from requests import Response as Res
|
12
11
|
|
13
12
|
from cobweb.constant import DealModel, LogTemplate
|
14
13
|
from cobweb.base import (
|
@@ -20,6 +19,7 @@ from cobweb.base import (
|
|
20
19
|
logger
|
21
20
|
)
|
22
21
|
from cobweb.utils import LoghubDot
|
22
|
+
proxy_type = os.getenv("PROXY_TYPE", "")
|
23
23
|
|
24
24
|
|
25
25
|
class Crawler(threading.Thread):
|
@@ -149,7 +149,7 @@ class Crawler(threading.Thread):
|
|
149
149
|
topic=urlparse(download_item.response.request.url).netloc,
|
150
150
|
data_size=int(download_item.response.headers.get("content-length", 0)),
|
151
151
|
cost_time=end_time - start_time, status = 200,
|
152
|
-
url=download_item.response.url,
|
152
|
+
url=download_item.response.url, proxy_type=proxy_type,
|
153
153
|
)
|
154
154
|
parse_iterators = self.parse(download_item)
|
155
155
|
if not isgenerator(parse_iterators):
|
@@ -169,7 +169,7 @@ class Crawler(threading.Thread):
|
|
169
169
|
except Exception as e:
|
170
170
|
exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
171
171
|
url = seed.url
|
172
|
-
status =
|
172
|
+
status = e.__class__.__name__
|
173
173
|
if getattr(e, "response", None) and isinstance(e.response, Res):
|
174
174
|
url = e.response.request.url
|
175
175
|
status = e.response.status_code
|
@@ -177,7 +177,8 @@ class Crawler(threading.Thread):
|
|
177
177
|
topic=urlparse(url).netloc,
|
178
178
|
data_size=-1, cost_time=-1,
|
179
179
|
status=status, url=url,
|
180
|
-
|
180
|
+
proxy_type=proxy_type,
|
181
|
+
msg=exception_msg,
|
181
182
|
)
|
182
183
|
logger.info(LogTemplate.download_exception.format(
|
183
184
|
detail=seed_detail_log_info,
|
@@ -208,6 +209,7 @@ class Crawler(threading.Thread):
|
|
208
209
|
logger.info("spider thread close")
|
209
210
|
|
210
211
|
def run(self):
|
212
|
+
threading.Thread(name="loghub_dot", target=self.loghub_dot.build_run).start()
|
211
213
|
for index in range(self.thread_num):
|
212
214
|
threading.Thread(name=f"spider_{index}", target=self.spider).start()
|
213
215
|
|
cobweb/utils/dotting.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
import json
|
2
|
+
import time
|
2
3
|
|
3
4
|
from aliyun.log import LogClient, LogItem, PutLogsRequest
|
5
|
+
|
6
|
+
from base import Queue, logger
|
4
7
|
from cobweb import setting
|
5
8
|
|
6
9
|
|
@@ -8,11 +11,11 @@ class LoghubDot:
|
|
8
11
|
|
9
12
|
def __init__(self):
|
10
13
|
self.client = LogClient(**setting.LOGHUB_CONFIG)
|
14
|
+
self.queue = Queue()
|
11
15
|
|
12
16
|
def build(self, topic, **kwargs):
|
13
17
|
|
14
18
|
temp = {}
|
15
|
-
log_items = []
|
16
19
|
log_item = LogItem()
|
17
20
|
for key, value in kwargs.items():
|
18
21
|
if not isinstance(value, str):
|
@@ -21,12 +24,37 @@ class LoghubDot:
|
|
21
24
|
temp[key] = value
|
22
25
|
contents = sorted(temp.items())
|
23
26
|
log_item.set_contents(contents)
|
24
|
-
log_items.append(log_item)
|
25
|
-
request = PutLogsRequest(
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
)
|
32
|
-
self.
|
27
|
+
# log_items.append(log_item)
|
28
|
+
# request = PutLogsRequest(
|
29
|
+
# project="databee-download-log",
|
30
|
+
# logstore="log",
|
31
|
+
# topic=topic,
|
32
|
+
# logitems=log_items,
|
33
|
+
# compress=True
|
34
|
+
# )
|
35
|
+
self.queue.push((topic, log_item), direct_insertion=True)
|
36
|
+
# self.client.put_logs(request=request)
|
37
|
+
|
38
|
+
def build_run(self):
|
39
|
+
while True:
|
40
|
+
if self.queue.length < 1000:
|
41
|
+
time.sleep(0.5)
|
42
|
+
continue
|
43
|
+
try:
|
44
|
+
log_item_info = {}
|
45
|
+
for _ in range(1000):
|
46
|
+
topic, item = self.queue.pop()
|
47
|
+
if not item:
|
48
|
+
break
|
49
|
+
log_item_info.setdefault(topic, []).append(item)
|
50
|
+
for topic, log_items in log_item_info.items():
|
51
|
+
request = PutLogsRequest(
|
52
|
+
project="databee-download-log",
|
53
|
+
logstore="log",
|
54
|
+
topic=topic,
|
55
|
+
logitems=log_items,
|
56
|
+
compress=True
|
57
|
+
)
|
58
|
+
self.client.put_logs(request=request)
|
59
|
+
except Exception as e:
|
60
|
+
logger.info(str(e))
|
@@ -13,7 +13,7 @@ cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
|
|
13
13
|
cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
|
14
14
|
cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
|
15
15
|
cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
|
16
|
-
cobweb/crawlers/crawler.py,sha256=
|
16
|
+
cobweb/crawlers/crawler.py,sha256=s5kImH3lzkyRm2AQoH1fwLaIO_CPZlPhqiBknPNaglM,8676
|
17
17
|
cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
|
18
18
|
cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
|
19
19
|
cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
|
@@ -34,11 +34,11 @@ cobweb/schedulers/scheduler_api.py,sha256=pFEdS1H4zuzxwMhCV-G7CoLz-rEOPv4EVo3xZU
|
|
34
34
|
cobweb/schedulers/scheduler_redis.py,sha256=E5fjc3nNld8GbUhUGT7uY4smRejj2J2ZIzp2g6lhxFM,2205
|
35
35
|
cobweb/utils/__init__.py,sha256=Ev2LZZ1-S56iQYDqFZrqadizEv4Gk8Of-DraH-_WnKY,109
|
36
36
|
cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
37
|
-
cobweb/utils/dotting.py,sha256=
|
37
|
+
cobweb/utils/dotting.py,sha256=vxK44tq_eD1uIYrkMzesUF43ZgFiu1zay3WZGTSBSx4,1898
|
38
38
|
cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
|
39
39
|
cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
|
40
|
-
cobweb_launcher-1.2.
|
41
|
-
cobweb_launcher-1.2.
|
42
|
-
cobweb_launcher-1.2.
|
43
|
-
cobweb_launcher-1.2.
|
44
|
-
cobweb_launcher-1.2.
|
40
|
+
cobweb_launcher-1.2.45.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
41
|
+
cobweb_launcher-1.2.45.dist-info/METADATA,sha256=T46vEQNn9zyk9trC1O10bT5YBi7rNOh7M11ErhYsJ_0,6510
|
42
|
+
cobweb_launcher-1.2.45.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
43
|
+
cobweb_launcher-1.2.45.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
44
|
+
cobweb_launcher-1.2.45.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|