cobweb-launcher 1.2.44__py3-none-any.whl → 1.2.46__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- cobweb/crawlers/crawler.py +7 -5
- cobweb/utils/dotting.py +38 -10
- {cobweb_launcher-1.2.44.dist-info → cobweb_launcher-1.2.46.dist-info}/METADATA +1 -1
- {cobweb_launcher-1.2.44.dist-info → cobweb_launcher-1.2.46.dist-info}/RECORD +7 -7
- {cobweb_launcher-1.2.44.dist-info → cobweb_launcher-1.2.46.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.44.dist-info → cobweb_launcher-1.2.46.dist-info}/WHEEL +0 -0
- {cobweb_launcher-1.2.44.dist-info → cobweb_launcher-1.2.46.dist-info}/top_level.txt +0 -0
cobweb/crawlers/crawler.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
import json
|
2
|
+
import os
|
2
3
|
import threading
|
3
4
|
import time
|
4
5
|
import traceback
|
@@ -6,9 +7,7 @@ from inspect import isgenerator
|
|
6
7
|
from typing import Union, Callable, Mapping
|
7
8
|
from urllib.parse import urlparse
|
8
9
|
|
9
|
-
import
|
10
|
-
from requests import HTTPError, Response as Res
|
11
|
-
from requests.exceptions import ChunkedEncodingError
|
10
|
+
from requests import Response as Res
|
12
11
|
|
13
12
|
from cobweb.constant import DealModel, LogTemplate
|
14
13
|
from cobweb.base import (
|
@@ -20,6 +19,7 @@ from cobweb.base import (
|
|
20
19
|
logger
|
21
20
|
)
|
22
21
|
from cobweb.utils import LoghubDot
|
22
|
+
proxy_type = os.getenv("PROXY_TYPE", "")
|
23
23
|
|
24
24
|
|
25
25
|
class Crawler(threading.Thread):
|
@@ -149,7 +149,7 @@ class Crawler(threading.Thread):
|
|
149
149
|
topic=urlparse(download_item.response.request.url).netloc,
|
150
150
|
data_size=int(download_item.response.headers.get("content-length", 0)),
|
151
151
|
cost_time=end_time - start_time, status = 200,
|
152
|
-
url=download_item.response.url,
|
152
|
+
url=download_item.response.url, proxy_type=proxy_type,
|
153
153
|
)
|
154
154
|
parse_iterators = self.parse(download_item)
|
155
155
|
if not isgenerator(parse_iterators):
|
@@ -177,7 +177,8 @@ class Crawler(threading.Thread):
|
|
177
177
|
topic=urlparse(url).netloc,
|
178
178
|
data_size=-1, cost_time=-1,
|
179
179
|
status=status, url=url,
|
180
|
-
|
180
|
+
proxy_type=proxy_type,
|
181
|
+
msg=exception_msg,
|
181
182
|
)
|
182
183
|
logger.info(LogTemplate.download_exception.format(
|
183
184
|
detail=seed_detail_log_info,
|
@@ -208,6 +209,7 @@ class Crawler(threading.Thread):
|
|
208
209
|
logger.info("spider thread close")
|
209
210
|
|
210
211
|
def run(self):
|
212
|
+
threading.Thread(name="loghub_dot", target=self.loghub_dot.build_run).start()
|
211
213
|
for index in range(self.thread_num):
|
212
214
|
threading.Thread(name=f"spider_{index}", target=self.spider).start()
|
213
215
|
|
cobweb/utils/dotting.py
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
import json
|
2
|
+
import time
|
2
3
|
|
3
4
|
from aliyun.log import LogClient, LogItem, PutLogsRequest
|
5
|
+
|
6
|
+
from cobweb.base import Queue, logger
|
4
7
|
from cobweb import setting
|
5
8
|
|
6
9
|
|
@@ -8,11 +11,11 @@ class LoghubDot:
|
|
8
11
|
|
9
12
|
def __init__(self):
|
10
13
|
self.client = LogClient(**setting.LOGHUB_CONFIG)
|
14
|
+
self.queue = Queue()
|
11
15
|
|
12
16
|
def build(self, topic, **kwargs):
|
13
17
|
|
14
18
|
temp = {}
|
15
|
-
log_items = []
|
16
19
|
log_item = LogItem()
|
17
20
|
for key, value in kwargs.items():
|
18
21
|
if not isinstance(value, str):
|
@@ -21,12 +24,37 @@ class LoghubDot:
|
|
21
24
|
temp[key] = value
|
22
25
|
contents = sorted(temp.items())
|
23
26
|
log_item.set_contents(contents)
|
24
|
-
log_items.append(log_item)
|
25
|
-
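request = PutLogsRequest(
|
26
|
-
project="databee-download-log",
|
27
|
-
logstore="log",
|
28
|
-
topic=topic,
|
29
|
-
logitems=log_items,
|
30
|
-
compress=True
|
31
|
-
)
|
32
|
-
self.client.put_logs(request=request)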
|
27
|
+
# log_items.append(log_item)
|
28
|
+
# request = PutLogsRequest(
|
29
|
+
# project="databee-download-log",
|
30
|
+
# logstore="log",
|
31
|
+
# topic=topic,
|
32
|
+
# logitems=log_items,
|
33
|
+
# compress=True
|
34
|
+
# )
|
35
|
+
self.queue.push((topic, log_item), direct_insertion=True)
|
36
|
+
# self.client.put_logs(request=request)
|
37
|
+
|
38
|
+
def build_run(self):
|
39
|
+
while True:
|
40
|
+
if self.queue.length < 1000:
|
41
|
+
time.sleep(0.5)
|
42
|
+
continue
|
43
|
+
try:
|
44
|
+
log_item_info = {}
|
45
|
+
for _ in range(1000):
|
46
|
+
topic, item = self.queue.pop()
|
47
|
+
if not item:
|
48
|
+
break
|
49
|
+
log_item_info.setdefault(topic, []).append(item)
|
50
|
+
for topic, log_items in log_item_info.items():
|
51
|
+
request = PutLogsRequest(
|
52
|
+
project="databee-download-log",
|
53
|
+
logstore="log",
|
54
|
+
topic=topic,
|
55
|
+
logitems=log_items,
|
56
|
+
compress=True
|
57
|
+
)
|
58
|
+
self.client.put_logs(request=request)
|
59
|
+
except Exception as e:
|
60
|
+
logger.info(str(e))
|
@@ -13,7 +13,7 @@ cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
|
|
13
13
|
cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
|
14
14
|
cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
|
15
15
|
cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
|
16
|
-
cobweb/crawlers/crawler.py,sha256=
|
16
|
+
cobweb/crawlers/crawler.py,sha256=s5kImH3lzkyRm2AQoH1fwLaIO_CPZlPhqiBknPNaglM,8676
|
17
17
|
cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
|
18
18
|
cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
|
19
19
|
cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
|
@@ -34,11 +34,11 @@ cobweb/schedulers/scheduler_api.py,sha256=pFEdS1H4zuzxwMhCV-G7CoLz-rEOPv4EVo3xZU
|
|
34
34
|
cobweb/schedulers/scheduler_redis.py,sha256=E5fjc3nNld8GbUhUGT7uY4smRejj2J2ZIzp2g6lhxFM,2205
|
35
35
|
cobweb/utils/__init__.py,sha256=Ev2LZZ1-S56iQYDqFZrqadizEv4Gk8Of-DraH-_WnKY,109
|
36
36
|
cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
37
|
-
cobweb/utils/dotting.py,sha256=
|
37
|
+
cobweb/utils/dotting.py,sha256=cTmhd8e8vSMqn3SGS8HcSa6cSourysaphJZYhWdpSoY,1905
|
38
38
|
cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
|
39
39
|
cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
|
40
|
-
cobweb_launcher-1.2.
|
41
|
-
cobweb_launcher-1.2.
|
42
|
-
cobweb_launcher-1.2.
|
43
|
-
cobweb_launcher-1.2.
|
44
|
-
cobweb_launcher-1.2.
|
40
|
+
cobweb_launcher-1.2.46.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
41
|
+
cobweb_launcher-1.2.46.dist-info/METADATA,sha256=wQVxJaBKnlIwb-3Pi0RFjubW-5xLvoRLXdSFYnXQmzM,6510
|
42
|
+
cobweb_launcher-1.2.46.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
43
|
+
cobweb_launcher-1.2.46.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
|
44
|
+
cobweb_launcher-1.2.46.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|