cobweb-launcher 1.2.44__tar.gz → 1.2.45__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {cobweb-launcher-1.2.44/cobweb_launcher.egg-info → cobweb-launcher-1.2.45}/PKG-INFO +1 -1
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/crawlers/crawler.py +7 -5
- cobweb-launcher-1.2.45/cobweb/utils/dotting.py +60 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45/cobweb_launcher.egg-info}/PKG-INFO +1 -1
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/setup.py +1 -1
- cobweb-launcher-1.2.44/cobweb/utils/dotting.py +0 -32
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/LICENSE +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/README.md +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/__init__.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/base/__init__.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/base/common_queue.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/base/decorators.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/base/item.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/base/log.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/base/request.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/base/response.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/base/seed.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/constant.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/crawlers/__init__.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/crawlers/base_crawler.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/crawlers/file_crawler.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/db/api_db.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/db/redis_db.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/exceptions/__init__.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/exceptions/oss_db_exception.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/launchers/__init__.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/launchers/launcher.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/launchers/launcher_air.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/launchers/launcher_api.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/launchers/launcher_pro.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/pipelines/__init__.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/pipelines/pipeline.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/pipelines/pipeline_console.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/pipelines/pipeline_loghub.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/setting.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/utils/__init__.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/utils/bloom.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/utils/oss.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb/utils/tools.py +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/setup.cfg +0 -0
- {cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/test/test.py +0 -0
@@ -1,4 +1,5 @@
|
|
1
1
|
import json
|
2
|
+
import os
|
2
3
|
import threading
|
3
4
|
import time
|
4
5
|
import traceback
|
@@ -6,9 +7,7 @@ from inspect import isgenerator
|
|
6
7
|
from typing import Union, Callable, Mapping
|
7
8
|
from urllib.parse import urlparse
|
8
9
|
|
9
|
-
import
|
10
|
-
from requests import HTTPError, Response as Res
|
11
|
-
from requests.exceptions import ChunkedEncodingError
|
10
|
+
from requests import Response as Res
|
12
11
|
|
13
12
|
from cobweb.constant import DealModel, LogTemplate
|
14
13
|
from cobweb.base import (
|
@@ -20,6 +19,7 @@ from cobweb.base import (
|
|
20
19
|
logger
|
21
20
|
)
|
22
21
|
from cobweb.utils import LoghubDot
|
22
|
+
proxy_type = os.getenv("PROXY_TYPE", "")
|
23
23
|
|
24
24
|
|
25
25
|
class Crawler(threading.Thread):
|
@@ -149,7 +149,7 @@ class Crawler(threading.Thread):
|
|
149
149
|
topic=urlparse(download_item.response.request.url).netloc,
|
150
150
|
data_size=int(download_item.response.headers.get("content-length", 0)),
|
151
151
|
cost_time=end_time - start_time, status = 200,
|
152
|
-
url=download_item.response.url,
|
152
|
+
url=download_item.response.url, proxy_type=proxy_type,
|
153
153
|
)
|
154
154
|
parse_iterators = self.parse(download_item)
|
155
155
|
if not isgenerator(parse_iterators):
|
@@ -177,7 +177,8 @@ class Crawler(threading.Thread):
|
|
177
177
|
topic=urlparse(url).netloc,
|
178
178
|
data_size=-1, cost_time=-1,
|
179
179
|
status=status, url=url,
|
180
|
-
|
180
|
+
proxy_type=proxy_type,
|
181
|
+
msg=exception_msg,
|
181
182
|
)
|
182
183
|
logger.info(LogTemplate.download_exception.format(
|
183
184
|
detail=seed_detail_log_info,
|
@@ -208,6 +209,7 @@ class Crawler(threading.Thread):
|
|
208
209
|
logger.info("spider thread close")
|
209
210
|
|
210
211
|
def run(self):
|
212
|
+
threading.Thread(name="loghub_dot", target=self.loghub_dot.build_run).start()
|
211
213
|
for index in range(self.thread_num):
|
212
214
|
threading.Thread(name=f"spider_{index}", target=self.spider).start()
|
213
215
|
|
@@ -0,0 +1,60 @@
|
|
1
|
+
import json
|
2
|
+
import time
|
3
|
+
|
4
|
+
from aliyun.log import LogClient, LogItem, PutLogsRequest
|
5
|
+
|
6
|
+
from base import Queue, logger
|
7
|
+
from cobweb import setting
|
8
|
+
|
9
|
+
|
10
|
+
class LoghubDot:
|
11
|
+
|
12
|
+
def __init__(self):
|
13
|
+
self.client = LogClient(**setting.LOGHUB_CONFIG)
|
14
|
+
self.queue = Queue()
|
15
|
+
|
16
|
+
def build(self, topic, **kwargs):
|
17
|
+
|
18
|
+
temp = {}
|
19
|
+
log_item = LogItem()
|
20
|
+
for key, value in kwargs.items():
|
21
|
+
if not isinstance(value, str):
|
22
|
+
temp[key] = json.dumps(value, ensure_ascii=False)
|
23
|
+
else:
|
24
|
+
temp[key] = value
|
25
|
+
contents = sorted(temp.items())
|
26
|
+
log_item.set_contents(contents)
|
27
|
+
# log_items.append(log_item)
|
28
|
+
# request = PutLogsRequest(
|
29
|
+
# project="databee-download-log",
|
30
|
+
# logstore="log",
|
31
|
+
# topic=topic,
|
32
|
+
# logitems=log_items,
|
33
|
+
# compress=True
|
34
|
+
# )
|
35
|
+
self.queue.push((topic, log_item), direct_insertion=True)
|
36
|
+
# self.client.put_logs(request=request)
|
37
|
+
|
38
|
+
def build_run(self):
|
39
|
+
while True:
|
40
|
+
if self.queue.length < 1000:
|
41
|
+
time.sleep(0.5)
|
42
|
+
continue
|
43
|
+
try:
|
44
|
+
log_item_info = {}
|
45
|
+
for _ in range(1000):
|
46
|
+
topic, item = self.queue.pop()
|
47
|
+
if not item:
|
48
|
+
break
|
49
|
+
log_item_info.setdefault(topic, []).append(item)
|
50
|
+
for topic, log_items in log_item_info.items():
|
51
|
+
request = PutLogsRequest(
|
52
|
+
project="databee-download-log",
|
53
|
+
logstore="log",
|
54
|
+
topic=topic,
|
55
|
+
logitems=log_items,
|
56
|
+
compress=True
|
57
|
+
)
|
58
|
+
self.client.put_logs(request=request)
|
59
|
+
except Exception as e:
|
60
|
+
logger.info(str(e))
|
@@ -1,32 +0,0 @@
|
|
1
|
-
import json
|
2
|
-
|
3
|
-
from aliyun.log import LogClient, LogItem, PutLogsRequest
|
4
|
-
from cobweb import setting
|
5
|
-
|
6
|
-
|
7
|
-
class LoghubDot:
|
8
|
-
|
9
|
-
def __init__(self):
|
10
|
-
self.client = LogClient(**setting.LOGHUB_CONFIG)
|
11
|
-
|
12
|
-
def build(self, topic, **kwargs):
|
13
|
-
|
14
|
-
temp = {}
|
15
|
-
log_items = []
|
16
|
-
log_item = LogItem()
|
17
|
-
for key, value in kwargs.items():
|
18
|
-
if not isinstance(value, str):
|
19
|
-
temp[key] = json.dumps(value, ensure_ascii=False)
|
20
|
-
else:
|
21
|
-
temp[key] = value
|
22
|
-
contents = sorted(temp.items())
|
23
|
-
log_item.set_contents(contents)
|
24
|
-
log_items.append(log_item)
|
25
|
-
request = PutLogsRequest(
|
26
|
-
project="databee-download-log",
|
27
|
-
logstore="log",
|
28
|
-
topic=topic,
|
29
|
-
logitems=log_items,
|
30
|
-
compress=True
|
31
|
-
)
|
32
|
-
self.client.put_logs(request=request)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{cobweb-launcher-1.2.44 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/dependency_links.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|