cobweb-launcher 1.2.24__py3-none-any.whl → 1.2.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of cobweb-launcher might be problematic. Click here for more details.
- cobweb/crawlers/crawler.py +17 -1
- cobweb/launchers/launcher.py +3 -0
- cobweb/launchers/launcher_pro.py +3 -3
- cobweb/utils/__init__.py +1 -0
- cobweb/utils/dotting.py +28 -0
- {cobweb_launcher-1.2.24.dist-info → cobweb_launcher-1.2.26.dist-info}/METADATA +1 -1
- cobweb_launcher-1.2.26.dist-info/RECORD +74 -0
- cobweb_launcher-1.2.26.dist-info/top_level.txt +2 -0
- cobweb_new/__init__.py +2 -0
- cobweb_new/base/__init__.py +72 -0
- cobweb_new/base/common_queue.py +53 -0
- cobweb_new/base/decorators.py +72 -0
- cobweb_new/base/item.py +46 -0
- cobweb_new/base/log.py +94 -0
- cobweb_new/base/request.py +82 -0
- cobweb_new/base/response.py +23 -0
- cobweb_new/base/seed.py +118 -0
- cobweb_new/constant.py +105 -0
- cobweb_new/crawlers/__init__.py +1 -0
- cobweb_new/crawlers/crawler-new.py +85 -0
- cobweb_new/crawlers/crawler.py +170 -0
- cobweb_new/db/__init__.py +2 -0
- cobweb_new/db/api_db.py +82 -0
- cobweb_new/db/redis_db.py +158 -0
- cobweb_new/exceptions/__init__.py +1 -0
- cobweb_new/exceptions/oss_db_exception.py +28 -0
- cobweb_new/launchers/__init__.py +3 -0
- cobweb_new/launchers/launcher.py +237 -0
- cobweb_new/launchers/launcher_air.py +88 -0
- cobweb_new/launchers/launcher_api.py +161 -0
- cobweb_new/launchers/launcher_pro.py +96 -0
- cobweb_new/launchers/tesss.py +47 -0
- cobweb_new/pipelines/__init__.py +3 -0
- cobweb_new/pipelines/pipeline.py +68 -0
- cobweb_new/pipelines/pipeline_console.py +22 -0
- cobweb_new/pipelines/pipeline_loghub.py +34 -0
- cobweb_new/setting.py +95 -0
- cobweb_new/utils/__init__.py +5 -0
- cobweb_new/utils/bloom.py +58 -0
- cobweb_new/utils/oss.py +94 -0
- cobweb_new/utils/tools.py +42 -0
- cobweb_launcher-1.2.24.dist-info/RECORD +0 -40
- cobweb_launcher-1.2.24.dist-info/top_level.txt +0 -1
- {cobweb_launcher-1.2.24.dist-info → cobweb_launcher-1.2.26.dist-info}/LICENSE +0 -0
- {cobweb_launcher-1.2.24.dist-info → cobweb_launcher-1.2.26.dist-info}/WHEEL +0 -0
cobweb/crawlers/crawler.py
CHANGED
|
@@ -5,6 +5,9 @@ import traceback
|
|
|
5
5
|
from inspect import isgenerator
|
|
6
6
|
from typing import Union, Callable, Mapping
|
|
7
7
|
|
|
8
|
+
from docutils.nodes import topic
|
|
9
|
+
from nltk.sem.chat80 import items
|
|
10
|
+
|
|
8
11
|
from cobweb.constant import DealModel, LogTemplate
|
|
9
12
|
from cobweb.base import (
|
|
10
13
|
Queue,
|
|
@@ -15,12 +18,15 @@ from cobweb.base import (
|
|
|
15
18
|
ConsoleItem,
|
|
16
19
|
logger
|
|
17
20
|
)
|
|
21
|
+
from cobweb.utils import LoghubDot
|
|
18
22
|
|
|
19
23
|
|
|
20
24
|
class Crawler(threading.Thread):
|
|
21
25
|
|
|
22
26
|
def __init__(
|
|
23
27
|
self,
|
|
28
|
+
task: str,
|
|
29
|
+
project: str,
|
|
24
30
|
stop: threading.Event,
|
|
25
31
|
pause: threading.Event,
|
|
26
32
|
# launcher_queue: Union[Mapping[str, Queue]],
|
|
@@ -35,7 +41,8 @@ class Crawler(threading.Thread):
|
|
|
35
41
|
time_sleep: int,
|
|
36
42
|
):
|
|
37
43
|
super().__init__()
|
|
38
|
-
|
|
44
|
+
self.task = task
|
|
45
|
+
self.project = project
|
|
39
46
|
self._stop = stop
|
|
40
47
|
self._pause = pause
|
|
41
48
|
self._get_seed = get_seed
|
|
@@ -52,6 +59,8 @@ class Crawler(threading.Thread):
|
|
|
52
59
|
self.time_sleep = time_sleep
|
|
53
60
|
self.max_retries = max_retries
|
|
54
61
|
|
|
62
|
+
self.loghub_dot = LoghubDot()
|
|
63
|
+
|
|
55
64
|
@staticmethod
|
|
56
65
|
def request(seed: Seed) -> Union[Request, BaseItem]:
|
|
57
66
|
yield Request(seed.url, seed, timeout=5)
|
|
@@ -115,6 +124,7 @@ class Crawler(threading.Thread):
|
|
|
115
124
|
|
|
116
125
|
if isinstance(request_item, Request):
|
|
117
126
|
iterator_status = False
|
|
127
|
+
start_time = time.time()
|
|
118
128
|
download_iterators = self.download(request_item)
|
|
119
129
|
if not isgenerator(download_iterators):
|
|
120
130
|
raise TypeError("download function isn't a generator")
|
|
@@ -132,6 +142,12 @@ class Crawler(threading.Thread):
|
|
|
132
142
|
status=download_item.response,
|
|
133
143
|
response=LogTemplate.log_info(download_item.to_dict)
|
|
134
144
|
))
|
|
145
|
+
if isinstance(download_item, Response):
|
|
146
|
+
end_time = time.time()
|
|
147
|
+
self.loghub_dot.build(topic=f"{self.project}:{self.task}", data={
|
|
148
|
+
"cost_time": end_time - start_time,
|
|
149
|
+
**download_item.to_dict
|
|
150
|
+
})
|
|
135
151
|
parse_iterators = self.parse(download_item)
|
|
136
152
|
if not isgenerator(parse_iterators):
|
|
137
153
|
raise TypeError("parse function isn't a generator")
|
cobweb/launchers/launcher.py
CHANGED
|
@@ -4,6 +4,8 @@ import threading
|
|
|
4
4
|
import importlib
|
|
5
5
|
from functools import wraps
|
|
6
6
|
|
|
7
|
+
from mypyc.doc.conf import project
|
|
8
|
+
|
|
7
9
|
from cobweb import setting
|
|
8
10
|
from cobweb.base import Seed, Queue, logger
|
|
9
11
|
from cobweb.utils.tools import dynamic_load_class
|
|
@@ -186,6 +188,7 @@ class Launcher(threading.Thread):
|
|
|
186
188
|
self.start_seeds()
|
|
187
189
|
|
|
188
190
|
self._Crawler(
|
|
191
|
+
task=self.task, project=self.project,
|
|
189
192
|
stop=self._stop, pause=self._pause,
|
|
190
193
|
# launcher_queue=self.__LAUNCHER_QUEUE__,
|
|
191
194
|
get_seed=self._get_seed,
|
cobweb/launchers/launcher_pro.py
CHANGED
|
@@ -151,9 +151,9 @@ class LauncherPro(Launcher):
|
|
|
151
151
|
|
|
152
152
|
if seed_info["count"]:
|
|
153
153
|
|
|
154
|
-
succeed_count = self._db.zrem(self._todo_key, *seed_info["common"])
|
|
155
|
-
succeed_count += self._db.done([self._todo_key, self._done_key], *seed_info["succeed"])
|
|
156
|
-
failed_count = self._db.done([self._todo_key, self._fail_key], *seed_info["failed"])
|
|
154
|
+
succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
|
|
155
|
+
succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
|
|
156
|
+
failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)
|
|
157
157
|
|
|
158
158
|
if failed_count:
|
|
159
159
|
self.statistics(self._statistics_fail_key, failed_count)
|
cobweb/utils/__init__.py
CHANGED
cobweb/utils/dotting.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from aliyun.log import LogClient, PutLogsRequest, LogItem
|
|
4
|
+
|
|
5
|
+
import setting
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class LoghubDot:
    """Push instrumentation ("dot") records to an Aliyun Log Service logstore.

    The client is built from ``setting.LOGHUB_CONFIG`` and every record is
    written to the ``cobweb_log`` logstore of ``setting.LOGHUB_PROJECT``.
    """

    def __init__(self, *args, **kwargs):
        # Arguments are forwarded unchanged to the next class in the MRO.
        super().__init__(*args, **kwargs)
        self.client = LogClient(**setting.LOGHUB_CONFIG)

    def build(self, topic, data):
        """Send one log record.

        :param topic: topic string attached to the put-logs request.
        :param data: mapping of field name to value. Non-string values are
            serialized with ``json.dumps(..., ensure_ascii=False)``.
        """
        # Build a fresh mapping so the caller's dict is not rewritten in place.
        fields = {
            key: value if isinstance(value, str)
            else json.dumps(value, ensure_ascii=False)
            for key, value in data.items()
        }
        log_item = LogItem()
        log_item.set_contents(sorted(fields.items()))
        request = PutLogsRequest(
            project=setting.LOGHUB_PROJECT,
            logstore="cobweb_log",
            topic=topic,
            # The request expects LogItem objects; the original passed the raw
            # (key, value) list and never used the populated LogItem.
            logitems=[log_item],
            compress=True,
        )
        self.client.put_logs(request=request)
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
cobweb/__init__.py,sha256=CBd2oByCfc5EmH2dCZYVHkxXYZG-oWrLyTtZU5sEoP0,96
|
|
2
|
+
cobweb/constant.py,sha256=zy3XYsc1qp2B76_Fn_hVQ8eGHlPBd3OFlZK2cryE6FY,2839
|
|
3
|
+
cobweb/setting.py,sha256=47HZsw40HLpsmOmvij1lyQALPQQCN_tWlKZ0wbn2MtM,2216
|
|
4
|
+
cobweb/base/__init__.py,sha256=4gwWWQ0Q8cYG9cD7Lwf4XMqRGc5M_mapS3IczR6zeCE,222
|
|
5
|
+
cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
|
|
6
|
+
cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
|
|
7
|
+
cobweb/base/item.py,sha256=hYheVTV2Bozp4iciJpE2ZwBIXkaqBg4QQkRccP8yoVk,1049
|
|
8
|
+
cobweb/base/log.py,sha256=L01hXdk3L2qEm9X1FOXQ9VmWIoHSELe0cyZvrdAN61A,2003
|
|
9
|
+
cobweb/base/request.py,sha256=tEkgMVUfdQI-kZuzWuiit9P_q4Q9-_RZh9aXXpc0314,2352
|
|
10
|
+
cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
|
|
11
|
+
cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
|
|
12
|
+
cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
|
|
13
|
+
cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
|
|
14
|
+
cobweb/crawlers/crawler.py,sha256=dskqhMOxvrZskHZHCMyS9dnh2X-qrzve7JmZ_d5bz3s,7056
|
|
15
|
+
cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
|
|
16
|
+
cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
|
|
17
|
+
cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
|
|
18
|
+
cobweb/db/redis_db.py,sha256=fumNZJiio-uQqRcSrymx8eJ1PqsdOwITe_Y-9JOXxrQ,4298
|
|
19
|
+
cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
|
|
20
|
+
cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
|
|
21
|
+
cobweb/launchers/__init__.py,sha256=qMuVlQcjErVK67HyKFZEsXf_rfZD5ODjx1QucSCKMOM,114
|
|
22
|
+
cobweb/launchers/launcher.py,sha256=bXPPS6wx3Si05f2arf_9S4XqE3HFrDA-lhAX7tLZEqo,7064
|
|
23
|
+
cobweb/launchers/launcher_air.py,sha256=KAk_M8F3029cXYe7m4nn3Nzyi89lbxJ2cqZjqW8iZ0E,2832
|
|
24
|
+
cobweb/launchers/launcher_api.py,sha256=Ih8f5xDcFlGBn6VSnlrpxcchMB48ugsj2NTWYgGYWfY,8669
|
|
25
|
+
cobweb/launchers/launcher_pro.py,sha256=NBJstQuB0o_jMiySJ14lk0Y3WAxxiScaQvXa1qtTSo4,8683
|
|
26
|
+
cobweb/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
|
|
27
|
+
cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zLd9CY,1577
|
|
28
|
+
cobweb/pipelines/loghub_pipeline.py,sha256=cjPO6w6UJ0jNw2fVvdX0BCdlm58T7dmYXlxzXOBpvfY,1027
|
|
29
|
+
cobweb/pipelines/pipeline.py,sha256=4TJLX0sUHRxYndF5A4Vs5btUGI-wigkOcFvhTW1hLXI,2009
|
|
30
|
+
cobweb/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
|
|
31
|
+
cobweb/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
|
|
32
|
+
cobweb/utils/__init__.py,sha256=Ev2LZZ1-S56iQYDqFZrqadizEv4Gk8Of-DraH-_WnKY,109
|
|
33
|
+
cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
|
34
|
+
cobweb/utils/dotting.py,sha256=0yo0KjjGAU2fb3VLCGKhHwdOzW4QtZ4SDqUcX13tetI,791
|
|
35
|
+
cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
|
|
36
|
+
cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
|
|
37
|
+
cobweb_new/__init__.py,sha256=CBd2oByCfc5EmH2dCZYVHkxXYZG-oWrLyTtZU5sEoP0,96
|
|
38
|
+
cobweb_new/constant.py,sha256=wy2bLpGZrl1MtgVv-Z1Tmtj5uWh-KGzDrrGKdVStxV4,3236
|
|
39
|
+
cobweb_new/setting.py,sha256=Ya3X4HbvDfSmMF2kSJwaaP1naxrWETTFW88T11agP7k,3035
|
|
40
|
+
cobweb_new/base/__init__.py,sha256=L74KN3qZn6s33EXyxQ_vB3FF8mA7pZJd_ekkWiUKd5Y,2229
|
|
41
|
+
cobweb_new/base/common_queue.py,sha256=Po6yY8HqpC6Wt6csd3Co3lBd7ygN2vmEECczgyc_sM8,1292
|
|
42
|
+
cobweb_new/base/decorators.py,sha256=8VDpANSIhxhrFnwgQzAxM_8ZyDXKdn3zTH0oZIXqRPE,1801
|
|
43
|
+
cobweb_new/base/item.py,sha256=hYheVTV2Bozp4iciJpE2ZwBIXkaqBg4QQkRccP8yoVk,1049
|
|
44
|
+
cobweb_new/base/log.py,sha256=L01hXdk3L2qEm9X1FOXQ9VmWIoHSELe0cyZvrdAN61A,2003
|
|
45
|
+
cobweb_new/base/request.py,sha256=tEkgMVUfdQI-kZuzWuiit9P_q4Q9-_RZh9aXXpc0314,2352
|
|
46
|
+
cobweb_new/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
|
|
47
|
+
cobweb_new/base/seed.py,sha256=KBVxVU4jMB6oiw8HPtu-nDUVUZ6jiTjzR917jTYGCZs,2977
|
|
48
|
+
cobweb_new/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
|
|
49
|
+
cobweb_new/crawlers/crawler-new.py,sha256=TAYMH2E3BTkjU6bFLlIMVfsR3cV2ggjA0moUpaXOe1Y,2762
|
|
50
|
+
cobweb_new/crawlers/crawler.py,sha256=xiFNM0t69f5xlm59hPbO2MpqtdirVAUhD84-CLpyHPM,6349
|
|
51
|
+
cobweb_new/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
|
|
52
|
+
cobweb_new/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
|
|
53
|
+
cobweb_new/db/redis_db.py,sha256=FvMzckJtmhwKhZqKoS23iXmJti5P2dnMVD5rJ__5LUw,5139
|
|
54
|
+
cobweb_new/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
|
|
55
|
+
cobweb_new/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
|
|
56
|
+
cobweb_new/launchers/__init__.py,sha256=qMuVlQcjErVK67HyKFZEsXf_rfZD5ODjx1QucSCKMOM,114
|
|
57
|
+
cobweb_new/launchers/launcher.py,sha256=87P_2rRjzqyQXcG_EJ5Y6lMAk7saM8k1WBJcl9ANX6k,8309
|
|
58
|
+
cobweb_new/launchers/launcher_air.py,sha256=KAk_M8F3029cXYe7m4nn3Nzyi89lbxJ2cqZjqW8iZ0E,2832
|
|
59
|
+
cobweb_new/launchers/launcher_api.py,sha256=qPazoC7U-UmgebbiTkhl6f4yQmN34XMl6HawekhAhEo,5789
|
|
60
|
+
cobweb_new/launchers/launcher_pro.py,sha256=QLjAiN8qMk4NklSY7ldBAR5OEEUB8sECuCCwRrFEC68,3414
|
|
61
|
+
cobweb_new/launchers/tesss.py,sha256=pDe0wwhXbdjjmtfc7JLPfVOvs9yuc7Y8wLT1b1ueeEs,912
|
|
62
|
+
cobweb_new/pipelines/__init__.py,sha256=zSUsGtx6smbs2iXBXvYynReKSgky-3gjqaAtKVnA_OU,105
|
|
63
|
+
cobweb_new/pipelines/pipeline.py,sha256=3IRHHqrHblZ_18Cps2bGK6iugDjs-dde7p3AbarfiN8,1958
|
|
64
|
+
cobweb_new/pipelines/pipeline_console.py,sha256=NEh-4zhuVAQOqwXLsqeb-rcNZ9_KXFUpL3otUTL5qBs,754
|
|
65
|
+
cobweb_new/pipelines/pipeline_loghub.py,sha256=xZ6D55BGdiM71WUv83jyLGbEyUwhBHLJRZoXthBxxTs,1019
|
|
66
|
+
cobweb_new/utils/__init__.py,sha256=c9macpjc15hrCUCdzO5RR_sgK_B9kvJKreSGprZ1ld4,112
|
|
67
|
+
cobweb_new/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
|
|
68
|
+
cobweb_new/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
|
|
69
|
+
cobweb_new/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
|
|
70
|
+
cobweb_launcher-1.2.26.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
|
|
71
|
+
cobweb_launcher-1.2.26.dist-info/METADATA,sha256=UHU3dWAttQMpdYzVGmKXwlENAldvftldmp2v7TSBfxE,6510
|
|
72
|
+
cobweb_launcher-1.2.26.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
|
|
73
|
+
cobweb_launcher-1.2.26.dist-info/top_level.txt,sha256=UKwd478nkspJ6_fos3jtuc7SKXfmU42bJa_Ek5n2zsE,18
|
|
74
|
+
cobweb_launcher-1.2.26.dist-info/RECORD,,
|
cobweb_new/__init__.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from inspect import isgenerator
|
|
3
|
+
from typing import Callable, Union
|
|
4
|
+
|
|
5
|
+
from .common_queue import Queue
|
|
6
|
+
from .response import Response
|
|
7
|
+
from .request import Request
|
|
8
|
+
from .item import BaseItem, ConsoleItem
|
|
9
|
+
from .seed import Seed
|
|
10
|
+
|
|
11
|
+
from .log import logger
|
|
12
|
+
# from .decorators import decorator_oss_db, stop, pause
|
|
13
|
+
import decorators
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TaskQueue:
    """Class-level in-memory queues shared by the stages of a crawl."""

    SEED = Queue()      # seeds waiting to be added as tasks
    TODO = Queue()      # task seed queue
    REQUEST = Queue()   # request queue

    DOWNLOAD = Queue()  # download task queue
    RESPONSE = Queue()  # response queue
    DONE = Queue()      # download-finished queue

    UPLOAD = Queue()    # task upload queue

    DELETE = Queue()    # task deletion queue

    def __init__(self, db):
        self.db = db

    @staticmethod
    def is_empty():
        """Return True when all eight class-level queues hold no entries."""
        queue_names = (
            "SEED", "TODO", "REQUEST", "DOWNLOAD",
            "RESPONSE", "UPLOAD", "DONE", "DELETE",
        )
        pending = sum(getattr(TaskQueue, name).length for name in queue_names)
        return pending == 0

    @staticmethod
    def process_task(it: Union[Seed, Request, Response, BaseItem], crawler_func: Callable):
        """Run ``crawler_func(it)`` and route everything it yields.

        ``crawler_func`` must return a generator. Each yielded object is pushed
        to the first matching queue: Request -> DOWNLOAD, Response -> RESPONSE,
        BaseItem -> UPLOAD, Seed -> SEED. Other objects are ignored.
        """
        # Checked in this order; the first matching type wins.
        routing = (
            (Request, "DOWNLOAD"),
            (Response, "RESPONSE"),
            (BaseItem, "UPLOAD"),
            (Seed, "SEED"),
        )
        try:
            iterators = crawler_func(it)
            if not isgenerator(iterators):
                raise TypeError(f"{crawler_func.__name__} function isn't a generator")
            for produced in iterators:
                for kind, queue_name in routing:
                    if isinstance(produced, kind):
                        getattr(TaskQueue, queue_name).push(produced)
                        break
        except Exception:
            # Any failure (including the TypeError above) bumps the retry
            # counter on the seed, except for BaseItem inputs.
            if not isinstance(it, BaseItem):
                it.seed.params.retry += 1

            # NOTE(review): the flattened diff lost indentation; this back-off
            # is assumed to belong to the except handler — confirm.
            time.sleep(5)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class Distribute:
    """
    Data distributor: dispatches data into the individual queues.
    """
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import time
|
|
2
|
+
from collections import deque
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Queue:
    """Small wrapper around ``collections.deque`` with forgiving push/pop."""

    def __init__(self):
        self._queue = deque()

    @property
    def length(self) -> int:
        """Number of entries currently stored."""
        return len(self._queue)

    def push(self, data, left: bool = False, direct_insertion: bool = False):
        """Add ``data`` to the queue; falsy ``data`` is ignored.

        Lists and tuples are unpacked element by element unless
        ``direct_insertion`` is true. ``left`` selects the left end.
        """
        try:
            if not data:
                return None
            unpack = isinstance(data, (list, tuple)) and not direct_insertion
            if unpack and left:
                self._queue.extendleft(data)
            elif unpack:
                self._queue.extend(data)
            elif left:
                self._queue.appendleft(data)
            else:
                self._queue.append(data)
        except AttributeError:
            pass

    def pop(self, left: bool = True):
        """Remove and return one entry, or None when nothing can be popped."""
        try:
            if left:
                return self._queue.popleft()
            return self._queue.pop()
        except (IndexError, AttributeError):
            return None

    def clear(self):
        """Drop every stored entry."""
        self._queue.clear()

    def get(self):
        """Generator yielding the leftmost entry once.

        When the queue is empty it sleeps one second and yields None instead.
        """
        try:
            yield self._queue.popleft()
        except IndexError:
            time.sleep(1)
            yield None
        except AttributeError:
            yield None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class RedisQueue(Queue):
    """Queue subclass that also keeps a database handle in ``self.db``."""

    def __init__(self, db):
        super().__init__()
        self.db = db

    def pop(self, left: bool = True):
        # Not implemented yet: this override has no body and yields None.
        return None
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import time
|
|
2
|
+
import threading
|
|
3
|
+
from functools import wraps
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def add_thread(num=1):
    """Decorator factory: calling the method registers threads, it does not run it.

    Each call of the decorated method appends ``num`` unstarted
    ``threading.Thread`` objects targeting the original function to
    ``self._threads``. Threads are named after the function, with an
    ``_<index>`` suffix when ``num`` is greater than one.
    """

    def decorator(func):

        @wraps(func)
        def wrapper(self, *args):
            for index in range(num):
                if num > 1:
                    thread_name = f"{func.__name__}_{index}"
                else:
                    thread_name = func.__name__
                worker = threading.Thread(
                    name=thread_name, target=func, args=(self, *args)
                )
                self._threads.append(worker)

        return wrapper

    return decorator
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def pause(func):
    """Decorator: keep calling ``func`` until ``self.pause`` is set.

    Exceptions raised by ``func`` are swallowed on purpose and the loop
    sleeps 0.1 seconds after every call.
    """

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        while True:
            if self.pause.is_set():
                break
            try:
                func(self, *args, **kwargs)
            except Exception:
                # Best-effort loop: a failing iteration must not end it.
                pass
            finally:
                time.sleep(0.1)

    return wrapper
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def stop(func):
    """Decorator: keep calling ``func`` until ``self.stop`` is set.

    Exceptions raised by ``func`` are swallowed on purpose and the loop
    sleeps 0.1 seconds after every call.
    """

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        while True:
            if self.stop.is_set():
                break
            try:
                func(self, *args, **kwargs)
            except Exception:
                # Best-effort loop: a failing iteration must not end it.
                pass
            finally:
                time.sleep(0.1)

    return wrapper
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def decorator_oss_db(exception, retries=3):
    """Decorator factory that retries a call and wraps the final failure.

    The decorated function is called up to ``retries`` times. The first
    successful call's return value is returned. If every attempt raises,
    ``exception(last_error)`` is raised, chained to the last error.

    :param exception: exception class raised once all attempts have failed.
    :param retries: maximum number of attempts. With ``retries <= 0`` the
        function is never called and None is returned.
    """

    def decorator(func):
        @wraps(func)
        def wrapper(callback_func, *args, **kwargs):
            last_error = None
            for _ in range(retries):
                try:
                    return func(callback_func, *args, **kwargs)
                except Exception as e:
                    # Keep the error; ``e`` is unbound when the handler exits.
                    last_error = e

            # The original compared the attempt index with a hard-coded 2,
            # which only matched the default retries=3.
            if last_error is not None:
                raise exception(last_error) from last_error
            return None

        return wrapper

    return decorator
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
|
cobweb_new/base/item.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
from .seed import Seed
|
|
2
|
+
from collections import namedtuple
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Item(type):
    """Metaclass that attaches a ``Data`` namedtuple to every item class.

    For any class not named ``BaseItem``, ``Data`` is built from the class's
    ``__TABLE__`` (namedtuple type name) and ``__FIELDS__`` (field names).
    """

    def __new__(cls, name, bases, dct):
        klass = super().__new__(cls, name, bases, dct)
        # The base class has empty table/field declarations, so it gets no Data.
        if name == "BaseItem":
            return klass
        table_name = getattr(klass, "__TABLE__")
        field_spec = getattr(klass, "__FIELDS__")
        klass.Data = namedtuple(table_name, field_spec)
        return klass
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaseItem(metaclass=Item):
    """Base class for items; subclasses declare ``__TABLE__`` and ``__FIELDS__``.

    The ``Item`` metaclass builds a ``Data`` namedtuple for each subclass from
    those two declarations.
    """

    __TABLE__ = ""
    __FIELDS__ = ""

    def __init__(self, seed: Seed, **kwargs):
        """Split ``kwargs`` into namedtuple fields and plain attributes.

        :param seed: stored as ``self.seed``.
        :param kwargs: keys that are fields of ``Data`` go into ``self.data``;
            every other key is set as an attribute on the instance.
        :raises TypeError: from the namedtuple constructor when a declared
            field is missing from ``kwargs``.
        """
        self.seed = seed

        # Compare against the parsed field names. ``key in self.__FIELDS__``
        # is a substring test when __FIELDS__ is a string, e.g. "a" in "data".
        field_names = self.Data._fields

        data = {}
        for key, value in kwargs.items():
            if key in field_names:
                data[key] = value
            else:
                setattr(self, key, value)

        self.data = self.Data(**data)

    @property
    def to_dict(self):
        """Field values of ``self.data`` as a dict."""
        return self.data._asdict()

    @property
    def table(self):
        """Name of the ``Data`` namedtuple, i.e. the declared ``__TABLE__``."""
        return self.Data.__name__
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class ConsoleItem(BaseItem):
    """Item with table name ``console`` and a single field named ``data``."""

    # Used by the Item metaclass as the name of the generated namedtuple.
    __TABLE__ = "console"
    # Field specification handed to namedtuple by the Item metaclass.
    __FIELDS__ = "data"
|
|
46
|
+
|
cobweb_new/base/log.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ColorCodes:
    """ANSI escape sequences for terminal text colors and styles."""

    # Reset all attributes
    RESET = "\x1b[0m"

    # Regular foreground colors
    RED = "\x1b[31m"
    GREEN = "\x1b[32m"
    YELLOW = "\x1b[33m"
    BLUE = "\x1b[34m"
    PURPLE = "\x1b[35m"
    CYAN = "\x1b[36m"
    WHITE = "\x1b[37m"

    # Bright foreground colors
    BRIGHT_RED = "\x1b[91m"
    BRIGHT_GREEN = "\x1b[92m"
    BRIGHT_YELLOW = "\x1b[93m"
    BRIGHT_BLUE = "\x1b[94m"
    BRIGHT_PURPLE = "\x1b[95m"
    BRIGHT_CYAN = "\x1b[96m"
    BRIGHT_WHITE = "\x1b[97m"

    # Background colors
    BG_RED = "\x1b[41m"
    BG_GREEN = "\x1b[42m"
    BG_YELLOW = "\x1b[43m"
    BG_BLUE = "\x1b[44m"
    BG_PURPLE = "\x1b[45m"
    BG_CYAN = "\x1b[46m"
    BG_WHITE = "\x1b[47m"

    # Bright background colors
    BG_BRIGHT_RED = "\x1b[101m"
    BG_BRIGHT_GREEN = "\x1b[102m"
    BG_BRIGHT_YELLOW = "\x1b[103m"
    BG_BRIGHT_BLUE = "\x1b[104m"
    BG_BRIGHT_PURPLE = "\x1b[105m"
    BG_BRIGHT_CYAN = "\x1b[106m"
    BG_BRIGHT_WHITE = "\x1b[107m"

    # Text styles
    BOLD = "\x1b[1m"
    DIM = "\x1b[2m"
    ITALIC = "\x1b[3m"
    UNDERLINE = "\x1b[4m"
    BLINK = "\x1b[5m"
    REVERSE = "\x1b[7m"
    HIDDEN = "\x1b[8m"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Log:
    """Facade exposing the logging methods of one shared ``logging.Logger``.

    The class body runs once at class creation: it raises the ``oss2.api``
    logger to WARNING and calls ``logging.basicConfig``. The active logger is
    stored on the class, so ``set_log_name`` affects every instance.
    """

    logging.getLogger('oss2.api').setLevel(logging.WARNING)
    logging.basicConfig(
        format=(
            '%(asctime)s %(name)s [%(filename)s:%(lineno)d %(funcName)s]'
            ' %(levelname)s -> %(message)s'
        ),
        level=logging.INFO,
    )
    log = logging.getLogger()

    def set_log_name(self, name):
        """Switch the shared logger to ``logging.getLogger(name)``."""
        type(self).log = logging.getLogger(name)

    @property
    def debug(self):
        return type(self).log.debug

    @property
    def info(self):
        return type(self).log.info

    @property
    def warning(self):
        return type(self).log.warning

    @property
    def exception(self):
        return type(self).log.exception

    @property
    def error(self):
        return type(self).log.error

    @property
    def critical(self):
        return type(self).log.critical


logger = Log()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import requests
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class Request:
    """Description of one HTTP request, executed with ``requests.request``.

    Keyword arguments named in ``__REQUEST_ATTRS__`` are collected in
    ``self.request_setting`` and passed to ``requests.request``. Every other
    keyword argument becomes an attribute on the instance.
    """

    # Keyword names forwarded to requests.request().
    __REQUEST_ATTRS__ = {
        "params",
        "headers",
        "cookies",
        "data",
        "json",
        "files",
        "auth",
        "timeout",
        "proxies",
        "hooks",
        "stream",
        "verify",
        "cert",
        "allow_redirects",
    }

    def __init__(
            self,
            url,
            seed,
            random_ua=True,
            check_status_code=True,
            **kwargs
    ):
        """
        :param url: target URL.
        :param seed: stored as ``self.seed``.
        :param random_ua: when true, add a random user-agent header unless
            the supplied headers already contain one.
        :param check_status_code: when true, ``download`` calls
            ``raise_for_status()`` on the response.
        :param kwargs: request options and/or extra attributes (see class doc).
        """
        self.url = url
        self.seed = seed
        self.check_status_code = check_status_code
        self.request_setting = {}

        for k, v in kwargs.items():
            if k in self.__class__.__REQUEST_ATTRS__:
                self.request_setting[k] = v
                continue
            self.__setattr__(k, v)

        # No explicit method: POST when a body was supplied, GET otherwise.
        if not getattr(self, "method", None):
            self.method = "POST" if self.request_setting.get("data") or self.request_setting.get("json") else "GET"

        if random_ua:
            self._build_header()

    @property
    def _random_ua(self) -> str:
        """A freshly randomized macOS/Edge style user-agent string."""
        v1 = random.randint(4, 15)
        v2 = random.randint(3, 11)
        v3 = random.randint(1, 16)
        v4 = random.randint(533, 605)
        v5 = random.randint(1000, 6000)
        v6 = random.randint(10, 80)
        user_agent = (f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_{v1}_{v2}) AppleWebKit/{v4}.{v3} "
                      f"(KHTML, like Gecko) Chrome/105.0.0.0 Safari/{v4}.{v3} Edg/105.0.{v5}.{v6}")
        return user_agent

    def _build_header(self) -> None:
        """Make sure ``request_setting["headers"]`` carries a user-agent."""
        headers = self.request_setting.get("headers")
        if not headers:
            self.request_setting["headers"] = {"accept": "*/*", "user-agent": self._random_ua}
        elif "user-agent" not in (key.lower() for key in headers):
            # Work on a copy. Writing into the caller's dict would pin the
            # first random user-agent on every later Request that reuses it.
            headers = dict(headers)
            headers["user-agent"] = self._random_ua
            self.request_setting["headers"] = headers

    def download(self) -> requests.Response:
        """Send the request and return the response.

        :raises requests.HTTPError: via ``raise_for_status()`` when
            ``check_status_code`` is true and the status is an error.
        """
        response = requests.request(self.method, self.url, **self.request_setting)
        if self.check_status_code:
            response.raise_for_status()
        return response

    @property
    def to_dict(self):
        """Instance attributes without url, seed, check_status_code and request_setting."""
        _dict = self.__dict__.copy()
        _dict.pop('url')
        _dict.pop('seed')
        _dict.pop('check_status_code')
        _dict.pop('request_setting')
        return _dict
|
|
81
|
+
|
|
82
|
+
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
|
|
2
|
+
|
|
3
|
+
class Response:
    """Pairs a seed with the response obtained for it, plus free-form extras."""

    def __init__(
            self,
            seed,
            response,
            **kwargs
    ):
        self.seed = seed
        self.response = response

        # Every extra keyword becomes an attribute on the instance.
        for attr_name, attr_value in kwargs.items():
            setattr(self, attr_name, attr_value)

    @property
    def to_dict(self):
        """Copy of the instance attributes without ``seed`` and ``response``."""
        extras = dict(vars(self))
        del extras['seed']
        del extras['response']
        return extras
|
|
23
|
+
|