cobweb-launcher 1.1.22__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release. This version of cobweb-launcher might be problematic.

Files changed (42)
  1. {cobweb-launcher-1.1.22/cobweb_launcher.egg-info → cobweb-launcher-1.2.0}/PKG-INFO +1 -1
  2. cobweb-launcher-1.2.0/cobweb/__init__.py +2 -0
  3. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/base/__init__.py +1 -1
  4. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/base/item.py +7 -0
  5. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/constant.py +23 -1
  6. cobweb-launcher-1.2.0/cobweb/crawlers/__init__.py +1 -0
  7. cobweb-launcher-1.1.22/cobweb/crawlers/base_crawler.py → cobweb-launcher-1.2.0/cobweb/crawlers/crawler.py +48 -30
  8. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/launchers/__init__.py +1 -1
  9. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/launchers/launcher.py +38 -52
  10. cobweb-launcher-1.2.0/cobweb/launchers/launcher_air.py +88 -0
  11. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/launchers/launcher_pro.py +75 -34
  12. cobweb-launcher-1.2.0/cobweb/pipelines/__init__.py +3 -0
  13. cobweb-launcher-1.1.22/cobweb/pipelines/base_pipeline.py → cobweb-launcher-1.2.0/cobweb/pipelines/pipeline.py +20 -14
  14. cobweb-launcher-1.2.0/cobweb/pipelines/pipeline_console.py +24 -0
  15. cobweb-launcher-1.1.22/cobweb/pipelines/loghub_pipeline.py → cobweb-launcher-1.2.0/cobweb/pipelines/pipeline_loghub.py +1 -1
  16. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/setting.py +6 -6
  17. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/utils/tools.py +2 -2
  18. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0/cobweb_launcher.egg-info}/PKG-INFO +1 -1
  19. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/SOURCES.txt +5 -4
  20. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/setup.py +1 -1
  21. cobweb-launcher-1.1.22/cobweb/__init__.py +0 -2
  22. cobweb-launcher-1.1.22/cobweb/crawlers/__init__.py +0 -2
  23. cobweb-launcher-1.1.22/cobweb/crawlers/file_crawler.py +0 -98
  24. cobweb-launcher-1.1.22/cobweb/pipelines/__init__.py +0 -2
  25. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/LICENSE +0 -0
  26. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/README.md +0 -0
  27. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/base/common_queue.py +0 -0
  28. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/base/decorators.py +0 -0
  29. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/base/log.py +0 -0
  30. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/base/request.py +0 -0
  31. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/base/response.py +0 -0
  32. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/base/seed.py +0 -0
  33. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/db/__init__.py +0 -0
  34. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/db/redis_db.py +0 -0
  35. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/exceptions/__init__.py +0 -0
  36. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/exceptions/oss_db_exception.py +0 -0
  37. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/utils/__init__.py +0 -0
  38. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb/utils/oss.py +0 -0
  39. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  40. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/requires.txt +0 -0
  41. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/cobweb_launcher.egg-info/top_level.txt +0 -0
  42. {cobweb-launcher-1.1.22 → cobweb-launcher-1.2.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.1.22
+Version: 1.2.0
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
@@ -0,0 +1,2 @@
+from .launchers import LauncherAir, LauncherPro
+from .constant import CrawlerModel
@@ -1,7 +1,7 @@
 from .common_queue import Queue
 from .response import Response
 from .request import Request
-from .item import BaseItem
+from .item import BaseItem, ConsoleItem
 from .seed import Seed
 
 from .log import logger
@@ -37,3 +37,10 @@ class BaseItem(metaclass=Item):
     @property
     def table(self):
         return self.Data.__name__
+
+
+class ConsoleItem(BaseItem):
+
+    __TABLE__ = "console"
+    __FIELDS__ = "data"
+
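
Note: ConsoleItem is the new default sink item for this release. A minimal construction sketch, mirroring the new Crawler.parse shown later in this diff (the seed URL and payload are illustrative, not from the package):

    import json
    from cobweb.base import Seed, ConsoleItem

    seed = Seed("https://example.com")             # illustrative seed
    payload = {"url": seed.url, "status": 200}     # illustrative parsed data
    item = ConsoleItem(seed, data=json.dumps(payload, ensure_ascii=False))
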
@@ -30,6 +30,24 @@ class DealModel:
 
 class LogTemplate:
 
+    console_item = """
+----------------------- start - console pipeline -----------------
+seed details \n{seed_detail}
+parse details \n{parse_detail}
+----------------------- end - console pipeline ------------------
+    """
+
+    launcher_air_polling = """
+----------------------- start - polling log: {task} -----------------
+memory queues
+    seed count: {doing_len}
+    pending: {todo_len}
+    consumed: {done_len}
+storage queue
+    pending upload: {upload_len}
+----------------------- end - polling log: {task} ------------------
+    """
+
     launcher_pro_polling = """
 ----------------------- start - polling log: {task} -----------------
 memory queues
@@ -69,4 +87,8 @@ class LogTemplate:
     response
         status : {status} \n{response}
 ------------------------------------------------------------------
-    """
+    """
+
+    @staticmethod
+    def log_info(item: dict) -> str:
+        return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
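
The new LogTemplate.log_info staticmethod replaces utils.download_log_info (commented out later in this diff). A quick sketch of its output:

    from cobweb.constant import LogTemplate

    print(LogTemplate.log_info({"url": "https://example.com", "retry": 1}))
    # one pair per line, 12-space indent, keys left-justified to 14 columns:
    #             url           : https://example.com
    #             retry         : 1
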
@@ -0,0 +1 @@
+from .crawler import Crawler
@@ -1,39 +1,52 @@
+import json
 import threading
 import time
-
+import traceback
 from inspect import isgenerator
 from typing import Union, Callable, Mapping
 
-from cobweb.base import Queue, Seed, BaseItem, Request, Response, logger
 from cobweb.constant import DealModel, LogTemplate
-from cobweb.utils import download_log_info
-from cobweb import setting
+from cobweb.base import (
+    Queue,
+    Seed,
+    BaseItem,
+    Request,
+    Response,
+    ConsoleItem,
+    logger
+)
 
 
 class Crawler(threading.Thread):
 
     def __init__(
         self,
-        upload_queue: Queue,
-        custom_func: Union[Mapping[str, Callable]],
+        stop: threading.Event,
+        pause: threading.Event,
         launcher_queue: Union[Mapping[str, Queue]],
+        custom_func: Union[Mapping[str, Callable]],
+        thread_num: int,
+        max_retries: int
     ):
         super().__init__()
 
-        self.upload_queue = upload_queue
+        self._stop = stop
+        self._pause = pause
+        self._new = launcher_queue["new"]
+        self._todo = launcher_queue["todo"]
+        self._done = launcher_queue["done"]
+        self._upload = launcher_queue["upload"]
+
        for func_name, _callable in custom_func.items():
             if isinstance(_callable, Callable):
                 self.__setattr__(func_name, _callable)
 
-        self.launcher_queue = launcher_queue
-
-        self.spider_thread_num = setting.SPIDER_THREAD_NUM
-        self.max_retries = setting.SPIDER_MAX_RETRIES
+        self.thread_num = thread_num
+        self.max_retries = max_retries
 
     @staticmethod
     def request(seed: Seed) -> Union[Request, BaseItem]:
-        stream = True if setting.DOWNLOAD_MODEL else False
-        yield Request(seed.url, seed, stream=stream, timeout=5)
+        yield Request(seed.url, seed, timeout=5)
 
     @staticmethod
     def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
@@ -42,39 +55,43 @@ class Crawler(threading.Thread):
 
     @staticmethod
     def parse(item: Response) -> BaseItem:
-        pass
+        upload_item = item.to_dict
+        upload_item["text"] = item.response.text
+        yield ConsoleItem(item.seed, data=json.dumps(upload_item, ensure_ascii=False))
 
-    def get_seed(self) -> Seed:
-        return self.launcher_queue['todo'].pop()
+    # def get_seed(self) -> Seed:
+    #     return self._todo.pop()
 
     def distribute(self, item, seed):
         if isinstance(item, BaseItem):
-            self.upload_queue.push(item)
+            self._upload.push(item)
         elif isinstance(item, Seed):
-            self.launcher_queue['new'].push(item)
+            self._new.push(item)
         elif isinstance(item, str) and item == DealModel.poll:
-            self.launcher_queue['todo'].push(seed)
+            self._todo.push(seed)
         elif isinstance(item, str) and item == DealModel.done:
-            self.launcher_queue['done'].push(seed)
+            self._done.push(seed)
         elif isinstance(item, str) and item == DealModel.fail:
             seed.params.seed_status = DealModel.fail
-            self.launcher_queue['done'].push(seed)
+            self._done.push(seed)
         else:
             raise TypeError("yield value type error!")
 
     def spider(self):
-        while True:
-            seed = self.get_seed()
+        while not self._stop.is_set():
+
+            seed = self._todo.pop()
 
             if not seed:
+                time.sleep(1)
                 continue
 
             elif seed.params.retry >= self.max_retries:
                 seed.params.seed_status = DealModel.fail
-                self.launcher_queue['done'].push(seed)
+                self._done.push(seed)
                 continue
 
-            seed_detail_log_info = download_log_info(seed.to_dict)
+            seed_detail_log_info = LogTemplate.log_info(seed.to_dict)
 
             try:
                 request_iterators = self.request(seed)
@@ -105,7 +122,7 @@ class Crawler(threading.Thread):
                     seed_version=seed.params.seed_version,
                     identifier=seed.identifier or "",
                     status=download_item.response,
-                    response=download_log_info(download_item.to_dict)
+                    response=LogTemplate.log_info(download_item.to_dict)
                 ))
                 parse_iterators = self.parse(download_item)
                 if not isgenerator(parse_iterators):
@@ -122,21 +139,22 @@ class Crawler(threading.Thread):
 
                 if not iterator_status:
                     raise ValueError("request/download/parse function yield value error!")
-
             except Exception as e:
                 logger.info(LogTemplate.download_exception.format(
                     detail=seed_detail_log_info,
                     retry=seed.params.retry,
                     priority=seed.params.priority,
                     seed_version=seed.params.seed_version,
-                    identifier=seed.identifier or "", exception=e
+                    identifier=seed.identifier or "",
+                    exception=''.join(traceback.format_exception(type(e), e, e.__traceback__))
                 ))
                 seed.params.retry += 1
-                self.launcher_queue['todo'].push(seed)
+                self._todo.push(seed)
             finally:
                 time.sleep(0.1)
+        logger.info("spider thread close")
 
     def run(self):
-        for index in range(self.spider_thread_num):
+        for index in range(self.thread_num):
             threading.Thread(name=f"spider_{index}", target=self.spider).start()
 
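
The request/download/parse trio must be generators; distribute() routes every yielded value (BaseItem to the upload queue, Seed to the new queue, DealModel.poll/done/fail back to todo/done). A hedged sketch of a custom parse step under that contract (the link extraction is a placeholder):

    from cobweb.base import Seed, Response
    from cobweb.constant import DealModel

    def parse(item: Response):
        for href in []:            # placeholder: links pulled from item.response
            yield Seed(href)       # routed to the "new" queue by distribute()
        yield DealModel.done       # marks the current seed consumed
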
@@ -1,2 +1,2 @@
-from .launcher import Launcher
+from .launcher_air import LauncherAir
 from .launcher_pro import LauncherPro
@@ -15,15 +15,16 @@ class Launcher(threading.Thread):
     __DOING__ = {}
 
     __CUSTOM_FUNC__ = {
-        "download": None,
-        "download_midware": None,
-        "parse": None,
+        # "download": None,
+        # "request": None,
+        # "parse": None,
     }
 
     __LAUNCHER_QUEUE__ = {
         "new": Queue(),
         "todo": Queue(),
         "done": Queue(),
+        "upload": Queue()
     }
 
     __LAUNCHER_FUNC__ = [
@@ -76,9 +77,13 @@ class Launcher(threading.Thread):
         self._done_queue_max_size = setting.DONE_QUEUE_MAX_SIZE
         self._upload_queue_max_size = setting.UPLOAD_QUEUE_MAX_SIZE
 
+        self._spider_thread_num = setting.SPIDER_THREAD_NUM
+        self._spider_max_retries = setting.SPIDER_MAX_RETRIES
+
         self._done_model = setting.DONE_MODEL
+        self._task_model = setting.TASK_MODEL
 
-        self._upload_queue = Queue()
+        # self._upload_queue = Queue()
 
     @property
     def start_seeds(self):
@@ -121,7 +126,7 @@ class Launcher(threading.Thread):
         use case:
             from cobweb.base import Request, Response
             @launcher.download
-            def download(item: Response) -> BaseItem:
+            def parse(item: Response) -> BaseItem:
                 ...
                 yield xxxItem(seed, **kwargs)
         """
@@ -133,6 +138,33 @@ class Launcher(threading.Thread):
         for seed in seeds:
             self.__DOING__.pop(seed, None)
 
+    def _execute(self):
+        for func_name in self.__LAUNCHER_FUNC__:
+            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
+            time.sleep(2)
+
+    def run(self):
+        threading.Thread(target=self._execute_heartbeat).start()
+
+        self._Crawler(
+            stop=self._stop, pause=self._pause,
+            launcher_queue=self.__LAUNCHER_QUEUE__,
+            custom_func=self.__CUSTOM_FUNC__,
+            thread_num=self._spider_thread_num,
+            max_retries=self._spider_max_retries
+        ).start()
+
+        self._Pipeline(
+            stop=self._stop, pause=self._pause,
+            upload=self.__LAUNCHER_QUEUE__["upload"],
+            done=self.__LAUNCHER_QUEUE__["done"],
+            upload_size=self._upload_queue_max_size,
+            wait_seconds=self._upload_queue_wait_seconds
+        ).start()
+
+        self._execute()
+        self._polling()
+
     def _execute_heartbeat(self):
         pass
 
@@ -151,52 +183,6 @@ class Launcher(threading.Thread):
     def _delete(self):
         pass
 
-    def _execute(self):
-        for func_name in self.__LAUNCHER_FUNC__:
-            threading.Thread(name=func_name, target=getattr(self, func_name)).start()
-            time.sleep(2)
-
     def _polling(self):
+        pass
 
-        check_emtpy_times = 0
-
-        while not self._stop.is_set():
-
-            queue_not_empty_count = 0
-
-            for q in self.__LAUNCHER_QUEUE__.values():
-                if q.length != 0:
-                    queue_not_empty_count += 1
-
-            if self._pause.is_set() and queue_not_empty_count != 0:
-                self._pause.clear()
-                self._execute()
-
-            elif queue_not_empty_count == 0:
-                check_emtpy_times += 1
-            else:
-                check_emtpy_times = 0
-
-            if check_emtpy_times > 2:
-                check_emtpy_times = 0
-                self.__DOING__ = {}
-                self._pause.set()
-
-    def run(self):
-        threading.Thread(target=self._execute_heartbeat).start()
-
-        self._Crawler(
-            upload_queue=self._upload_queue,
-            custom_func=self.__CUSTOM_FUNC__,
-            launcher_queue=self.__LAUNCHER_QUEUE__,
-        ).start()
-
-        self._Pipeline(
-            upload_queue=self._upload_queue,
-            done_queue=self.__LAUNCHER_QUEUE__["done"],
-            upload_queue_size=self._upload_queue_max_size,
-            upload_wait_seconds=self._upload_queue_wait_seconds
-        ).start()
-
-        self._execute()
-        self._polling()
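
A sketch of the stop/pause protocol this refactor threads through Crawler and Pipeline (an inference from the diff, not package documentation): pause parks the worker loops when the queues drain, stop ends them for good.

    import threading
    import time

    stop, pause = threading.Event(), threading.Event()

    def worker():
        while not stop.is_set():
            if pause.is_set():
                time.sleep(1)      # parked: queues drained, waiting for new work
                continue
            ...                    # consume launcher queues here

    t = threading.Thread(target=worker)
    t.start()
    pause.set()                    # park the worker
    stop.set(); t.join()           # shut down
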
@@ -0,0 +1,88 @@
+import time
+
+from cobweb.constant import LogTemplate
+from cobweb.base import logger
+from .launcher import Launcher
+
+
+class LauncherAir(Launcher):
+
+    def _scheduler(self):
+        if self.start_seeds:
+            self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
+
+    def _insert(self):
+        while not self._pause.is_set():
+            seeds = {}
+            status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+            for _ in range(self._new_queue_max_size):
+                seed = self.__LAUNCHER_QUEUE__['new'].pop()
+                if not seed:
+                    break
+                seeds[seed.to_string] = seed.params.priority
+            if seeds:
+                self.__LAUNCHER_QUEUE__['todo'].push(seeds)
+            if status:
+                time.sleep(self._new_queue_wait_seconds)
+
+    def _delete(self):
+        while not self._pause.is_set():
+            seeds = []
+            status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+
+            for _ in range(self._done_queue_max_size):
+                seed = self.__LAUNCHER_QUEUE__['done'].pop()
+                if not seed:
+                    break
+                seeds.append(seed.to_string)
+
+            if seeds:
+                self._remove_doing_seeds(seeds)
+
+            if status:
+                time.sleep(self._done_queue_wait_seconds)
+
+    def _polling(self):
+
+        check_emtpy_times = 0
+
+        while not self._stop.is_set():
+
+            queue_not_empty_count = 0
+            pooling_wait_seconds = 30
+
+            for q in self.__LAUNCHER_QUEUE__.values():
+                if q.length != 0:
+                    queue_not_empty_count += 1
+
+            if queue_not_empty_count == 0:
+                pooling_wait_seconds = 3
+                if self._pause.is_set():
+                    check_emtpy_times = 0
+                    if not self._task_model:
+                        logger.info("Done! Ready to close thread...")
+                        self._stop.set()
+                elif check_emtpy_times > 2:
+                    self.__DOING__ = {}
+                    self._pause.set()
+                else:
+                    logger.info(
+                        "check whether the task is complete, "
+                        f"reset times {3 - check_emtpy_times}"
+                    )
+                    check_emtpy_times += 1
+            elif self._pause.is_set():
+                self._pause.clear()
+                self._execute()
+            else:
+                logger.info(LogTemplate.launcher_air_polling.format(
+                    task=self.task,
+                    doing_len=len(self.__DOING__.keys()),
+                    todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+                    done_len=self.__LAUNCHER_QUEUE__['done'].length,
+                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
+                ))
+
+            time.sleep(pooling_wait_seconds)
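
A hedged end-to-end sketch of LauncherAir, following the @launcher.download use case shown in launcher.py above; the task/project names are illustrative and not from this diff.

    from cobweb import LauncherAir
    from cobweb.base import ConsoleItem, Response

    app = LauncherAir(task="demo", project="example")   # illustrative names

    @app.download
    def parse(item: Response):
        yield ConsoleItem(item.seed, data="example")    # ConsoleItem is the built-in sink

    app.start()   # Launcher subclasses threading.Thread; run() wires Crawler and Pipeline
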
@@ -3,19 +3,19 @@ import threading
 
 from cobweb.db import RedisDB
 from cobweb.base import Seed, logger
-from cobweb.launchers import Launcher
 from cobweb.constant import DealModel, LogTemplate
+from .launcher import Launcher
 
 
 class LauncherPro(Launcher):
 
     def __init__(self, task, project, custom_setting=None, **kwargs):
         super().__init__(task, project, custom_setting, **kwargs)
-        self._todo = "{%s:%s}:todo" % (project, task)
-        self._done = "{%s:%s}:done" % (project, task)
-        self._fail = "{%s:%s}:fail" % (project, task)
-        self._heartbeat = "heartbeat:%s_%s" % (project, task)
-        self._reset_lock = "lock:reset:%s_%s" % (project, task)
+        self._todo_key = "{%s:%s}:todo" % (project, task)
+        self._done_key = "{%s:%s}:done" % (project, task)
+        self._fail_key = "{%s:%s}:fail" % (project, task)
+        self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+        self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
         self._db = RedisDB()
 
         self._heartbeat_start_event = threading.Event()
@@ -23,12 +23,12 @@ class LauncherPro(Launcher):
 
     @property
     def heartbeat(self):
-        return self._db.exists(self._heartbeat)
+        return self._db.exists(self._heartbeat_key)
 
     def _execute_heartbeat(self):
         while not self._stop.is_set():
             if self._heartbeat_start_event.is_set():
-                self._db.setex(self._heartbeat, 3)
+                self._db.setex(self._heartbeat_key, 3)
             time.sleep(2)
 
     def _reset(self):
@@ -39,15 +39,15 @@ class LauncherPro(Launcher):
         while not self._pause.is_set():
             reset_wait_seconds = 30
             start_reset_time = int(time.time())
-            if self._db.lock(self._reset_lock, t=120):
+            if self._db.lock(self._reset_lock_key, t=120):
                 if not self.heartbeat:
                     self._heartbeat_start_event.set()
 
                 _min = -int(time.time()) + self._seed_reset_seconds \
                     if self.heartbeat or not init else "-inf"
 
-                self._db.members(self._todo, 0, _min=_min, _max="(0")
-                self._db.delete(self._reset_lock)
+                self._db.members(self._todo_key, 0, _min=_min, _max="(0")
+                self._db.delete(self._reset_lock_key)
 
                 ttl = 120 - int(time.time()) + start_reset_time
                 reset_wait_seconds = max(ttl, 1)
@@ -61,14 +61,14 @@ class LauncherPro(Launcher):
         if self.start_seeds:
             self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
         while not self._pause.is_set():
-            if not self._db.zcount(self._todo, 0, "(1000"):
+            if not self._db.zcount(self._todo_key, 0, "(1000"):
                 time.sleep(self._scheduler_wait_seconds)
                 continue
             if self.__LAUNCHER_QUEUE__['todo'].length >= self._todo_queue_size:
                 time.sleep(self._todo_queue_full_wait_seconds)
                 continue
             members = self._db.members(
-                self._todo, int(time.time()),
+                self._todo_key, int(time.time()),
                 count=self._todo_queue_size,
                 _min=0, _max="(1000"
             )
@@ -90,7 +90,7 @@ class LauncherPro(Launcher):
                     break
                 seeds[seed.to_string] = seed.params.priority
             if seeds:
-                self._db.zadd(self._todo, seeds, nx=True)
+                self._db.zadd(self._todo_key, seeds, nx=True)
             if status:
                 time.sleep(self._new_queue_wait_seconds)
 
@@ -102,7 +102,7 @@ class LauncherPro(Launcher):
             if self.__DOING__:
                 refresh_time = int(time.time())
                 seeds = {k: -refresh_time - v / 1000 for k, v in self.__DOING__.items()}
-                self._db.zadd(self._todo, item=seeds, xx=True)
+                self._db.zadd(self._todo_key, item=seeds, xx=True)
             time.sleep(30)
 
     def _delete(self):
@@ -124,13 +124,13 @@ class LauncherPro(Launcher):
                 else:
                     seeds.append(seed.to_string)
             if seeds:
-                self._db.zrem(self._todo, *seeds)
+                self._db.zrem(self._todo_key, *seeds)
                 self._remove_doing_seeds(seeds)
             if s_seeds:
-                self._db.done([self._todo, self._done], *s_seeds)
+                self._db.done([self._todo_key, self._done_key], *s_seeds)
                 self._remove_doing_seeds(s_seeds)
             if f_seeds:
-                self._db.done([self._todo, self._fail], *f_seeds)
+                self._db.done([self._todo_key, self._fail_key], *f_seeds)
                 self._remove_doing_seeds(f_seeds)
 
             if status:
@@ -141,32 +141,73 @@ class LauncherPro(Launcher):
         while not self._stop.is_set():
             queue_not_empty_count = 0
             pooling_wait_seconds = 30
+
             for q in self.__LAUNCHER_QUEUE__.values():
                 if q.length != 0:
                     queue_not_empty_count += 1
-            if self._pause.is_set():
+
+            if queue_not_empty_count == 0:
+                pooling_wait_seconds = 3
+                if self._pause.is_set():
+                    check_emtpy_times = 0
+                    if not self._task_model:
+                        logger.info("Done! Ready to close thread...")
+                        self._stop.set()
+                elif not self._db.zcount(self._todo_key, _min=0, _max="(1000") and check_emtpy_times > 2:
+                    self.__DOING__ = {}
+                    self._pause.set()
+                else:
+                    logger.info(
+                        "check whether the task is complete, "
+                        f"reset times {3 - check_emtpy_times}"
+                    )
+                    check_emtpy_times += 1
+            elif self._pause.is_set():
                 self._pause.clear()
                 self._execute()
-            elif queue_not_empty_count == 0:
-                pooling_wait_seconds = 5
-                check_emtpy_times += 1
             else:
-                check_emtpy_times = 0
-
-            if not self._db.zcount(self._todo, _min=0, _max="(1000") and check_emtpy_times > 2:
-                check_emtpy_times = 0
-                self.__DOING__ = {}
-                self._pause.set()
-
-            if not self._pause.is_set():
                 logger.info(LogTemplate.launcher_pro_polling.format(
                     task=self.task,
                     doing_len=len(self.__DOING__.keys()),
                     todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
                     done_len=self.__LAUNCHER_QUEUE__['done'].length,
-                    redis_seed_count=self._db.zcount(self._todo, "-inf", "+inf"),
-                    redis_todo_len=self._db.zcount(self._todo, 0, "(1000"),
-                    redis_doing_len=self._db.zcount(self._todo, "-inf", "(0"),
-                    upload_len=self._upload_queue.length
+                    redis_seed_count=self._db.zcount(self._todo_key, "-inf", "+inf"),
+                    redis_todo_len=self._db.zcount(self._todo_key, 0, "(1000"),
+                    redis_doing_len=self._db.zcount(self._todo_key, "-inf", "(0"),
+                    upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
                 ))
+
             time.sleep(pooling_wait_seconds)
+        # if self._pause.is_set():
+        #     self._pause.clear()
+        #     self._execute()
+        #
+        # elif queue_not_empty_count == 0:
+        #     pooling_wait_seconds = 5
+        #     check_emtpy_times += 1
+        # else:
+        #     check_emtpy_times = 0
+        #
+        # if not self._db.zcount(self._todo, _min=0, _max="(1000") and check_emtpy_times > 2:
+        #     check_emtpy_times = 0
+        #     self.__DOING__ = {}
+        #     self._pause.set()
+        #
+        # time.sleep(pooling_wait_seconds)
+        #
+        # if not self._pause.is_set():
+        #     logger.info(LogTemplate.launcher_pro_polling.format(
+        #         task=self.task,
+        #         doing_len=len(self.__DOING__.keys()),
+        #         todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+        #         done_len=self.__LAUNCHER_QUEUE__['done'].length,
+        #         redis_seed_count=self._db.zcount(self._todo, "-inf", "+inf"),
+        #         redis_todo_len=self._db.zcount(self._todo, 0, "(1000"),
+        #         redis_doing_len=self._db.zcount(self._todo, "-inf", "(0"),
+        #         upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
+        #     ))
+        # elif not self._task_model:
+        #     self._stop.set()
+
+        logger.info("Done! Ready to close thread...")
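
The *_key renames make the Redis layout easier to read: `{project:task}` is a Redis Cluster hash tag, so all of a task's keys land in one slot, and a single sorted set encodes seed state in its score. A sketch of that convention as inferred from the zadd/zcount calls above (redis-py client; the seed member and values are illustrative, not package documentation):

    import time
    import redis

    r = redis.Redis()
    todo_key = "{example:demo}:todo"                  # hash tag pins one cluster slot

    r.zadd(todo_key, {"seed-a": 1}, nx=True)          # pending: score in [0, 1000) is the priority
    r.zadd(todo_key, {"seed-a": -time.time() - 1 / 1000}, xx=True)  # in flight: negative timestamp
    pending = r.zcount(todo_key, 0, "(1000")          # the same range _scheduler polls
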
@@ -0,0 +1,3 @@
+from .pipeline import Pipeline
+from .pipeline_console import Console
+from .pipeline_loghub import Loghub
@@ -9,16 +9,20 @@ class Pipeline(threading.Thread, ABC):
 
     def __init__(
         self,
-        done_queue: Queue,
-        upload_queue: Queue,
-        upload_queue_size: int,
-        upload_wait_seconds: int
+        stop: threading.Event,
+        pause: threading.Event,
+        upload: Queue, done: Queue,
+        upload_size: int,
+        wait_seconds: int
     ):
         super().__init__()
-        self.done_queue = done_queue
-        self.upload_queue = upload_queue
-        self.upload_queue_size = upload_queue_size
-        self.upload_wait_seconds = upload_wait_seconds
+        self._stop = stop
+        self._pause = pause
+        self._upload = upload
+        self._done = done
+
+        self.upload_size = upload_size
+        self.wait_seconds = wait_seconds
 
     @abstractmethod
     def build(self, item: BaseItem) -> dict:
@@ -29,13 +33,13 @@ class Pipeline(threading.Thread, ABC):
         pass
 
     def run(self):
-        while True:
-            status = self.upload_queue.length < self.upload_queue_size
+        while not self._stop.is_set():
+            status = self._upload.length < self.upload_size
             if status:
-                time.sleep(self.upload_wait_seconds)
+                time.sleep(self.wait_seconds)
             data_info, seeds = {}, []
-            for _ in range(self.upload_queue_size):
-                item = self.upload_queue.pop()
+            for _ in range(self.upload_size):
+                item = self._upload.pop()
                 if not item:
                     break
                 data = self.build(item)
@@ -49,6 +53,8 @@ class Pipeline(threading.Thread, ABC):
                 logger.info(e)
                 status = False
             if status:
-                self.done_queue.push(seeds)
+                self._done.push(seeds)
+
+        logger.info("upload pipeline close!")
 
 
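
With the refactor, a custom sink only implements build() and upload() against the new constructor signature. A minimal sketch, assuming the 1.2.0 API exactly as shown in this diff (the JSON-lines target is illustrative):

    import json
    from cobweb.base import BaseItem
    from cobweb.pipelines import Pipeline

    class JsonlPipeline(Pipeline):

        def build(self, item: BaseItem) -> dict:
            return item.to_dict                       # flatten the item for upload()

        def upload(self, table, datas):
            with open(f"{table}.jsonl", "a", encoding="utf-8") as f:
                for data in datas:
                    f.write(json.dumps(data, ensure_ascii=False) + "\n")
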
@@ -0,0 +1,24 @@
+import json
+
+from cobweb.base import ConsoleItem, logger
+from cobweb.constant import LogTemplate
+from cobweb.pipelines import Pipeline
+
+
+class Console(Pipeline):
+
+    def build(self, item: ConsoleItem):
+        return {
+            "seed": item.seed.to_dict,
+            "data": item.to_dict
+        }
+
+    def upload(self, table, datas):
+        for data in datas:
+            parse_detail = LogTemplate.log_info(data["data"])
+            if len(parse_detail) > 500:
+                parse_detail = parse_detail[:500] + " ...\n" + " " * 12 + "-- Text is too long and details are omitted!"
+            logger.info(LogTemplate.console_item.format(
+                seed_detail=LogTemplate.log_info(data["seed"]),
+                parse_detail=parse_detail
+            ))
@@ -6,7 +6,7 @@ from cobweb.pipelines import Pipeline
 from aliyun.log import LogClient, LogItem, PutLogsRequest
 
 
-class LoghubPipeline(Pipeline):
+class Loghub(Pipeline):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -30,8 +30,8 @@ OSS_MIN_UPLOAD_SIZE = 1024
 # Crawler selection
 CRAWLER = "cobweb.crawlers.Crawler"
 
-# Data upload pipeline
-PIPELINE = "cobweb.pipelines.loghub_pipeline.LoghubPipeline"
+# Data storage pipeline
+PIPELINE = "cobweb.pipelines.pipeline_console.Console"
 
 
 # Launcher wait times
@@ -52,12 +52,12 @@ UPLOAD_QUEUE_MAX_SIZE = 100  # upload queue length
 # DONE_MODEL in (0, 1), seed completion mode
 DONE_MODEL = 0  # 0: a successfully consumed seed is removed from the queue outright, a failed one goes to the fail queue; 1: success goes to the done queue, failure to the fail queue
 
-# DOWNLOAD_MODEL in (0, 1), download mode
-DOWNLOAD_MODEL = 0  # 0: generic download; 1: file download
-
 # spider
 SPIDER_THREAD_NUM = 10
 SPIDER_MAX_RETRIES = 5
 
+# Task mode
+TASK_MODEL = 0  # 0: one-shot, 1: resident
+
 # Content-type filter for file downloads
-FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
+# FILE_FILTER_CONTENT_TYPE = ["text/html", "application/xhtml+xml"]
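
TASK_MODEL is what the new _polling logic checks before shutting down: 0 stops the launcher once every queue drains, 1 keeps it resident. A hedged sketch of overriding it per launcher (passing a dict as custom_setting is an assumption; the diff only shows the parameter name):

    from cobweb import LauncherPro

    app = LauncherPro(
        task="demo", project="example",               # illustrative names
        custom_setting={"TASK_MODEL": 1, "SPIDER_THREAD_NUM": 20},  # assumed dict form
    )
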
@@ -38,5 +38,5 @@ def dynamic_load_class(model_info):
         raise TypeError()
 
 
-def download_log_info(item: dict) -> str:
-    return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
+# def download_log_info(item: dict) -> str:
+#     return "\n".join([" " * 12 + f"{str(k).ljust(14)}: {str(v)}" for k, v in item.items()])
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.1.22
+Version: 1.2.0
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP
@@ -13,18 +13,19 @@ cobweb/base/request.py
 cobweb/base/response.py
 cobweb/base/seed.py
 cobweb/crawlers/__init__.py
-cobweb/crawlers/base_crawler.py
-cobweb/crawlers/file_crawler.py
+cobweb/crawlers/crawler.py
 cobweb/db/__init__.py
 cobweb/db/redis_db.py
 cobweb/exceptions/__init__.py
 cobweb/exceptions/oss_db_exception.py
 cobweb/launchers/__init__.py
 cobweb/launchers/launcher.py
+cobweb/launchers/launcher_air.py
 cobweb/launchers/launcher_pro.py
 cobweb/pipelines/__init__.py
-cobweb/pipelines/base_pipeline.py
-cobweb/pipelines/loghub_pipeline.py
+cobweb/pipelines/pipeline.py
+cobweb/pipelines/pipeline_console.py
+cobweb/pipelines/pipeline_loghub.py
 cobweb/utils/__init__.py
 cobweb/utils/oss.py
 cobweb/utils/tools.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
 setup(
     name="cobweb-launcher",
-    version="1.1.22",
+    version="1.2.0",
     packages=find_packages(),
     url="https://github.com/Juannie-PP/cobweb",
     license="MIT",
@@ -1,2 +0,0 @@
-from .launchers import Launcher, LauncherPro
-from .constant import CrawlerModel
@@ -1,2 +0,0 @@
-from .base_crawler import Crawler
-from .file_crawler import FileCrawlerAir
@@ -1,98 +0,0 @@
-import os
-from typing import Union
-from cobweb import setting
-from cobweb.utils import OssUtil
-from cobweb.crawlers import Crawler
-from cobweb.base import Seed, BaseItem, Request, Response
-from cobweb.exceptions import OssDBPutPartError, OssDBMergeError
-
-
-oss_util = OssUtil(is_path_style=bool(int(os.getenv("PRIVATE_LINK", 0))))
-
-
-class FileCrawlerAir(Crawler):
-
-    @staticmethod
-    def download(item: Request) -> Union[Seed, BaseItem, Response, str]:
-        seed_dict = item.seed.to_dict
-        seed_dict["bucket_name"] = oss_util.bucket
-        try:
-            seed_dict["oss_path"] = key = item.seed.oss_path or getattr(item, "oss_path")
-
-            if oss_util.exists(key):
-                seed_dict["data_size"] = oss_util.head(key).content_length
-                yield Response(item.seed, "exists", **seed_dict)
-
-            else:
-                seed_dict.setdefault("end", "")
-                seed_dict.setdefault("start", 0)
-
-                if seed_dict["end"] or seed_dict["start"]:
-                    start, end = seed_dict["start"], seed_dict["end"]
-                    item.request_setting["headers"]['Range'] = f'bytes={start}-{end}'
-
-                if not item.seed.identifier:
-                    content = b""
-                    chunk_size = oss_util.chunk_size
-                    min_upload_size = oss_util.min_upload_size
-                    seed_dict.setdefault("position", 1)
-
-                    response = item.download()
-
-                    content_type = response.headers.get("content-type", "").split(";")[0]
-                    seed_dict["data_size"] = content_length = int(response.headers.get("content-length", 0))
-
-                    if content_type and content_type in setting.FILE_FILTER_CONTENT_TYPE:
-                        """Filter by response content type"""
-                        response.close()
-                        seed_dict["filter"] = True
-                        seed_dict["msg"] = f"response content type is {content_type}"
-                        yield Response(item.seed, response, **seed_dict)
-
-                    elif seed_dict['position'] == 1 and min_upload_size >= content_length > 0:
-                        """Flag files that are too small and return"""
-                        response.close()
-                        seed_dict["filter"] = True
-                        seed_dict["msg"] = "file size is too small"
-                        yield Response(item.seed, response, **seed_dict)
-
-                    elif seed_dict['position'] == 1 and chunk_size > content_length > min_upload_size:
-                        """Download small files directly"""
-                        for part_data in response.iter_content(chunk_size):
-                            content += part_data
-                        response.close()
-                        oss_util.put(key, content)
-                        yield Response(item.seed, response, **seed_dict)
-
-                    else:
-                        """Download medium and large files in synchronous chunks"""
-                        seed_dict.setdefault("upload_id", oss_util.init_part(key).upload_id)
-
-                        for part_data in response.iter_content(chunk_size):
-                            content += part_data
-                            if len(content) >= chunk_size:
-                                upload_data = content[:chunk_size]
-                                content = content[chunk_size:]
-                                oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
-                                seed_dict['start'] += len(upload_data)
-                                seed_dict['position'] += 1
-
-                        response.close()
-
-                        if content:
-                            oss_util.put_part(key, seed_dict["upload_id"], seed_dict['position'], content)
-                        oss_util.merge(key, seed_dict["upload_id"])
-                        seed_dict["data_size"] = oss_util.head(key).content_length
-                        yield Response(item.seed, response, **seed_dict)
-
-                elif item.seed.identifier == "merge":
-                    oss_util.merge(key, seed_dict["upload_id"])
-                    seed_dict["data_size"] = oss_util.head(key).content_length
-                    yield Response(item.seed, "merge", **seed_dict)
-
-        except OssDBPutPartError:
-            yield Seed(seed_dict)
-        except OssDBMergeError:
-            yield Seed(seed_dict, identifier="merge")
@@ -1,2 +0,0 @@
-from .base_pipeline import Pipeline
-from .loghub_pipeline import LoghubPipeline