cobweb-launcher 1.3.3.tar.gz → 1.3.5.tar.gz

This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (81)
  1. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/PKG-INFO +1 -1
  2. cobweb-launcher-1.3.5/cobweb/__init__.py +2 -0
  3. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/launchers/__init__.py +1 -1
  4. cobweb-launcher-1.3.5/cobweb/launchers/launcher_air.py +88 -0
  5. cobweb-launcher-1.3.5/cobweb/launchers/launcher_api.py +88 -0
  6. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/schedulers/__init__.py +2 -0
  7. cobweb-launcher-1.3.5/cobweb/schedulers/scheduler_api.py +69 -0
  8. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/utils/__init__.py +1 -1
  9. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_launcher.egg-info/PKG-INFO +1 -1
  10. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_launcher.egg-info/SOURCES.txt +1 -0
  11. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/setup.py +1 -1
  12. cobweb-launcher-1.3.3/cobweb/launchers/launcher_api.py +0 -161
  13. cobweb-launcher-1.3.3/cobweb_/__init__.py +0 -2
  14. cobweb-launcher-1.3.3/cobweb_/launchers/launcher_air.py +0 -88
  15. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/LICENSE +0 -0
  16. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/README.md +0 -0
  17. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/__init__.py +0 -0
  18. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/basic.py +0 -0
  19. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/common_queue.py +0 -0
  20. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/dotting.py +0 -0
  21. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/item.py +0 -0
  22. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/log.py +0 -0
  23. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/request.py +0 -0
  24. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/response.py +0 -0
  25. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/base/seed.py +0 -0
  26. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/constant.py +0 -0
  27. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/crawlers/__init__.py +0 -0
  28. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/crawlers/crawler.py +0 -0
  29. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/db/__init__.py +0 -0
  30. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/db/api_db.py +0 -0
  31. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/db/redis_db.py +0 -0
  32. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/exceptions/__init__.py +0 -0
  33. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/exceptions/oss_db_exception.py +0 -0
  34. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/launchers/launcher.py +0 -0
  35. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/launchers/launcher_pro.py +0 -0
  36. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/pipelines/__init__.py +0 -0
  37. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/pipelines/pipeline.py +0 -0
  38. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/pipelines/pipeline_console.py +0 -0
  39. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/pipelines/pipeline_loghub.py +0 -0
  40. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/schedulers/scheduler_redis.py +0 -0
  41. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/setting.py +0 -0
  42. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/utils/bloom.py +0 -0
  43. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/utils/oss.py +0 -0
  44. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/utils/tools.py +0 -0
  45. {cobweb-launcher-1.3.3/cobweb → cobweb-launcher-1.3.5/cobweb_}/__init__.py +0 -0
  46. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/base/__init__.py +0 -0
  47. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/base/common_queue.py +0 -0
  48. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/base/decorators.py +0 -0
  49. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/base/item.py +0 -0
  50. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/base/log.py +0 -0
  51. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/base/request.py +0 -0
  52. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/base/response.py +0 -0
  53. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/base/seed.py +0 -0
  54. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/constant.py +0 -0
  55. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/crawlers/__init__.py +0 -0
  56. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/crawlers/crawler.py +0 -0
  57. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/db/__init__.py +0 -0
  58. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/db/api_db.py +0 -0
  59. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/db/redis_db.py +0 -0
  60. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/exceptions/__init__.py +0 -0
  61. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/exceptions/oss_db_exception.py +0 -0
  62. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/launchers/__init__.py +0 -0
  63. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/launchers/launcher.py +0 -0
  64. {cobweb-launcher-1.3.3/cobweb → cobweb-launcher-1.3.5/cobweb_}/launchers/launcher_air.py +0 -0
  65. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/launchers/launcher_api.py +0 -0
  66. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/launchers/launcher_pro.py +0 -0
  67. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/pipelines/__init__.py +0 -0
  68. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/pipelines/pipeline.py +0 -0
  69. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/pipelines/pipeline_console.py +0 -0
  70. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/pipelines/pipeline_loghub.py +0 -0
  71. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/setting.py +0 -0
  72. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/utils/__init__.py +0 -0
  73. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/utils/bloom.py +0 -0
  74. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/utils/dotting.py +0 -0
  75. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/utils/oss.py +0 -0
  76. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_/utils/tools.py +0 -0
  77. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  78. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_launcher.egg-info/requires.txt +0 -0
  79. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_launcher.egg-info/top_level.txt +0 -0
  80. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/setup.cfg +0 -0
  81. {cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/test/test.py +0 -0
{cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cobweb-launcher
- Version: 1.3.3
+ Version: 1.3.5
  Summary: spider_hole
  Home-page: https://github.com/Juannie-PP/cobweb
  Author: Juannie-PP
cobweb-launcher-1.3.5/cobweb/__init__.py
@@ -0,0 +1,2 @@
+ from .launchers import LauncherPro, LauncherApi
+ from .constant import CrawlerModel
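
Note (illustrative, not part of the package diff): with the new top-level __init__.py above, the launchers and the crawler-model constant become importable from the package root in 1.3.5, e.g.:

    # requires cobweb-launcher 1.3.5 installed
    from cobweb import LauncherPro, LauncherApi, CrawlerModel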
{cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/launchers/__init__.py
@@ -1,3 +1,3 @@
  # from .launcher_air import LauncherAir
  from .launcher_pro import LauncherPro
- # from .launcher_api import LauncherApi
+ from .launcher_api import LauncherApi
cobweb-launcher-1.3.5/cobweb/launchers/launcher_air.py
@@ -0,0 +1,88 @@
+ # import time
+ #
+ # from cobweb.base import logger
+ # from cobweb.constant import LogTemplate
+ # from .launcher import Launcher, check_pause
+ #
+ #
+ # class LauncherAir(Launcher):
+ #
+ #     # def _scheduler(self):
+ #     #     if self.start_seeds:
+ #     #         self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
+ #
+ #     @check_pause
+ #     def _insert(self):
+ #         seeds = {}
+ #         status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
+ #         for _ in range(self._new_queue_max_size):
+ #             seed = self.__LAUNCHER_QUEUE__['new'].pop()
+ #             if not seed:
+ #                 break
+ #             seeds[seed.to_string] = seed.params.priority
+ #         if seeds:
+ #             self.__LAUNCHER_QUEUE__['todo'].push(seeds)
+ #         if status:
+ #             time.sleep(self._new_queue_wait_seconds)
+ #
+ #     @check_pause
+ #     def _delete(self):
+ #         seeds = []
+ #         status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
+ #
+ #         for _ in range(self._done_queue_max_size):
+ #             seed = self.__LAUNCHER_QUEUE__['done'].pop()
+ #             if not seed:
+ #                 break
+ #             seeds.append(seed.to_string)
+ #
+ #         if seeds:
+ #             self._remove_doing_seeds(seeds)
+ #
+ #         if status:
+ #             time.sleep(self._done_queue_wait_seconds)
+ #
+ #     def _polling(self):
+ #
+ #         check_emtpy_times = 0
+ #
+ #         while not self._stop.is_set():
+ #
+ #             queue_not_empty_count = 0
+ #             pooling_wait_seconds = 30
+ #
+ #             for q in self.__LAUNCHER_QUEUE__.values():
+ #                 if q.length != 0:
+ #                     queue_not_empty_count += 1
+ #
+ #             if queue_not_empty_count == 0:
+ #                 pooling_wait_seconds = 3
+ #                 if self._pause.is_set():
+ #                     check_emtpy_times = 0
+ #                     if not self._task_model:
+ #                         logger.info("Done! Ready to close thread...")
+ #                         self._stop.set()
+ #                 elif check_emtpy_times > 2:
+ #                     self.__DOING__ = {}
+ #                     self._pause.set()
+ #                 else:
+ #                     logger.info(
+ #                         "check whether the task is complete, "
+ #                         f"reset times {3 - check_emtpy_times}"
+ #                     )
+ #                     check_emtpy_times += 1
+ #             elif self._pause.is_set():
+ #                 self._pause.clear()
+ #                 self._execute()
+ #             else:
+ #                 logger.info(LogTemplate.launcher_air_polling.format(
+ #                     task=self.task,
+ #                     doing_len=len(self.__DOING__.keys()),
+ #                     todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
+ #                     done_len=self.__LAUNCHER_QUEUE__['done'].length,
+ #                     upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
+ #                 ))
+ #
+ #             time.sleep(pooling_wait_seconds)
+ #
+ #
cobweb-launcher-1.3.5/cobweb/launchers/launcher_api.py
@@ -0,0 +1,88 @@
+ import time
+
+ from cobweb.base import TaskQueue, Decorators
+ from cobweb.schedulers import ApiScheduler
+ from .launcher import Launcher
+
+
+ class LauncherApi(Launcher):
+
+     def __init__(self, task, project, custom_setting=None, **kwargs):
+         super().__init__(task, project, custom_setting, **kwargs)
+         self._redis_download = "{%s:%s}:download" % (project, task)
+         self._redis_todo = "{%s:%s}:todo" % (project, task)
+         self._scheduler = ApiScheduler(task, project)
+
+     @Decorators.stop
+     def _schedule(self):
+         thread_sleep = self.scheduling_wait_time
+         for q, key, size, item_info in [
+             (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
+             (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
+         ]:
+             if q.length < size:
+                 for member, priority in self._scheduler.schedule(key, self.scheduling_size):
+                     q.push((member, priority), direct_insertion=True)
+                     self.add_working_item(key.split(":")[-1], member, priority)
+                 thread_sleep = 0.1
+         time.sleep(thread_sleep)
+
+     @Decorators.stop
+     def _heartbeat(self):
+         if self._scheduler.working.is_set():
+             self._scheduler.set_heartbeat()
+         time.sleep(3)
+
+     @Decorators.stop
+     def _reset(self):
+         self._scheduler.reset(
+             keys=[self._redis_todo, self._redis_download],
+             reset_time=self.seed_reset_seconds
+         )
+         time.sleep(30)
+
+     @Decorators.pause
+     def _insert(self):
+         thread_sleep = 0.1
+         for q, key, size in [
+             (TaskQueue.SEED, self._redis_todo, self.seed_queue_size),
+             (TaskQueue.REQUEST, self._redis_download, self.request_queue_size),
+         ]:
+             item_info = {}
+             while (item := q.pop()) and len(item_info.keys()) < self.inserting_size:
+                 item_info[item.seed] = item.params.priority
+             if q.length >= size:
+                 thread_sleep = self.inserting_wait_time
+             self._scheduler.insert(key, item_info)
+         time.sleep(thread_sleep)
+
+     @Decorators.pause
+     def _refresh(self):
+         self._scheduler.refresh(self._redis_todo, self._task_info["todo"])
+         self._scheduler.refresh(self._redis_download, self._task_info["download"])
+         time.sleep(10)
+
+     @Decorators.pause
+     def _remove(self):
+         thread_sleep = self.removing_wait_time
+         for q, key, size in [
+             (TaskQueue.DELETE, self._redis_todo, self.delete_queue_size),
+             (TaskQueue.DONE, self._redis_download, self.done_queue_size),
+         ]:
+             items = []
+             while (item := q.pop()) and len(items) < self.removing_size:
+                 items.append(item)
+             self._scheduler.delete(key, items)
+             self.remove_working_items(key.split(":")[-1], items)
+             if q.length >= size:
+                 thread_sleep = 0.1
+         time.sleep(thread_sleep)
+
+     def _init_schedule_thread(self):
+         self._add_thread(func=self._heartbeat)
+         self._add_thread(func=self._reset)
+         self._add_thread(func=self._refresh)
+         self._add_thread(func=self._schedule)
+         self._add_thread(func=self._insert)
+         self._add_thread(func=self._remove)
+         # self._add_thread(func=self._polling)
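
Note (illustrative, not part of the package diff): the constructor above derives its two Redis keys from the project and task names. The snippet below only reproduces that key scheme with made-up names ("my_project"/"my_task") and does not need a running backend.

    # Key scheme used by LauncherApi in 1.3.5 (names are placeholders)
    project, task = "my_project", "my_task"
    redis_todo = "{%s:%s}:todo" % (project, task)          # "{my_project:my_task}:todo"
    redis_download = "{%s:%s}:download" % (project, task)  # "{my_project:my_task}:download"
    print(redis_todo, redis_download)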
{cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/schedulers/__init__.py
@@ -1 +1,3 @@
  from .scheduler_redis import RedisScheduler
+ from .scheduler_api import ApiScheduler
+
cobweb-launcher-1.3.5/cobweb/schedulers/scheduler_api.py
@@ -0,0 +1,69 @@
+ import threading
+ import time
+
+ # from cobweb.base import Seed
+ from cobweb.db import ApiDB
+
+
+ class ApiScheduler:
+
+     def __init__(self, task, project, scheduler_wait_seconds=30):
+         self._todo_key = "{%s:%s}:todo" % (project, task)
+         self._download_key = "{%s:%s}:download" % (project, task)
+         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
+         self._speed_control_key = "speed_control:%s_%s" % (project, task)
+         self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
+         self._db = ApiDB()
+
+         self.scheduler_wait_seconds = scheduler_wait_seconds
+         self.working = threading.Event()
+
+     @property
+     def heartbeat(self):
+         return self._db.exists(self._heartbeat_key)
+
+     def set_heartbeat(self):
+         return self._db.setex(self._heartbeat_key, 5)
+
+     def schedule(self, key, count):
+         if not self._db.zcount(key, 0, "(1000"):
+             time.sleep(self.scheduler_wait_seconds)
+         else:
+             source = int(time.time())
+             members = self._db.members(key, source, count=count, _min=0, _max="(1000")
+             for member, priority in members:
+                 # seed = Seed(member, priority=priority)
+                 yield member.decode(), priority
+
+     def insert(self, key, items):
+         if items:
+             self._db.zadd(key, items, nx=True)
+
+     def reset(self, keys, reset_time=30):
+         if self._db.lock(self._reset_lock_key, t=120):
+
+             if isinstance(keys, str):
+                 keys = [keys]
+
+             _min = reset_time - int(time.time()) if self.heartbeat else "-inf"
+
+             for key in keys:
+                 self._db.members(key, 0, _min=_min, _max="(0")
+
+             if not self.heartbeat:
+                 self.working.set()
+                 time.sleep(10)
+
+             self._db.delete(self._reset_lock_key)
+
+     def refresh(self, key, items: dict[str, int]):
+         refresh_time = int(time.time())
+         its = {k: -refresh_time - v / 1000 for k, v in items}
+         self._db.zadd(key, item=its, xx=True)
+
+     def delete(self, key, values):
+         self._db.zrem(key, *values)
+
+
+
+
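
Note (illustrative, not part of the package diff): a minimal sketch of the ApiScheduler methods added above, in the order LauncherApi drives them. It assumes cobweb-launcher 1.3.5 is installed and the ApiDB backend it wraps is reachable; the project, task, and seed values are placeholders.

    from cobweb.schedulers import ApiScheduler

    scheduler = ApiScheduler(task="my_task", project="my_project")
    todo_key = "{%s:%s}:todo" % ("my_project", "my_task")

    # insert(): add seed -> priority pairs to the sorted set (nx=True: new members only)
    scheduler.insert(todo_key, {"https://example.com/page1": 100})

    # schedule(): yield up to `count` (member, priority) pairs whose score lies in [0, 1000)
    for member, priority in scheduler.schedule(todo_key, count=50):
        print(member, priority)

    # delete(): remove members that have been fully processed
    scheduler.delete(todo_key, ["https://example.com/page1"])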
{cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb/utils/__init__.py
@@ -1,5 +1,5 @@
  from .oss import OssUtil
  from .tools import *
  from .bloom import BloomFilter
- from .task_queue import TaskQueue
+ # from .task_queue import TaskQueue
 
{cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_launcher.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cobweb-launcher
- Version: 1.3.3
+ Version: 1.3.5
  Summary: spider_hole
  Home-page: https://github.com/Juannie-PP/cobweb
  Author: Juannie-PP
{cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/cobweb_launcher.egg-info/SOURCES.txt
@@ -30,6 +30,7 @@ cobweb/pipelines/pipeline.py
  cobweb/pipelines/pipeline_console.py
  cobweb/pipelines/pipeline_loghub.py
  cobweb/schedulers/__init__.py
+ cobweb/schedulers/scheduler_api.py
  cobweb/schedulers/scheduler_redis.py
  cobweb/utils/__init__.py
  cobweb/utils/bloom.py
{cobweb-launcher-1.3.3 → cobweb-launcher-1.3.5}/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
 
  setup(
      name="cobweb-launcher",
-     version="1.3.3",
+     version="1.3.5",
      packages=find_packages(),
      url="https://github.com/Juannie-PP/cobweb",
      license="MIT",
cobweb-launcher-1.3.3/cobweb/launchers/launcher_api.py
@@ -1,161 +0,0 @@
- import time
- import threading
-
- from cobweb.db import ApiDB
- from cobweb.base import Seed, TaskQueue,logger, stop, pause
- from cobweb.constant import DealModel
- from .launcher import Launcher
-
-
- class LauncherApi(Launcher):
-
-     def __init__(self, task, project, custom_setting=None, **kwargs):
-         super().__init__(task, project, custom_setting, **kwargs)
-         self._db = ApiDB()
-
-         self._todo_key = "{%s:%s}:todo" % (project, task)
-         self._done_key = "{%s:%s}:done" % (project, task)
-         self._fail_key = "{%s:%s}:fail" % (project, task)
-         self._heartbeat_key = "heartbeat:%s_%s" % (project, task)
-
-         self._statistics_done_key = "statistics:%s:%s:done" % (project, task)
-         self._statistics_fail_key = "statistics:%s:%s:fail" % (project, task)
-         self._speed_control_key = "speed_control:%s_%s" % (project, task)
-
-         self._reset_lock_key = "lock:reset:%s_%s" % (project, task)
-
-         self._heartbeat_start_event = threading.Event()
-
-     @property
-     def heartbeat(self):
-         return self._db.exists(self._heartbeat_key)
-
-     def statistics(self, key, count):
-         if not self.task_model and not self._db.exists(key):
-             self._db.setex(key, 86400 * 30, int(count))
-         else:
-             self._db.incrby(key, count)
-
-     def _get_seed(self) -> Seed:
-         """
-         Fetch a seed from the queue (rate-controlled).
-         Uses a time window of self._time_window seconds and checks whether the crawl volume in that window stays below the threshold (self._spider_max_speed).
-         :return: True -> seed, False -> None
-         """
-         if TaskQueue.TODO.length and not self._db.auto_incr(
-             self._speed_control_key,
-             t=self.time_window,
-             limit=self.spider_max_count
-         ):
-             expire_time = self._db.ttl(self._speed_control_key)
-             logger.info(f"Too fast! Please wait {expire_time} seconds...")
-             time.sleep(expire_time / 2)
-             return None
-         return TaskQueue.TODO.pop()
-
-     @stop
-     def _reset(self):
-         """
-         Check for expired seeds and put them back into the redis cache.
-         """
-         if self._db.lock(self._reset_lock_key, t=120):
-
-             _min = -int(time.time()) + self.seed_reset_seconds \
-                 if self.heartbeat else "-inf"
-
-             self._db.members(self._todo_key, 0, _min=_min, _max="(0")
-
-             if not self.heartbeat:
-                 self._heartbeat_start_event.set()
-
-             self._db.delete(self._reset_lock_key)
-
-         time.sleep(30)
-
-     @stop
-     def _refresh(self):
-         """
-         Refresh the expiry time of in-progress (doing) seeds so reset does not re-consume them.
-         """
-         if self.doing_seeds:
-             refresh_time = int(time.time())
-             seeds = {k: -refresh_time - v / 1e3 for k, v in self.doing_seeds.items()}
-             self._db.zadd(self._todo_key, item=seeds, xx=True)
-         time.sleep(3)
-
-     @stop
-     def _scheduler(self):
-         """
-         Schedule tasks: fetch seeds from the redis queue and add them to the doing dict.
-         """
-         if not self._db.zcount(self._todo_key, 0, "(1000"):
-             time.sleep(self.scheduler_wait_seconds)
-         elif TaskQueue.TODO.length >= self.todo_queue_size:
-             time.sleep(self.todo_queue_full_wait_seconds)
-         else:
-             members = self._db.members(
-                 self._todo_key, int(time.time()),
-                 count=self.todo_queue_size,
-                 _min=0, _max="(1000"
-             )
-             for member, priority in members:
-                 seed = Seed(member, priority=priority)
-                 TaskQueue.TODO.push(seed)
-                 self.doing_seeds[seed.to_string] = seed.params.priority
-
-     @pause
-     def _heartbeat(self):
-         if self._heartbeat_start_event.is_set():
-             self._db.setex(self._heartbeat_key, t=5)
-         time.sleep(3)
-
-     @pause
-     def _insert(self):
-         """
-         Add new seeds to the redis queue.
-         """
-         seeds = {}
-         for _ in range(self.new_queue_max_size):
-             if seed := TaskQueue.SEED.pop():
-                 seeds[seed.to_string] = seed.params.priority
-         if seeds:
-             self._db.zadd(self._todo_key, seeds, nx=True)
-         if TaskQueue.SEED.length < self.new_queue_max_size:
-             time.sleep(self.new_queue_wait_seconds)
-
-     @pause
-     def _delete(self):
-         """
-         Remove finished seeds, add them to the succeed or fail queue by status, and drop their doing-dict entries.
-         """
-         seed_info = {"count": 0, "failed": [], "succeed": [], "common": []}
-         status = TaskQueue.DONE.length < self.done_queue_max_size
-
-         for _ in range(self.done_queue_max_size):
-             seed = TaskQueue.DONE.pop()
-             if not seed:
-                 break
-             if seed.params.seed_status == DealModel.fail:
-                 seed_info["failed"].append(seed.to_string)
-             elif self.done_model == 1:
-                 seed_info["succeed"].append(seed.to_string)
-             else:
-                 seed_info["common"].append(seed.to_string)
-             seed_info['count'] += 1
-
-         if seed_info["count"]:
-
-             succeed_count = int(self._db.zrem(self._todo_key, *seed_info["common"]) or 0)
-             succeed_count += int(self._db.done([self._todo_key, self._done_key], *seed_info["succeed"]) or 0)
-             failed_count = int(self._db.done([self._todo_key, self._fail_key], *seed_info["failed"]) or 0)
-
-             if failed_count:
-                 self.statistics(self._statistics_fail_key, failed_count)
-             if succeed_count:
-                 self.statistics(self._statistics_done_key, succeed_count)
-
-             self._remove_doing_seeds(seed_info["common"] + seed_info["succeed"] + seed_info["failed"])
-
-         if status:
-             time.sleep(self.done_queue_wait_seconds)
-
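
Note (illustrative, not part of the package diff): both the old _refresh above and the new ApiScheduler.refresh use the same sorted-set score convention, as far as can be inferred from this diff. The standalone snippet below only reproduces that arithmetic; the priority value is a placeholder.

    # Pending seeds carry their priority as the score, and schedule() only fetches
    # scores in [0, 1000); in-progress seeds are re-scored to -(timestamp) - priority/1000,
    # so they stay out of the schedulable range until _reset picks up scores below 0.
    import time
    priority = 300
    pending_score = priority                              # schedulable: 0 <= score < 1000
    doing_score = -int(time.time()) - priority / 1000     # negative: skipped by schedule()
    print(pending_score, doing_score)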
cobweb-launcher-1.3.3/cobweb_/__init__.py
@@ -1,2 +0,0 @@
- from .launchers import LauncherAir, LauncherPro, LauncherApi
- from .constant import CrawlerModel
cobweb-launcher-1.3.3/cobweb_/launchers/launcher_air.py
@@ -1,88 +0,0 @@
- import time
-
- from cobweb.base import logger
- from cobweb.constant import LogTemplate
- from .launcher import Launcher, check_pause
-
-
- class LauncherAir(Launcher):
-
-     # def _scheduler(self):
-     #     if self.start_seeds:
-     #         self.__LAUNCHER_QUEUE__['todo'].push(self.start_seeds)
-
-     @check_pause
-     def _insert(self):
-         seeds = {}
-         status = self.__LAUNCHER_QUEUE__['new'].length < self._new_queue_max_size
-         for _ in range(self._new_queue_max_size):
-             seed = self.__LAUNCHER_QUEUE__['new'].pop()
-             if not seed:
-                 break
-             seeds[seed.to_string] = seed.params.priority
-         if seeds:
-             self.__LAUNCHER_QUEUE__['todo'].push(seeds)
-         if status:
-             time.sleep(self._new_queue_wait_seconds)
-
-     @check_pause
-     def _delete(self):
-         seeds = []
-         status = self.__LAUNCHER_QUEUE__['done'].length < self._done_queue_max_size
-
-         for _ in range(self._done_queue_max_size):
-             seed = self.__LAUNCHER_QUEUE__['done'].pop()
-             if not seed:
-                 break
-             seeds.append(seed.to_string)
-
-         if seeds:
-             self._remove_doing_seeds(seeds)
-
-         if status:
-             time.sleep(self._done_queue_wait_seconds)
-
-     def _polling(self):
-
-         check_emtpy_times = 0
-
-         while not self._stop.is_set():
-
-             queue_not_empty_count = 0
-             pooling_wait_seconds = 30
-
-             for q in self.__LAUNCHER_QUEUE__.values():
-                 if q.length != 0:
-                     queue_not_empty_count += 1
-
-             if queue_not_empty_count == 0:
-                 pooling_wait_seconds = 3
-                 if self._pause.is_set():
-                     check_emtpy_times = 0
-                     if not self._task_model:
-                         logger.info("Done! Ready to close thread...")
-                         self._stop.set()
-                 elif check_emtpy_times > 2:
-                     self.__DOING__ = {}
-                     self._pause.set()
-                 else:
-                     logger.info(
-                         "check whether the task is complete, "
-                         f"reset times {3 - check_emtpy_times}"
-                     )
-                     check_emtpy_times += 1
-             elif self._pause.is_set():
-                 self._pause.clear()
-                 self._execute()
-             else:
-                 logger.info(LogTemplate.launcher_air_polling.format(
-                     task=self.task,
-                     doing_len=len(self.__DOING__.keys()),
-                     todo_len=self.__LAUNCHER_QUEUE__['todo'].length,
-                     done_len=self.__LAUNCHER_QUEUE__['done'].length,
-                     upload_len=self.__LAUNCHER_QUEUE__['upload'].length,
-                 ))
-
-             time.sleep(pooling_wait_seconds)
-
-
The remaining files are unchanged between the two versions.