cobweb-launcher 0.1.16__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic (further details were linked from the original page).

Files changed (47)
  1. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/PKG-INFO +1 -1
  2. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/__init__.py +1 -0
  3. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/constant.py +3 -1
  4. cobweb-launcher-0.1.18/cobweb/db/storer/redis.py +15 -0
  5. cobweb-launcher-0.1.18/cobweb/distributed/launcher.py +243 -0
  6. cobweb-launcher-0.1.18/cobweb/distributed/models.py +143 -0
  7. cobweb-launcher-0.1.18/cobweb/equip/single/__init__.py +0 -0
  8. cobweb-launcher-0.1.18/cobweb/setting.py +13 -0
  9. cobweb-launcher-0.1.18/cobweb/single/__init__.py +0 -0
  10. cobweb-launcher-0.1.18/cobweb/single/launcher.py +231 -0
  11. cobweb-launcher-0.1.18/cobweb/single/models.py +134 -0
  12. cobweb-launcher-0.1.18/cobweb/single/nest.py +153 -0
  13. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/task.py +3 -1
  14. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb_launcher.egg-info/PKG-INFO +1 -1
  15. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb_launcher.egg-info/SOURCES.txt +9 -0
  16. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/setup.py +1 -1
  17. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/LICENSE +0 -0
  18. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/README.md +0 -0
  19. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/bbb.py +0 -0
  20. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/__init__.py +0 -0
  21. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/oss_db.py +0 -0
  22. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/redis_db.py +0 -0
  23. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/scheduler/__init__.py +0 -0
  24. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/scheduler/default.py +0 -0
  25. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/scheduler/textfile.py +0 -0
  26. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/storer/__init__.py +0 -0
  27. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/storer/console.py +0 -0
  28. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/storer/loghub.py +0 -0
  29. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/db/storer/textfile.py +0 -0
  30. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/decorators.py +0 -0
  31. {cobweb-launcher-0.1.16/cobweb/equip/dev → cobweb-launcher-0.1.18/cobweb/distributed}/__init__.py +0 -0
  32. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/equip/__init__.py +0 -0
  33. {cobweb-launcher-0.1.16/cobweb/equip/distributed → cobweb-launcher-0.1.18/cobweb/equip/dev}/__init__.py +0 -0
  34. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/equip/dev/launcher.py +0 -0
  35. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/equip/dev/models.py +0 -0
  36. {cobweb-launcher-0.1.16/cobweb/equip/single → cobweb-launcher-0.1.18/cobweb/equip/distributed}/__init__.py +0 -0
  37. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/equip/distributed/launcher.py +0 -0
  38. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/equip/distributed/models.py +0 -0
  39. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/equip/single/launcher.py +0 -0
  40. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/equip/single/models.py +0 -0
  41. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/interface.py +0 -0
  42. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/log.py +0 -0
  43. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb/utils.py +0 -0
  44. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  45. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb_launcher.egg-info/requires.txt +0 -0
  46. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/cobweb_launcher.egg-info/top_level.txt +0 -0
  47. {cobweb-launcher-0.1.16 → cobweb-launcher-0.1.18}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.16
3
+ Version: 0.1.18
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -3,6 +3,7 @@ from .task import Task
3
3
  from .log import log
4
4
  from .db.redis_db import RedisDB
5
5
  from .db.oss_db import OssDB
6
+ from .constant import Setting
6
7
 
7
8
  from .equip.distributed.launcher import launcher
8
9
  from .equip.single.launcher import launcher as single_launcher
@@ -22,4 +22,6 @@ class Setting:
22
22
  CHECK_LOCK_TIME = None
23
23
  DEAL_MODEL = None
24
24
  LAUNCHER_MODEL = None
25
- SPIDER_RUN_TIME = None
25
+ SCHEDULER_WAIT_TIME = None
26
+ SCHEDULER_BLOCK_TIME = None
27
+ SPIDER_WAIT_TIME = None
@@ -0,0 +1,15 @@
1
+ from cobweb import log, StorerInterface
2
+
3
+
4
+ class Redis(StorerInterface):
5
+
6
+ def store(self, data_list):
7
+ try:
8
+ data_str = "\n".join(str(data) for data in data_list)
9
+ with open(self.table, "a") as fp:
10
+ fp.write(data_str)
11
+ log.info(f"save data, data length: {len(data_list)}")
12
+ return True
13
+ except Exception as e:
14
+ return False
15
+
@@ -0,0 +1,243 @@
1
+ import time
2
+ import threading
3
+ from threading import Thread
4
+
5
+ from .models import Scheduler, Spider, Storer
6
+ from cobweb import log, Queue, DBItem, RedisDB
7
+ from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
8
+ from cobweb.utils import (
9
+ struct_queue_name as sqn,
10
+ restore_table_name as rtn,
11
+ parse_import_model as pim,
12
+ )
13
+
14
+
15
+ def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
16
+ log.info("run check thread after 30 seconds...")
17
+ time.sleep(30)
18
+ spider_info = """
19
+ ------------------- check: {0} ------------------
20
+ running_spider_thread_num: {1}
21
+ redis_ready_seed_length: {2}
22
+ redis_spider_seed_length: {3}
23
+ memory_seed_queue_length: {4}
24
+ storer_upload_queue_length_info:
25
+ {5}
26
+ ----------------------- end -----------------------"""
27
+ while True:
28
+ status = "running"
29
+ running_spider_thread_num = spider.spider_in_progress.length
30
+ redis_ready_seed_length = ready_seed_length()
31
+ redis_spider_seed_length = spider_queue_length()
32
+ memory_seed_queue_length = scheduler.queue.length
33
+ storer_upload_queue_list = []
34
+ for storer in storer_list:
35
+ storer_upload_queue_list.append(
36
+ f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"
37
+ )
38
+ if (
39
+ scheduler.stop and
40
+ # not redis_ready_seed_length and
41
+ not memory_seed_queue_length and
42
+ not running_spider_thread_num
43
+ ):
44
+ if not MODEL:
45
+ log.info("spider is done?")
46
+ last.set()
47
+ time.sleep(3)
48
+ storer_queue_empty = True
49
+ storer_upload_queue_list = []
50
+ for storer in storer_list:
51
+ if storer.queue.length:
52
+ storer_queue_empty = False
53
+ storer_upload_queue_list.append(
54
+ f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"
55
+ )
56
+ if (
57
+ storer_queue_empty and
58
+ not redis_ready_seed_length and
59
+ not redis_spider_seed_length
60
+ ):
61
+ if MODEL:
62
+ log.info("waiting for push seeds...")
63
+ status = "waiting"
64
+ time.sleep(30)
65
+ else:
66
+ log.info("spider done!")
67
+ break
68
+
69
+ last.clear()
70
+
71
+ storer_upload_queue_length_info = "\n ".join(
72
+ storer_upload_queue_list) if storer_upload_queue_list else "None"
73
+ log.info(spider_info.format(
74
+ status,
75
+ running_spider_thread_num,
76
+ redis_ready_seed_length,
77
+ redis_spider_seed_length,
78
+ memory_seed_queue_length,
79
+ storer_upload_queue_length_info
80
+ ))
81
+
82
+ time.sleep(3)
83
+ stop.set()
84
+
85
+
86
+ def launcher(task):
87
+ """
88
+ 任务启动装饰器
89
+ :param task: 任务配置信息
90
+ """
91
+ def decorator(func):
92
+ """
93
+ Item:
94
+ Textfile()
95
+ Loghub()
96
+ Console()
97
+ e.g.
98
+ task.fields = "a,b"
99
+ func(item, seed)
100
+ a = "a"
101
+ b = "b"
102
+ data = {"a": "a", "b": "b"}
103
+ yield item.Loghub(**data)
104
+ yield item.Loghub(a=a, b=b)
105
+ """
106
+ storer_list = []
107
+
108
+ # 程序结束事件
109
+ last = threading.Event()
110
+ # 停止采集事件
111
+ stop = threading.Event()
112
+
113
+ # 初始化redis信息
114
+ redis_db = RedisDB(
115
+ task.project, task.task_name, task.redis_info,
116
+ model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
117
+ )
118
+
119
+ log.info("初始化cobweb!")
120
+
121
+ seed_queue = Queue()
122
+
123
+ if task.scheduler_info is None:
124
+ task.scheduler_info = dict()
125
+
126
+ # 调度器动态继承
127
+ sql = task.scheduler_info.get("sql")
128
+ table = task.scheduler_info.get("table")
129
+ size = task.scheduler_info.get("size")
130
+ scheduler_config = task.scheduler_info.get("config")
131
+ scheduler_db = task.scheduler_info.get("db", "default")
132
+ DB, class_name = pim(scheduler_db, "scheduler")
133
+ # SchedulerDB, table, sql, length, size, config = task.scheduler_info
134
+ SchedulerTmp = type(class_name, (Scheduler, DB), {})
135
+
136
+ # 初始化调度器
137
+ scheduler = SchedulerTmp(
138
+ table=table, sql=sql, size=size, queue=seed_queue,
139
+ length=task.scheduler_queue_length, config=scheduler_config
140
+ )
141
+
142
+ # 初始化采集器
143
+ spider = Spider(seed_queue, task.max_retries)
144
+
145
+ # 解析存储器信息
146
+ storer_info_list = task.storer_info or []
147
+ if not isinstance(storer_info_list, list):
148
+ storer_info_list = [storer_info_list]
149
+
150
+ # new item
151
+ item = type("Item", (object,), {"redis_client": redis_db.client})()
152
+
153
+ for storer_info in storer_info_list:
154
+ storer_db = storer_info["db"]
155
+ fields = storer_info["fields"]
156
+ storer_table = storer_info.get("table", "console")
157
+ storer_config = storer_info.get("config")
158
+
159
+ StorerDB, class_name = pim(storer_db, "storer")
160
+ StorerTmp = type(class_name, (Storer, StorerDB), {})
161
+
162
+ db_name = class_name.lower()
163
+ if not getattr(item, db_name, None):
164
+ instance = type(db_name, (DBItem,), {})
165
+ setattr(item, db_name, instance)
166
+
167
+ storer_item_instance = getattr(item, db_name)
168
+ storer_item_instance.init_item(storer_table, fields)
169
+
170
+ storer_queue = sqn(db_name, storer_table)
171
+ queue = getattr(storer_item_instance, storer_queue)
172
+ # 初始话存储器
173
+ table_name = rtn(table_name=storer_table)
174
+ storer = StorerTmp(
175
+ table=table_name, fields=fields,
176
+ length=task.storer_queue_length,
177
+ queue=queue, config=storer_config
178
+ )
179
+ storer_list.append(storer)
180
+
181
+ Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
182
+ Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
183
+
184
+ # 推送初始种子
185
+ # seeds = start_seeds(task.start_seed)
186
+ redis_db.add_seed(task.seeds)
187
+ # 启动调度器, 调度至redis队列
188
+ Thread(
189
+ # name="xxxx_schedule_seeds",
190
+ target=scheduler.schedule_seed,
191
+ args=(
192
+ redis_db.ready_seed_length,
193
+ redis_db.get_scheduler_lock,
194
+ redis_db.add_seed
195
+ )
196
+ ).start()
197
+
198
+ # 启动调度器, 调度任务队列
199
+ Thread(
200
+ # name="xxxx_schedule_task",
201
+ target=scheduler.schedule_task,
202
+ args=(
203
+ stop, redis_db.get_seed,
204
+ redis_db.ready_seed_length
205
+ )
206
+ ).start()
207
+
208
+ # 启动采集器
209
+ for index in range(task.spider_num):
210
+ Thread(
211
+ # name=f"xxxx_spider_task:{index}",
212
+ target=spider.spider_task,
213
+ args=(
214
+ stop, func, item,
215
+ redis_db.del_seed
216
+ )
217
+ ).start()
218
+
219
+ # 启动存储器
220
+ for storer in storer_list:
221
+ Thread(
222
+ # name=f"xxxx_store_task:{storer.table}",
223
+ target=storer.store_task,
224
+ args=(
225
+ stop, last,
226
+ redis_db.reset_seed,
227
+ redis_db.set_storer
228
+ )
229
+ ).start()
230
+
231
+ Thread(
232
+ # name="check_spider",
233
+ target=check,
234
+ args=(
235
+ stop, last, spider,
236
+ scheduler, storer_list,
237
+ redis_db.ready_seed_length,
238
+ redis_db.spider_queue_length,
239
+ )
240
+ ).start()
241
+
242
+ return decorator
243
+
@@ -0,0 +1,143 @@
1
+ import time
2
+ from hashlib import md5
3
+ from cobweb import log, Queue, Seed
4
+ from cobweb.utils import issubclass_cobweb_inf
5
+
6
+ # from pympler import asizeof
7
+
8
+
9
+ class Scheduler:
10
+
11
+ def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
12
+
13
+ inf_name = "SchedulerInterface"
14
+ if not issubclass_cobweb_inf(self.__class__, inf_name):
15
+ raise Exception("not have schedule function!")
16
+
17
+ if self.__class__.__name__ == "Default":
18
+ self.stop = True
19
+ return None
20
+
21
+ while not self.stop:
22
+ length = ready_seed_length()
23
+ if length > self.size:
24
+ time.sleep(15)
25
+
26
+ elif get_scheduler_lock():
27
+ seeds = self.schedule()
28
+ add_seed(seeds)
29
+
30
+ log.info(f"close thread: schedule_seed")
31
+
32
+ def schedule_task(self, stop, get_seed, ready_seed_length):
33
+ time.sleep(3)
34
+ while not stop.is_set():
35
+
36
+ if not ready_seed_length():
37
+ time.sleep(5)
38
+ continue
39
+
40
+ if self.queue.length >= self.length:
41
+ time.sleep(3)
42
+ continue
43
+
44
+ seeds = get_seed(self.length)
45
+ self.queue.push(seeds)
46
+ log.info(f"close thread: schedule_task")
47
+
48
+
49
+ class Spider:
50
+
51
+ def __init__(self, queue, max_retries=5):
52
+ self.spider_in_progress = Queue()
53
+ self.max_retries = max_retries
54
+ self.queue = queue
55
+
56
+ def spider_task(self, stop, func, item, del_seed):
57
+ while not stop.is_set():
58
+ seed = self.queue.pop()
59
+ if not seed:
60
+ time.sleep(3)
61
+ continue
62
+ elif seed._retry >= self.max_retries:
63
+ del_seed(seed, spider_status=False)
64
+ continue
65
+ try:
66
+ self.spider_in_progress.push(1, direct_insertion=True)
67
+ # log.info("spider seed: " + str(seed))
68
+ ret_count = 0
69
+ status = None
70
+ store_queue = None
71
+ store_data = list()
72
+ for it in func(item, seed):
73
+ ret_count += 1
74
+ if getattr(it, "table_name", None):
75
+ if not store_queue:
76
+ store_queue = it.queue()
77
+ store_data.append(it.struct_data)
78
+ elif isinstance(it, Seed):
79
+ self.queue.push(it)
80
+ elif any(isinstance(it, t) for t in (list, tuple)):
81
+ self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
82
+ elif isinstance(it, bool):
83
+ status = it
84
+
85
+ if store_queue and store_data:
86
+ store_data.append(seed)
87
+ store_queue.push(store_data)
88
+
89
+ if status:
90
+ del_seed(seed, spider_status=True)
91
+ elif not ret_count or status is False:
92
+ seed._retry += 1
93
+ self.queue.push(seed)
94
+
95
+ except Exception as e:
96
+ seed._retry += 1
97
+ self.queue.push(seed)
98
+ log.info(f"{str(seed)} -> {str(e)}")
99
+ finally:
100
+ self.spider_in_progress.pop()
101
+ log.info(f"close thread: spider")
102
+
103
+
104
+ class Storer:
105
+
106
+ def store_task(self, stop, last, reset_seed, set_storer):
107
+
108
+ inf_name = "StorerInterface"
109
+ if not issubclass_cobweb_inf(self.__class__, inf_name):
110
+ return None
111
+
112
+ if not getattr(self, "store", None):
113
+ raise Exception("not have store function!")
114
+
115
+ storer_name = self.__class__.__name__ + self.table
116
+ store_key_id = md5(storer_name.encode()).hexdigest()
117
+
118
+ while not stop.is_set():
119
+
120
+ if last.is_set() or self.queue.length >= self.length:
121
+ seeds, data_list = [], []
122
+
123
+ while True:
124
+ data = self.queue.pop()
125
+ if not data:
126
+ break
127
+ if isinstance(data, Seed):
128
+ seeds.append(data)
129
+ if len(data_list) >= self.length:
130
+ break
131
+ continue
132
+ data_list.append(data)
133
+
134
+ if data_list:
135
+ if self.store(data_list):
136
+ set_storer(store_key_id, seeds)
137
+ else:
138
+ reset_seed(seeds)
139
+ continue
140
+
141
+ time.sleep(3)
142
+
143
+ log.info(f"close thread: {storer_name}")
File without changes
@@ -0,0 +1,13 @@
1
+ import os
2
+
3
+
4
+ # model: 0, 1, 2
5
+ MODEL = int(os.getenv("MODEL", "0"))
6
+
7
+ # 重制score值的等待时间, 默认10分钟
8
+ RESET_SCORE = int(os.getenv("RESET_SCORE", "600"))
9
+
10
+ # 默认设置检查spider queue队列锁的存活时间为30s
11
+ CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
12
+
13
+
File without changes
@@ -0,0 +1,231 @@
1
+ import time
2
+ import threading
3
+ from threading import Thread
4
+
5
+ from .models import Scheduler, Spider, Storer
6
+ from cobweb import log, Queue, DBItem, RedisDB
7
+ from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
8
+ from cobweb.utils import (
9
+ struct_queue_name as sqn,
10
+ restore_table_name as rtn,
11
+ parse_import_model as pim,
12
+ )
13
+
14
+
15
+ def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
16
+ log.info("run check thread after 30 seconds...")
17
+ time.sleep(30)
18
+ spider_info = """
19
+ ------------------- check: {0} ------------------
20
+ redis_spider_seed_length: {1}
21
+ redis_ready_seed_length: {2}
22
+ running_spider_thread_num: {3}
23
+ memory_seed_queue_length: {4}
24
+ storer_queue_length_info: {5}
25
+ ----------------------- end -----------------------"""
26
+ while True:
27
+ status = "running"
28
+ running_spider_thread_num = spider.spider_in_progress.length
29
+ redis_ready_seed_length = ready_seed_length()
30
+ redis_spider_seed_length = spider_queue_length()
31
+ memory_seed_queue_length = scheduler.queue.length
32
+ storer_upload_queue_length = storer.queue.length
33
+ if (
34
+ scheduler.stop and
35
+ # not redis_ready_seed_length and
36
+ not memory_seed_queue_length and
37
+ not running_spider_thread_num
38
+ ):
39
+ if not MODEL:
40
+ log.info("spider is done?")
41
+ last.set()
42
+ time.sleep(3)
43
+ storer_queue_empty = True
44
+ if storer.queue.length:
45
+ storer_queue_empty = False
46
+ storer_upload_queue_length = storer.queue.length
47
+ if (
48
+ storer_queue_empty and
49
+ not redis_ready_seed_length and
50
+ not redis_spider_seed_length
51
+ ):
52
+ if MODEL:
53
+ log.info("waiting for push seeds...")
54
+ status = "waiting"
55
+ time.sleep(30)
56
+ else:
57
+ log.info("spider done!")
58
+ break
59
+
60
+ last.clear()
61
+
62
+ log.info(spider_info.format(
63
+ status,
64
+ redis_spider_seed_length,
65
+ redis_ready_seed_length,
66
+ running_spider_thread_num,
67
+ memory_seed_queue_length,
68
+ storer_upload_queue_length
69
+ ))
70
+
71
+ time.sleep(3)
72
+ stop.set()
73
+
74
+
75
+ def launcher(task):
76
+ """
77
+ 任务启动装饰器
78
+ :param task: 任务配置信息
79
+ """
80
+ def decorator(func):
81
+ """
82
+ Item:
83
+ Textfile()
84
+ Loghub()
85
+ Console()
86
+ e.g.
87
+ task.fields = "a,b"
88
+ func(item, seed)
89
+ a = "a"
90
+ b = "b"
91
+ data = {"a": "a", "b": "b"}
92
+ yield item.Loghub(**data)
93
+ yield item.Loghub(a=a, b=b)
94
+ """
95
+ storer_list = []
96
+
97
+ # 程序结束事件
98
+ last = threading.Event()
99
+ # 停止采集事件
100
+ stop = threading.Event()
101
+
102
+ # 初始化redis信息
103
+ redis_db = RedisDB(
104
+ task.project, task.task_name, task.redis_info,
105
+ model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
106
+ )
107
+
108
+ # new item
109
+ item = type("Item", (object,), {"redis_client": redis_db.client})()
110
+
111
+ log.info("初始化cobweb!")
112
+
113
+ seed_queue = Queue()
114
+
115
+ scheduler_info = task.scheduler_info or dict()
116
+
117
+ # 调度器动态继承
118
+ sql = scheduler_info.get("sql")
119
+ table = scheduler_info.get("table")
120
+ size = scheduler_info.get("size")
121
+ scheduler_config = scheduler_info.get("config")
122
+ scheduler_db = scheduler_info.get("db", "default")
123
+ DB, class_name = pim(scheduler_db, "scheduler")
124
+ # SchedulerDB, table, sql, length, size, config = task.scheduler_info
125
+ SchedulerTmp = type(class_name, (Scheduler, DB), {})
126
+
127
+ # 初始化调度器
128
+ scheduler = SchedulerTmp(
129
+ table=table, sql=sql, size=size, queue=seed_queue,
130
+ length=task.scheduler_queue_length, config=scheduler_config
131
+ )
132
+
133
+ # 初始化采集器
134
+ spider = Spider(seed_queue, task.max_retries)
135
+
136
+ storer = None
137
+
138
+ # 解析存储器信息
139
+ storer_info = task.storer_info or dict()
140
+
141
+ # for storer_info in storer_info_list:
142
+ if storer_info:
143
+ storer_db = storer_info["db"]
144
+ fields = storer_info["fields"]
145
+ storer_table = storer_info.get("table", "console")
146
+ storer_config = storer_info.get("config")
147
+
148
+ StorerDB, class_name = pim(storer_db, "storer")
149
+ StorerTmp = type(class_name, (Storer, StorerDB), {})
150
+
151
+ db_name = class_name.lower()
152
+ if not getattr(item, db_name, None):
153
+ instance = type(db_name, (DBItem,), {})
154
+ setattr(item, db_name, instance)
155
+
156
+ storer_item_instance = getattr(item, db_name)
157
+ storer_item_instance.init_item(storer_table, fields)
158
+
159
+ storer_queue = sqn(db_name, storer_table)
160
+ queue = getattr(storer_item_instance, storer_queue)
161
+ # 初始话存储器
162
+ table_name = rtn(table_name=storer_table)
163
+ storer = StorerTmp(
164
+ table=table_name, fields=fields,
165
+ length=task.storer_queue_length,
166
+ queue=queue, config=storer_config
167
+ )
168
+
169
+ Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
170
+ Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
171
+
172
+ # 推送初始种子
173
+ # seeds = start_seeds(task.start_seed)
174
+ redis_db.add_seed(task.seeds)
175
+ # 启动调度器, 调度至redis队列
176
+ Thread(
177
+ # name="xxxx_schedule_seeds",
178
+ target=scheduler.schedule_seed,
179
+ args=(
180
+ redis_db.ready_seed_length,
181
+ redis_db.get_scheduler_lock,
182
+ redis_db.add_seed
183
+ )
184
+ ).start()
185
+
186
+ # 启动调度器, 调度任务队列
187
+ Thread(
188
+ # name="xxxx_schedule_task",
189
+ target=scheduler.schedule_task,
190
+ args=(
191
+ stop, redis_db.get_seed,
192
+ redis_db.ready_seed_length
193
+ )
194
+ ).start()
195
+
196
+ # 启动采集器
197
+ for index in range(task.spider_num):
198
+ Thread(
199
+ # name=f"xxxx_spider_task:{index}",
200
+ target=spider.spider_task,
201
+ args=(
202
+ stop, func, item,
203
+ redis_db.del_seed
204
+ )
205
+ ).start()
206
+
207
+ # 启动存储器
208
+ if storer:
209
+ Thread(
210
+ # name=f"xxxx_store_task:{storer.table}",
211
+ target=storer.store_task,
212
+ args=(
213
+ stop, last,
214
+ redis_db.reset_seed,
215
+ redis_db.del_seed
216
+ )
217
+ ).start()
218
+
219
+ Thread(
220
+ # name="check_spider",
221
+ target=check,
222
+ args=(
223
+ stop, last, spider,
224
+ scheduler, storer,
225
+ redis_db.ready_seed_length,
226
+ redis_db.spider_queue_length,
227
+ )
228
+ ).start()
229
+
230
+ return decorator
231
+
@@ -0,0 +1,134 @@
1
+ import time
2
+ from cobweb import log, Queue, Seed, Setting
3
+ from cobweb.utils import issubclass_cobweb_inf
4
+ # from pympler import asizeof
5
+
6
+
7
+ class Scheduler:
8
+
9
+ def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
10
+
11
+ inf_name = "SchedulerInterface"
12
+ if not issubclass_cobweb_inf(self.__class__, inf_name):
13
+ raise Exception("not have schedule function!")
14
+
15
+ if self.__class__.__name__ == "Default":
16
+ self.stop = True
17
+ return None
18
+
19
+ while not self.stop:
20
+ length = ready_seed_length()
21
+ if length > self.size:
22
+ time.sleep(15)
23
+
24
+ elif get_scheduler_lock():
25
+ seeds = self.schedule()
26
+ add_seed(seeds)
27
+
28
+ log.info(f"close thread: schedule_seed")
29
+
30
+ def schedule_task(self, stop, get_seed, ready_seed_length):
31
+ time.sleep(3)
32
+ while not stop.is_set():
33
+
34
+ if not ready_seed_length():
35
+ time.sleep(Setting.SCHEDULER_WAIT_TIME)
36
+ continue
37
+
38
+ if self.queue.length >= self.length:
39
+ time.sleep(Setting.SCHEDULER_BLOCK_TIME)
40
+ continue
41
+
42
+ seeds = get_seed(self.length)
43
+ self.queue.push(seeds)
44
+ log.info(f"close thread: schedule_task")
45
+
46
+
47
+ class Spider:
48
+
49
+ def __init__(self, queue, max_retries=5):
50
+ self.spider_in_progress = Queue()
51
+ self.max_retries = max_retries
52
+ self.queue = queue
53
+
54
+ def spider_task(self, stop, func, item, del_seed):
55
+ while not stop.is_set():
56
+ seed = self.queue.pop()
57
+ if not seed:
58
+ time.sleep(Setting.SPIDER_WAIT_TIME)
59
+ continue
60
+ elif seed._retry >= self.max_retries:
61
+ del_seed(seed, spider_status=False)
62
+ continue
63
+ try:
64
+ self.spider_in_progress.push(1, direct_insertion=True)
65
+ # log.info("spider seed: " + str(seed))
66
+ ret_count = 0
67
+ status = None
68
+ for it in func(item, seed):
69
+ ret_count += 1
70
+ if getattr(it, "table_name", None):
71
+ store_queue = it.queue()
72
+ store_queue.push(
73
+ [seed, it.struct_data],
74
+ direct_insertion=True
75
+ )
76
+ elif isinstance(it, Seed):
77
+ self.queue.push(it)
78
+ elif any(isinstance(it, t) for t in (list, tuple)):
79
+ self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
80
+ elif isinstance(it, bool):
81
+ status = it
82
+
83
+ if status:
84
+ del_seed(seed, spider_status=True)
85
+ elif not ret_count or status is False:
86
+ seed._retry += 1
87
+ self.queue.push(seed)
88
+
89
+ except Exception as e:
90
+ seed._retry += 1
91
+ self.queue.push(seed)
92
+ log.info(f"{str(seed)} -> {str(e)}")
93
+ finally:
94
+ self.spider_in_progress.pop()
95
+ log.info(f"close thread: spider")
96
+
97
+
98
+ class Storer:
99
+
100
+ def store_task(self, stop, last, reset_seed, del_seed):
101
+
102
+ inf_name = "StorerInterface"
103
+ if not issubclass_cobweb_inf(self.__class__, inf_name):
104
+ return None
105
+
106
+ if not getattr(self, "store", None):
107
+ raise Exception("not have store function!")
108
+
109
+ storer_name = self.__class__.__name__ + self.table
110
+
111
+ while not stop.is_set():
112
+
113
+ if last.is_set() or self.queue.length >= self.length:
114
+ seeds, data_list = [], []
115
+
116
+ for _ in range(self.length):
117
+ items = self.queue.pop()
118
+ if not items:
119
+ break
120
+ seed, data = items
121
+ seeds.append(seed)
122
+ data_list.append(data)
123
+
124
+ if data_list:
125
+ if self.store(data_list):
126
+ del_seed(seeds)
127
+ else:
128
+ reset_seed(seeds)
129
+ log.info("reset seeds!")
130
+ continue
131
+
132
+ time.sleep(3)
133
+
134
+ log.info(f"close thread: {storer_name}")
@@ -0,0 +1,153 @@
1
+ import time
2
+ import threading
3
+
4
+ from equip.single import Seed, DBItem
5
+ from equip.single import struct_queue_name, restore_table_name
6
+ from equip.single import Distributor, Scheduler, Spider, Storer
7
+
8
+
9
+ def init_task_seed(seeds):
10
+ if not seeds:
11
+ return None
12
+ if isinstance(seeds, list) or isinstance(seeds, tuple):
13
+ for seed in seeds:
14
+ yield Seed(seed)
15
+ elif isinstance(seeds, str) or isinstance(seeds, dict):
16
+ yield Seed(seeds)
17
+
18
+
19
+ def parse_storer_info(storer_info):
20
+ storer_data = {}
21
+ storer_info_list = []
22
+ if storer_info.__class__.__name__ == 'StorerInfo':
23
+ storer_info_list.append(storer_info)
24
+ elif isinstance(storer_info, tuple) or isinstance(storer_info, list):
25
+ storer_info_list = storer_info
26
+ for info in storer_info_list:
27
+ db_name = info.DB.__name__
28
+ storer_data.setdefault(db_name, {"StorerDB": info.DB, "db_args_list": []})
29
+ storer_data[db_name]["db_args_list"].append(info[1:])
30
+ return storer_data
31
+
32
+
33
+ def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
34
+ while True:
35
+ time.sleep(3)
36
+ if (
37
+ scheduler.stop and
38
+ not distributor.seed_queue.length and
39
+ not spider.spider_in_progress.length
40
+ ):
41
+ last_event.set()
42
+ time.sleep(10)
43
+ storer_queue_empty = True
44
+ for storer in storer_list:
45
+ if storer.queue.length:
46
+ storer_queue_empty = False
47
+ break
48
+ if storer_queue_empty:
49
+ break
50
+ last_event.clear()
51
+ stop_event.set()
52
+
53
+
54
+ def cobweb(task):
55
+ """
56
+ 任务启动装饰器
57
+ :param task: 任务配置信息
58
+ """
59
+ def decorator(func):
60
+ """
61
+ func(Item, seed)
62
+ Item:
63
+ Item.Textfile()
64
+ Item.Console()
65
+ """
66
+ # project task_name start_seed spider_num queue_length scheduler_info storer_info
67
+
68
+ storer_list = []
69
+
70
+ # 程序结束事件
71
+ last_event = threading.Event()
72
+ # 暂停采集事件
73
+ stop_event = threading.Event()
74
+
75
+ # 创建分发器
76
+ distributor = Distributor()
77
+
78
+ # 调度器动态继承
79
+ SchedulerDB, table, sql, length, size = task.SchedulerInfo
80
+ SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})
81
+
82
+ # 初始化调度器
83
+ scheduler = SchedulerTmp(table=table, sql=sql, length=length, size=size, queue=distributor.seed_queue)
84
+
85
+ # 初始化采集器
86
+ spider = Spider(queue=distributor.seed_queue)
87
+
88
+ # 解析存储器信息
89
+ storer_data = parse_storer_info(task.storer_info)
90
+
91
+ # sds
92
+ item = type("item", (object,), {})
93
+ for db_name in storer_data.keys():
94
+ # 存储器动态继承
95
+ StorerDB = storer_data[db_name]["StorerDB"]
96
+ StorerTmp = type('Storer', (Storer, StorerDB), {})
97
+ db_args_list = storer_data[db_name]["db_args_list"]
98
+ for storer_db_args in db_args_list:
99
+ table, fields, length = storer_db_args
100
+ if not getattr(item, db_name, None):
101
+ instance = type(db_name, (DBItem,), {})
102
+ setattr(item, db_name, instance)
103
+ # 创建存储xxx
104
+ getattr(item, db_name).init_item(table, fields)
105
+ # 创建存储队列
106
+ storer_queue = struct_queue_name(db_name, table)
107
+ distributor.create_queue(queue_name=storer_queue)
108
+ queue = distributor.get_queue(queue_name=storer_queue)
109
+ # 初始话存储器
110
+ table_name = restore_table_name(table_name=table)
111
+ storer = StorerTmp(table=table_name, fields=fields, length=length, queue=queue)
112
+ storer_list.append(storer)
113
+
114
+ # 推送初始种子
115
+ distributor.distribute(init_task_seed, seeds=task.start_seed)
116
+
117
+ # 启动调度器
118
+ threading.Thread(
119
+ target=scheduler.schedule_task,
120
+ args=(distributor.distribute,),
121
+ name="single_scheduler_task"
122
+ ).start()
123
+
124
+ # 启动采集器
125
+ for index in range(task.spider_num):
126
+ threading.Thread(
127
+ target=spider.spider_task,
128
+ args=(stop_event, distributor.distribute, func, item),
129
+ name=f"single_spider_task:{index}"
130
+ ).start()
131
+
132
+ # 启动存储器
133
+ for storer in storer_list:
134
+ threading.Thread(
135
+ target=storer.store_task,
136
+ args=(stop_event, last_event, distributor.distribute),
137
+ name=f"single_store_task:{storer.table}",
138
+ ).start()
139
+
140
+ threading.Thread(
141
+ target=check, name="check",
142
+ args=(
143
+ stop_event, last_event, distributor,
144
+ scheduler, spider, storer_list
145
+ )
146
+ ).start()
147
+
148
+ # return starter(task, func)
149
+ return decorator
150
+
151
+
152
+
153
+
@@ -6,9 +6,11 @@ from .utils import parse_info, struct_start_seeds
6
6
  def init_task_env():
7
7
  Setting.RESET_SCORE = int(os.getenv("RESET_SCORE", 600))
8
8
  Setting.CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
9
- Setting.SPIDER_RUN_TIME = float(os.getenv("SPIDER_RUN_TIME", 0.2))
10
9
  Setting.DEAL_MODEL = os.getenv("DEAL_MODEL", DealModel.failure)
11
10
  Setting.LAUNCHER_MODEL = os.getenv("LAUNCHER_MODEL", LauncherModel.task)
11
+ Setting.SCHEDULER_WAIT_TIME = float(os.getenv("SCHEDULER_WAIT_TIME", 5))
12
+ Setting.SCHEDULER_BLOCK_TIME = float(os.getenv("SCHEDULER_BLOCK_TIME", 3))
13
+ Setting.SPIDER_WAIT_TIME = float(os.getenv("SPIDER_WAIT_TIME", 3))
12
14
 
13
15
 
14
16
  class Task:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.16
3
+ Version: 0.1.18
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -7,6 +7,7 @@ cobweb/constant.py
7
7
  cobweb/decorators.py
8
8
  cobweb/interface.py
9
9
  cobweb/log.py
10
+ cobweb/setting.py
10
11
  cobweb/task.py
11
12
  cobweb/utils.py
12
13
  cobweb/db/__init__.py
@@ -18,7 +19,11 @@ cobweb/db/scheduler/textfile.py
18
19
  cobweb/db/storer/__init__.py
19
20
  cobweb/db/storer/console.py
20
21
  cobweb/db/storer/loghub.py
22
+ cobweb/db/storer/redis.py
21
23
  cobweb/db/storer/textfile.py
24
+ cobweb/distributed/__init__.py
25
+ cobweb/distributed/launcher.py
26
+ cobweb/distributed/models.py
22
27
  cobweb/equip/__init__.py
23
28
  cobweb/equip/dev/__init__.py
24
29
  cobweb/equip/dev/launcher.py
@@ -29,6 +34,10 @@ cobweb/equip/distributed/models.py
29
34
  cobweb/equip/single/__init__.py
30
35
  cobweb/equip/single/launcher.py
31
36
  cobweb/equip/single/models.py
37
+ cobweb/single/__init__.py
38
+ cobweb/single/launcher.py
39
+ cobweb/single/models.py
40
+ cobweb/single/nest.py
32
41
  cobweb_launcher.egg-info/PKG-INFO
33
42
  cobweb_launcher.egg-info/SOURCES.txt
34
43
  cobweb_launcher.egg-info/dependency_links.txt
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="cobweb-launcher",
8
- version="0.1.16",
8
+ version="0.1.18",
9
9
  packages=find_packages(),
10
10
  url="https://github.com/Juannie-PP/cobweb",
11
11
  license="MIT",