cobweb-launcher 1.0.1__py3-none-any.whl → 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of cobweb-launcher might be problematic.

Files changed (45)
  1. cobweb/launchers/launcher.py +1 -2
  2. cobweb/utils/oss.py +1 -2
  3. {cobweb_launcher-1.0.1.dist-info → cobweb_launcher-1.0.3.dist-info}/METADATA +1 -1
  4. cobweb_launcher-1.0.3.dist-info/RECORD +32 -0
  5. cobweb/bbb.py +0 -191
  6. cobweb/config.py +0 -164
  7. cobweb/db/oss_db.py +0 -128
  8. cobweb/db/scheduler/__init__.py +0 -1
  9. cobweb/db/scheduler/default.py +0 -8
  10. cobweb/db/scheduler/textfile.py +0 -27
  11. cobweb/db/storer/__init__.py +0 -1
  12. cobweb/db/storer/console.py +0 -9
  13. cobweb/db/storer/loghub.py +0 -54
  14. cobweb/db/storer/redis.py +0 -15
  15. cobweb/db/storer/textfile.py +0 -15
  16. cobweb/decorators.py +0 -16
  17. cobweb/distributed/__init__.py +0 -0
  18. cobweb/distributed/launcher.py +0 -243
  19. cobweb/distributed/models.py +0 -143
  20. cobweb/equip/__init__.py +0 -8
  21. cobweb/equip/dev/__init__.py +0 -0
  22. cobweb/equip/dev/launcher.py +0 -202
  23. cobweb/equip/dev/models.py +0 -156
  24. cobweb/equip/distributed/__init__.py +0 -0
  25. cobweb/equip/distributed/launcher.py +0 -219
  26. cobweb/equip/distributed/models.py +0 -158
  27. cobweb/equip/download/__init__.py +0 -0
  28. cobweb/equip/download/launcher.py +0 -203
  29. cobweb/equip/download/models.py +0 -169
  30. cobweb/equip/single/__init__.py +0 -0
  31. cobweb/equip/single/launcher.py +0 -203
  32. cobweb/equip/single/models.py +0 -173
  33. cobweb/interface.py +0 -34
  34. cobweb/log.py +0 -96
  35. cobweb/new.py +0 -20
  36. cobweb/single/__init__.py +0 -0
  37. cobweb/single/launcher.py +0 -231
  38. cobweb/single/models.py +0 -134
  39. cobweb/single/nest.py +0 -153
  40. cobweb/task.py +0 -61
  41. cobweb/utils.py +0 -90
  42. cobweb_launcher-1.0.1.dist-info/RECORD +0 -69
  43. {cobweb_launcher-1.0.1.dist-info → cobweb_launcher-1.0.3.dist-info}/LICENSE +0 -0
  44. {cobweb_launcher-1.0.1.dist-info → cobweb_launcher-1.0.3.dist-info}/WHEEL +0 -0
  45. {cobweb_launcher-1.0.1.dist-info → cobweb_launcher-1.0.3.dist-info}/top_level.txt +0 -0
cobweb/equip/dev/models.py
@@ -1,156 +0,0 @@
- import time
- from inspect import isgenerator
- # from pympler import asizeof
- from .. import log, ici
- from .. import DealModel, Queue, Seed, Setting
-
-
- class Scheduler:
-
-     def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
-
-         inf_name = "SchedulerInterface"
-         if not ici(self.__class__, inf_name):
-             raise Exception("not have schedule function!")
-
-         if self.__class__.__name__ == "Default":
-             self.stop = True
-             return None
-
-         while not self.stop:
-             length = ready_seed_length()
-             if length > self.size:
-                 time.sleep(15)
-
-             elif get_scheduler_lock():
-                 seeds = self.schedule()
-                 add_seed(seeds)
-
-         log.info(f"close thread: schedule_seed")
-
-     def schedule_task(self, stop, get_seed, ready_seed_length):
-         time.sleep(3)
-         while not stop.is_set():
-
-             if not ready_seed_length():
-                 time.sleep(5)
-                 continue
-
-             if self.queue.length >= self.length:
-                 time.sleep(3)
-                 continue
-
-             seeds = get_seed(self.length)
-             self.queue.push(seeds)
-         log.info(f"close thread: schedule_task")
-
-
- class Spider:
-
-     def __init__(self, queue, storage, max_retries=5):
-         self.spider_in_progress = Queue()
-         self.max_retries = max_retries
-         self.storage = storage
-         self.queue = queue
-
-     def spider_task(self, stop, func, item, del_seed):
-         while not stop.is_set():
-
-             seed = self.queue.pop()
-
-             if not seed:
-                 time.sleep(3)
-                 continue
-
-             elif seed._retry >= self.max_retries:
-                 del_seed(seed, spider_status=False)
-                 continue
-
-             try:
-                 self.spider_in_progress.push(1, direct_insertion=True)
-                 log.info("spider seed: " + str(seed))
-                 iterators = func(item, seed)
-
-                 if not isgenerator(iterators):
-                     if not self.storage:
-                         del_seed(seed, spider_status=True)
-                         continue
-                     raise TypeError(f"{func.__name__} isn't a generator")
-
-                 status = None
-                 for it in iterators:
-                     status = True
-                     if getattr(it, "table_name", None):
-                         store_queue = it.queue()
-                         store_queue.push(
-                             [seed, it.struct_data],
-                             direct_insertion=True
-                         )
-                     elif isinstance(it, Seed):
-                         self.queue.push(it)
-
-                     elif isinstance(it, str) and it == DealModel.polling:
-                         self.queue.push(seed)
-                         break
-                     elif isinstance(it, str) and it == DealModel.success:
-                         del_seed(seed, spider_status=True)
-                         break
-                     elif isinstance(it, str) and it == DealModel.failure:
-                         del_seed(seed, spider_status=False)
-                         break
-                     else:
-                         raise TypeError("yield value type error!")
-
-                 if not status:
-                     seed._retry += 1
-                     self.queue.push(seed)
-
-             except Exception as e:
-                 seed._retry += 1
-                 self.queue.push(seed)
-                 log.info(f"{str(seed)} -> {str(e)}")
-             finally:
-                 self.spider_in_progress.pop()
-                 time.sleep(Setting.SPIDER_RUN_TIME)
-         log.info(f"close thread: spider")
-
-
- class Storer:
-
-     def store_task(self, stop, last, reset_seed, del_seed):
-
-         inf_name = "StorerInterface"
-         if not ici(self.__class__, inf_name):
-             return None
-
-         if not getattr(self, "store", None):
-             raise Exception("not have store function!")
-
-         storer_name = self.__class__.__name__ + self.table
-
-         while not stop.is_set():
-
-             storer_length = self.queue.length
-             if not storer_length:
-                 time.sleep(5)
-                 continue
-
-             if last.is_set() or storer_length >= self.length:
-                 seeds, data_list = [], []
-
-                 for _ in range(self.length):
-                     items = self.queue.pop()
-                     if not items:
-                         break
-                     seed, data = items
-                     seeds.append(seed)
-                     data_list.append(data)
-
-                 if self.store(data_list):
-                     del_seed(seeds)
-                 else:
-                     reset_seed(seeds)
-
-             time.sleep(3)
-
-         log.info(f"close thread: {storer_name}")
cobweb/equip/distributed/launcher.py
@@ -1,219 +0,0 @@
- import time
- import threading
-
- from .. import log, sqn, rtn, pim
- from .. import Queue, DBItem, RedisDB, Setting
- from .models import Scheduler, Spider, Storer
-
-
- def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
-     log.info("run check thread after 30 seconds...")
-     time.sleep(30)
-     spider_info = """
- ------------------- check: {0} ------------------
- running_spider_thread_num: {1}
- redis_ready_seed_length: {2}
- redis_spider_seed_length: {3}
- memory_seed_queue_length: {4}
- storer_upload_queue_length_info:
-  {5}
- ----------------------- end -----------------------"""
-     while True:
-         status = "running"
-         running_spider_thread_num = spider.spider_in_progress.length
-         redis_ready_seed_length = ready_seed_length()
-         redis_spider_seed_length = spider_queue_length()
-         memory_seed_queue_length = scheduler.queue.length
-         storer_upload_queue_list = []
-         for storer in storer_list:
-             storer_upload_queue_list.append(
-                 f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"
-             )
-         if (
-             scheduler.stop and
-             not memory_seed_queue_length and
-             not running_spider_thread_num
-         ):
-             if not Setting.LAUNCHER_MODEL:
-                 log.info("spider is done?")
-             last.set()
-             time.sleep(3)
-             storer_queue_empty = True
-             storer_upload_queue_list = []
-             for storer in storer_list:
-                 if storer.queue.length:
-                     storer_queue_empty = False
-                 storer_upload_queue_list.append(
-                     f"{storer.__class__.__name__} storer queue length: {storer.queue.length}"
-                 )
-             if (
-                 storer_queue_empty and
-                 not redis_ready_seed_length and
-                 not redis_spider_seed_length
-             ):
-                 if Setting.LAUNCHER_MODEL:
-                     log.info("waiting for push seeds...")
-                     status = "waiting"
-                     time.sleep(30)
-                 else:
-                     log.info("spider done!")
-                     break
-
-             last.clear()
-
-         storer_upload_queue_length_info = "\n ".join(
-             storer_upload_queue_list) if storer_upload_queue_list else "None"
-         log.info(spider_info.format(
-             status,
-             running_spider_thread_num,
-             redis_ready_seed_length,
-             redis_spider_seed_length,
-             memory_seed_queue_length,
-             storer_upload_queue_length_info
-         ))
-
-         time.sleep(3)
-     stop.set()
-
-
- def launcher(task):
-     """
-     Task launch decorator.
-     :param task: task configuration info
-     """
-     def decorator(func):
-         storer_list = []
-
-         # program-finished event
-         last = threading.Event()
-         # stop-crawling event
-         stop = threading.Event()
-
-         # initialize redis connection info
-         redis_db = RedisDB(task.project, task.task_name, task.redis_info)
-
-         log.info("initialize cobweb!")
-
-         seed_queue = Queue()
-
-         if task.scheduler_info is None:
-             task.scheduler_info = dict()
-
-         # build the scheduler class by dynamic inheritance
-         sql = task.scheduler_info.get("sql")
-         table = task.scheduler_info.get("table")
-         size = task.scheduler_info.get("size")
-         scheduler_config = task.scheduler_info.get("config")
-         scheduler_db = task.scheduler_info.get("db", "default")
-         DB, class_name = pim(scheduler_db, "scheduler")
-         # SchedulerDB, table, sql, length, size, config = task.scheduler_info
-         SchedulerTmp = type(class_name, (Scheduler, DB), {})
-
-         # initialize the scheduler
-         scheduler = SchedulerTmp(
-             table=table, sql=sql, size=size, queue=seed_queue,
-             length=task.scheduler_queue_length, config=scheduler_config
-         )
-
-         # parse storer info
-         storer_info_list = task.storer_info or []
-         if not isinstance(storer_info_list, list):
-             storer_info_list = [storer_info_list]
-
-         # new item
-         item = type("Item", (object,), {"redis_client": redis_db.client})()
-
-         for storer_info in storer_info_list:
-             storer_db = storer_info["db"]
-             fields = storer_info["fields"]
-             storer_table = storer_info.get("table", "console")
-             storer_config = storer_info.get("config")
-
-             StorerDB, class_name = pim(storer_db, "storer")
-             StorerTmp = type(class_name, (Storer, StorerDB), {})
-
-             db_name = class_name.lower()
-             if not getattr(item, db_name, None):
-                 instance = type(db_name, (DBItem,), {})
-                 setattr(item, db_name, instance)
-
-             storer_item_instance = getattr(item, db_name)
-             storer_item_instance.init_item(storer_table, fields)
-
-             storer_queue = sqn(db_name, storer_table)
-             queue = getattr(storer_item_instance, storer_queue)
-             # initialize the storer
-             table_name = rtn(table_name=storer_table)
-             storer = StorerTmp(
-                 table=table_name, fields=fields,
-                 length=task.storer_queue_length,
-                 queue=queue, config=storer_config
-             )
-             storer_list.append(storer)
-
-         # initialize the spider
-         spider = Spider(seed_queue, storer_list and True, task.max_retries)
-
-         threading.Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
-         threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
-
-         # push the initial seeds
-         # seeds = start_seeds(task.start_seed)
-         redis_db.add_seed(task.seeds)
-         # start the scheduler: push seeds into the redis queue
-         threading.Thread(
-             # name="xxxx_schedule_seeds",
-             target=scheduler.schedule_seed,
-             args=(
-                 redis_db.ready_seed_length,
-                 redis_db.get_scheduler_lock,
-                 redis_db.add_seed
-             )
-         ).start()
-
-         # start the scheduler: feed the in-memory task queue
-         threading.Thread(
-             # name="xxxx_schedule_task",
-             target=scheduler.schedule_task,
-             args=(
-                 stop, redis_db.get_seed,
-                 redis_db.ready_seed_length
-             )
-         ).start()
-
-         # start the spider threads
-         for index in range(task.spider_num):
-             threading.Thread(
-                 # name=f"xxxx_spider_task:{index}",
-                 target=spider.spider_task,
-                 args=(
-                     stop, func, item,
-                     redis_db.del_seed
-                 )
-             ).start()
-
-         # start the storer threads
-         for storer in storer_list:
-             threading.Thread(
-                 # name=f"xxxx_store_task:{storer.table}",
-                 target=storer.store_task,
-                 args=(
-                     stop, last,
-                     redis_db.reset_seed,
-                     redis_db.set_storer
-                 )
-             ).start()
-
-         threading.Thread(
-             # name="check_spider",
-             target=check,
-             args=(
-                 stop, last, spider,
-                 scheduler, storer_list,
-                 redis_db.ready_seed_length,
-                 redis_db.spider_queue_length,
-             )
-         ).start()
-
-     return decorator
-
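The removed cobweb/equip/distributed/launcher.py above is the wiring point of the old distributed mode: launcher(task) returns a decorator that spins up scheduler, spider, storer, and check threads around the decorated crawl function. A hedged usage sketch follows; every Task attribute shown is one the removed code actually reads, but the Task class, its keyword constructor, and the import path are assumptions for illustration only.

# Hypothetical usage sketch for the removed distributed launcher.
from cobweb import Task, launcher  # assumed imports

task = Task(
    project="demo_project",
    task_name="demo_task",
    redis_info={"host": "127.0.0.1", "port": 6379, "db": 0},  # passed to RedisDB
    scheduler_info={"db": "default", "table": "seeds", "size": 100},
    storer_info=[{"db": "console", "table": "console", "fields": ["url", "title"]}],
    seeds=["https://example.com"],   # pushed once via redis_db.add_seed
    spider_num=4,                    # number of spider_task threads started
    max_retries=5,
    scheduler_queue_length=100,
    storer_queue_length=25,
)

@launcher(task)
def crawl(item, seed):
    ...  # generator following the Spider.spider_task yield protocol shown earlier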
cobweb/equip/distributed/models.py
@@ -1,158 +0,0 @@
- import time
- from hashlib import md5
- from inspect import isgenerator
-
- from .. import log, ici
- from .. import DealModel, Queue, Seed, Setting
- # from pympler import asizeof
-
-
- class Scheduler:
-
-     def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
-
-         inf_name = "SchedulerInterface"
-         if not ici(self.__class__, inf_name):
-             raise Exception("not have schedule function!")
-
-         if self.__class__.__name__ == "Default":
-             self.stop = True
-             return None
-
-         while not self.stop:
-             length = ready_seed_length()
-             if length > self.size:
-                 time.sleep(15)
-
-             elif get_scheduler_lock():
-                 seeds = self.schedule()
-                 add_seed(seeds)
-
-         log.info(f"close thread: schedule_seed")
-
-     def schedule_task(self, stop, get_seed, ready_seed_length):
-         time.sleep(3)
-         while not stop.is_set():
-
-             if not ready_seed_length():
-                 time.sleep(5)
-                 continue
-
-             if self.queue.length >= self.length:
-                 time.sleep(3)
-                 continue
-
-             seeds = get_seed(self.length)
-             self.queue.push(seeds)
-         log.info(f"close thread: schedule_task")
-
-
- class Spider:
-
-     def __init__(self, queue, storage, max_retries=5):
-         self.spider_in_progress = Queue()
-         self.max_retries = max_retries
-         self.storage = storage
-         self.queue = queue
-
-     def spider_task(self, stop, func, item, del_seed):
-         while not stop.is_set():
-             seed = self.queue.pop()
-             if not seed:
-                 time.sleep(3)
-                 continue
-             elif seed._retry >= self.max_retries:
-                 del_seed(seed, spider_status=False)
-                 continue
-             try:
-                 self.spider_in_progress.push(1, direct_insertion=True)
-                 # log.info("spider seed: " + str(seed))
-
-                 store_queue = None
-                 store_data = list()
-
-                 iterators = func(item, seed)
-
-                 if not isgenerator(iterators):
-                     if not self.storage:
-                         del_seed(seed, spider_status=True)
-                         continue
-                     raise TypeError(f"{func.__name__} isn't a generator")
-
-                 for it in iterators:
-                     if getattr(it, "table_name", None):
-                         if not store_queue:
-                             store_queue = it.queue()
-                         store_data.append(it.struct_data)
-                     elif isinstance(it, Seed):
-                         self.queue.push(it)
-
-                     elif isinstance(it, str) and it == DealModel.polling:
-                         self.queue.push(seed)
-                         break
-                     elif isinstance(it, str) and it == DealModel.success:
-                         del_seed(seed, spider_status=True)
-                         break
-                     elif isinstance(it, str) and it == DealModel.failure:
-                         del_seed(seed, spider_status=False)
-                         break
-                     else:
-                         raise TypeError("yield value type error!")
-
-                 if store_queue and store_data:
-                     store_data.append(seed)
-                     store_queue.push(store_data)
-
-             except Exception as e:
-                 seed._retry += 1
-                 self.queue.push(seed)
-                 log.info(f"{str(seed)} -> {str(e)}")
-             finally:
-                 self.spider_in_progress.pop()
-                 time.sleep(Setting.SPIDER_RUN_TIME)
-         log.info(f"close thread: spider")
-
-
- class Storer:
-
-     def store_task(self, stop, last, reset_seed, set_storer):
-
-         inf_name = "StorerInterface"
-         if not ici(self.__class__, inf_name):
-             return None
-
-         if not getattr(self, "store", None):
-             raise Exception("not have store function!")
-
-         storer_name = self.__class__.__name__ + self.table
-         store_key_id = md5(storer_name.encode()).hexdigest()
-
-         while not stop.is_set():
-
-             storer_length = self.queue.length
-             if not storer_length:
-                 time.sleep(5)
-                 continue
-
-             if last.is_set() or storer_length >= self.length:
-                 seeds, data_list = [], []
-
-                 while True:
-                     data = self.queue.pop()
-                     if not data:
-                         break
-                     if isinstance(data, Seed):
-                         seeds.append(data)
-                         if len(data_list) >= self.length:
-                             break
-                         continue
-                     data_list.append(data)
-
-                 if self.store(data_list):
-                     set_storer(store_key_id, seeds)
-                 else:
-                     reset_seed(seeds)
-
-             time.sleep(3)
-
-         log.info(f"close thread: {storer_name}")