cobweb-launcher 0.1.15__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

File without changes
@@ -0,0 +1,202 @@
1
+ import time
2
+ import threading
3
+
4
+ from .. import log, sqn, rtn, pim
5
+ from .. import Queue, DBItem, RedisDB, Setting, OssDB
6
+ from .models import Scheduler, Spider, Storer
7
+
8
+
9
+ def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
10
+ log.info("run check thread after 30 seconds...")
11
+ time.sleep(30)
12
+ spider_info = """
13
+ ------------------- check: {0} ------------------
14
+ redis_spider_seed_length: {1}
15
+ redis_ready_seed_length: {2}
16
+ running_spider_thread_num: {3}
17
+ memory_seed_queue_length: {4}
18
+ storer_queue_length_info: {5}
19
+ ----------------------- end -----------------------"""
20
+ while True:
21
+ status = "running"
22
+ running_spider_thread_num = spider.spider_in_progress.length
23
+ redis_ready_seed_length = ready_seed_length()
24
+ redis_spider_seed_length = spider_queue_length()
25
+ memory_seed_queue_length = scheduler.queue.length
26
+ storer_upload_queue_length = storer.queue.length if storer else None
27
+ if (
28
+ scheduler.stop and
29
+ not memory_seed_queue_length and
30
+ not running_spider_thread_num
31
+ ):
32
+ if not Setting.LAUNCHER_MODEL:
33
+ log.info("spider is done?")
34
+ last.set()
35
+ time.sleep(3)
36
+ storer_queue_empty = True
37
+ if storer and storer.queue.length:
38
+ storer_queue_empty = False
39
+ storer_upload_queue_length = storer.queue.length if storer else None
40
+ if (
41
+ storer_queue_empty and
42
+ not redis_ready_seed_length and
43
+ not redis_spider_seed_length
44
+ ):
45
+ if Setting.LAUNCHER_MODEL:
46
+ log.info("waiting for push seeds...")
47
+ status = "waiting"
48
+ time.sleep(30)
49
+ else:
50
+ log.info("spider done!")
51
+ break
52
+
53
+ last.clear()
54
+
55
+ log.info(spider_info.format(
56
+ status,
57
+ redis_spider_seed_length,
58
+ redis_ready_seed_length,
59
+ running_spider_thread_num,
60
+ memory_seed_queue_length,
61
+ storer_upload_queue_length
62
+ ))
63
+
64
+ time.sleep(3)
65
+ stop.set()
66
+
67
+
68
+ def launcher(task):
69
+ """
70
+ 任务启动装饰器
71
+ :param task: 任务配置信息
72
+ """
73
+ def decorator(func):
74
+ # 程序结束事件
75
+ last = threading.Event()
76
+ # 停止采集事件
77
+ stop = threading.Event()
78
+
79
+ # 初始化redis信息
80
+ redis_db = RedisDB(task.project, task.task_name, task.redis_info)
81
+
82
+ # new item
83
+ item = type("Item", (object,), {"redis_client": redis_db.client})()
84
+ if task.oss_config:
85
+ item.oss = OssDB(**task.oss_config)
86
+
87
+ log.info("初始化cobweb!")
88
+
89
+ seed_queue = Queue()
90
+
91
+ scheduler_info = task.scheduler_info or dict()
92
+ # 调度器动态继承
93
+ sql = scheduler_info.get("sql")
94
+ table = scheduler_info.get("table")
95
+ size = scheduler_info.get("size")
96
+ scheduler_config = scheduler_info.get("config")
97
+ scheduler_db = scheduler_info.get("db", "default")
98
+ DB, class_name = pim(scheduler_db, "scheduler")
99
+ # SchedulerDB, table, sql, length, size, config = task.scheduler_info
100
+ SchedulerTmp = type(class_name, (Scheduler, DB), {})
101
+ # 初始化调度器
102
+ scheduler = SchedulerTmp(
103
+ table=table, sql=sql, size=size, queue=seed_queue,
104
+ length=task.scheduler_queue_length, config=scheduler_config
105
+ )
106
+
107
+ storer = None
108
+ storer_info = task.storer_info or dict()
109
+
110
+ if storer_info:
111
+ storer_db = storer_info["db"]
112
+ fields = storer_info["fields"]
113
+ storer_table = storer_info.get("table", "console")
114
+ storer_config = storer_info.get("config")
115
+
116
+ StorerDB, class_name = pim(storer_db, "storer")
117
+ StorerTmp = type(class_name, (Storer, StorerDB), {})
118
+
119
+ db_name = class_name.lower()
120
+ if not getattr(item, db_name, None):
121
+ instance = type(db_name, (DBItem,), {})
122
+ setattr(item, db_name, instance)
123
+
124
+ storer_item_instance = getattr(item, db_name)
125
+ storer_item_instance.init_item(storer_table, fields)
126
+
127
+ storer_queue = sqn(db_name, storer_table)
128
+ queue = getattr(storer_item_instance, storer_queue)
129
+ # 初始话存储器
130
+ table_name = rtn(table_name=storer_table)
131
+ storer = StorerTmp(
132
+ table=table_name, fields=fields,
133
+ length=task.storer_queue_length,
134
+ queue=queue, config=storer_config
135
+ )
136
+
137
+ # 初始化采集器
138
+ spider = Spider(seed_queue, storer and True, task.max_retries)
139
+
140
+ threading.Thread(target=redis_db.check_spider_queue, args=(stop, 0)).start()
141
+ threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
142
+
143
+ # 推送初始种子
144
+ # seeds = start_seeds(task.start_seed)
145
+ redis_db.add_seed(task.seeds)
146
+ # 启动调度器, 调度至redis队列
147
+ threading.Thread(
148
+ # name="xxxx_schedule_seeds",
149
+ target=scheduler.schedule_seed,
150
+ args=(
151
+ redis_db.ready_seed_length,
152
+ redis_db.get_scheduler_lock,
153
+ redis_db.add_seed
154
+ )
155
+ ).start()
156
+
157
+ # 启动调度器, 调度任务队列
158
+ threading.Thread(
159
+ # name="xxxx_schedule_task",
160
+ target=scheduler.schedule_task,
161
+ args=(
162
+ stop, redis_db.get_seed,
163
+ redis_db.ready_seed_length
164
+ )
165
+ ).start()
166
+
167
+ # 启动采集器
168
+ for index in range(task.spider_num):
169
+ threading.Thread(
170
+ # name=f"xxxx_spider_task:{index}",
171
+ target=spider.spider_task,
172
+ args=(
173
+ stop, func, item,
174
+ redis_db.del_seed
175
+ )
176
+ ).start()
177
+
178
+ # 启动存储器
179
+ if storer:
180
+ threading.Thread(
181
+ # name=f"xxxx_store_task:{storer.table}",
182
+ target=storer.store_task,
183
+ args=(
184
+ stop, last,
185
+ redis_db.reset_seed,
186
+ redis_db.del_seed
187
+ )
188
+ ).start()
189
+
190
+ threading.Thread(
191
+ # name="check_spider",
192
+ target=check,
193
+ args=(
194
+ stop, last, spider,
195
+ scheduler, storer,
196
+ redis_db.ready_seed_length,
197
+ redis_db.spider_queue_length,
198
+ )
199
+ ).start()
200
+
201
+ return decorator
202
+
@@ -0,0 +1,156 @@
1
+ import time
2
+ from inspect import isgenerator
3
+ # from pympler import asizeof
4
+ from .. import log, ici
5
+ from .. import DealModel, Queue, Seed, Setting
6
+
7
+
8
+ class Scheduler:
9
+
10
+ def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
11
+
12
+ inf_name = "SchedulerInterface"
13
+ if not ici(self.__class__, inf_name):
14
+ raise Exception("not have schedule function!")
15
+
16
+ if self.__class__.__name__ == "Default":
17
+ self.stop = True
18
+ return None
19
+
20
+ while not self.stop:
21
+ length = ready_seed_length()
22
+ if length > self.size:
23
+ time.sleep(15)
24
+
25
+ elif get_scheduler_lock():
26
+ seeds = self.schedule()
27
+ add_seed(seeds)
28
+
29
+ log.info(f"close thread: schedule_seed")
30
+
31
+ def schedule_task(self, stop, get_seed, ready_seed_length):
32
+ time.sleep(3)
33
+ while not stop.is_set():
34
+
35
+ if not ready_seed_length():
36
+ time.sleep(5)
37
+ continue
38
+
39
+ if self.queue.length >= self.length:
40
+ time.sleep(3)
41
+ continue
42
+
43
+ seeds = get_seed(self.length)
44
+ self.queue.push(seeds)
45
+ log.info(f"close thread: schedule_task")
46
+
47
+
48
+ class Spider:
49
+
50
+ def __init__(self, queue, storage, max_retries=5):
51
+ self.spider_in_progress = Queue()
52
+ self.max_retries = max_retries
53
+ self.storage = storage
54
+ self.queue = queue
55
+
56
+ def spider_task(self, stop, func, item, del_seed):
57
+ while not stop.is_set():
58
+
59
+ seed = self.queue.pop()
60
+
61
+ if not seed:
62
+ time.sleep(3)
63
+ continue
64
+
65
+ elif seed._retry >= self.max_retries:
66
+ del_seed(seed, spider_status=False)
67
+ continue
68
+
69
+ try:
70
+ self.spider_in_progress.push(1, direct_insertion=True)
71
+ log.info("spider seed: " + str(seed))
72
+ iterators = func(item, seed)
73
+
74
+ if not isgenerator(iterators):
75
+ if not self.storage:
76
+ del_seed(seed, spider_status=True)
77
+ continue
78
+ raise TypeError(f"{func.__name__} isn't a generator")
79
+
80
+ status = None
81
+ for it in iterators:
82
+ status = True
83
+ if getattr(it, "table_name", None):
84
+ store_queue = it.queue()
85
+ store_queue.push(
86
+ [seed, it.struct_data],
87
+ direct_insertion=True
88
+ )
89
+ elif isinstance(it, Seed):
90
+ self.queue.push(it)
91
+
92
+ elif isinstance(it, str) and it == DealModel.polling:
93
+ self.queue.push(seed)
94
+ break
95
+ elif isinstance(it, str) and it == DealModel.success:
96
+ del_seed(seed, spider_status=True)
97
+ break
98
+ elif isinstance(it, str) and it == DealModel.failure:
99
+ del_seed(seed, spider_status=False)
100
+ break
101
+ else:
102
+ raise TypeError("yield value type error!")
103
+
104
+ if not status:
105
+ seed._retry += 1
106
+ self.queue.push(seed)
107
+
108
+ except Exception as e:
109
+ seed._retry += 1
110
+ self.queue.push(seed)
111
+ log.info(f"{str(seed)} -> {str(e)}")
112
+ finally:
113
+ self.spider_in_progress.pop()
114
+ time.sleep(Setting.SPIDER_RUN_TIME)
115
+ log.info(f"close thread: spider")
116
+
117
+
118
+ class Storer:
119
+
120
+ def store_task(self, stop, last, reset_seed, del_seed):
121
+
122
+ inf_name = "StorerInterface"
123
+ if not ici(self.__class__, inf_name):
124
+ return None
125
+
126
+ if not getattr(self, "store", None):
127
+ raise Exception("not have store function!")
128
+
129
+ storer_name = self.__class__.__name__ + self.table
130
+
131
+ while not stop.is_set():
132
+
133
+ storer_length = self.queue.length
134
+ if not storer_length:
135
+ time.sleep(5)
136
+ continue
137
+
138
+ if last.is_set() or storer_length >= self.length:
139
+ seeds, data_list = [], []
140
+
141
+ for _ in range(self.length):
142
+ items = self.queue.pop()
143
+ if not items:
144
+ break
145
+ seed, data = items
146
+ seeds.append(seed)
147
+ data_list.append(data)
148
+
149
+ if self.store(data_list):
150
+ del_seed(seeds)
151
+ else:
152
+ reset_seed(seeds)
153
+
154
+ time.sleep(3)
155
+
156
+ log.info(f"close thread: {storer_name}")
@@ -69,6 +69,9 @@ class Spider:
69
69
  try:
70
70
  self.spider_in_progress.push(1, direct_insertion=True)
71
71
  log.info("spider seed: " + str(seed))
72
+
73
+ store_queue = None
74
+ store_data = list()
72
75
  iterators = func(item, seed)
73
76
 
74
77
  if not isgenerator(iterators):
@@ -80,12 +83,16 @@ class Spider:
80
83
  status = None
81
84
  for it in iterators:
82
85
  status = True
86
+ # if getattr(it, "table_name", None):
87
+ # store_queue = it.queue()
88
+ # store_queue.push(
89
+ # [seed, it.struct_data],
90
+ # direct_insertion=True
91
+ # )
83
92
  if getattr(it, "table_name", None):
84
- store_queue = it.queue()
85
- store_queue.push(
86
- [seed, it.struct_data],
87
- direct_insertion=True
88
- )
93
+ if not store_queue:
94
+ store_queue = it.queue()
95
+ store_data.append(it.struct_data)
89
96
  elif isinstance(it, Seed):
90
97
  self.queue.push(it)
91
98
 
@@ -105,6 +112,10 @@ class Spider:
105
112
  seed._retry += 1
106
113
  self.queue.push(seed)
107
114
 
115
+ elif store_queue and store_data:
116
+ store_data.append(seed)
117
+ store_queue.push(store_data)
118
+
108
119
  except Exception as e:
109
120
  seed._retry += 1
110
121
  self.queue.push(seed)
@@ -134,23 +145,27 @@ class Storer:
134
145
  if not storer_length:
135
146
  time.sleep(5)
136
147
  continue
148
+ elif not last.is_set() and storer_length < self.length:
149
+ time.sleep(3)
150
+ continue
137
151
 
138
- if last.is_set() or storer_length >= self.length:
139
- seeds, data_list = [], []
152
+ seeds, data_list = [], []
140
153
 
141
- for _ in range(self.length):
142
- items = self.queue.pop()
143
- if not items:
154
+ while True:
155
+ data = self.queue.pop()
156
+ if not data:
157
+ break
158
+ if isinstance(data, Seed):
159
+ seeds.append(data)
160
+ if len(data_list) >= self.length:
144
161
  break
145
- seed, data = items
146
- seeds.append(seed)
147
- data_list.append(data)
162
+ continue
163
+ data_list.append(data)
148
164
 
149
- if self.store(data_list):
150
- del_seed(seeds)
151
- else:
152
- reset_seed(seeds)
165
+ if self.store(data_list):
166
+ del_seed(seeds)
167
+ else:
168
+ reset_seed(seeds)
153
169
 
154
- time.sleep(3)
155
170
 
156
171
  log.info(f"close thread: {storer_name}")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.15
3
+ Version: 0.1.16
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -22,18 +22,21 @@ cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
22
22
  cobweb/distributed/launcher.py,sha256=jTtBXBmna_6yFdj6gyGQiiEtg8I0g5uI5h8kbHWt454,7998
23
23
  cobweb/distributed/models.py,sha256=PUQokXMGD-H4A99nX7qYA395Ul6IsWGruMTVa05nswY,4568
24
24
  cobweb/equip/__init__.py,sha256=LWhbrTnG9kD1et0D40EzLISPuE0PIS-5WD3y3CLDaWk,247
25
+ cobweb/equip/dev/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
+ cobweb/equip/dev/launcher.py,sha256=KRsw7yxklvFM85cel-EyLsNPLyrC9Hd26BMSx6-4Hac,6785
27
+ cobweb/equip/dev/models.py,sha256=w3LQEhTrgqoYZn5v9TiEK2A68xuC7QH7suRP9OYnoOg,4813
25
28
  cobweb/equip/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
29
  cobweb/equip/distributed/launcher.py,sha256=1LzxibGXWR20XpXawakiRpEMaa9yfaj2rFSKnmEwjFc,7475
27
30
  cobweb/equip/distributed/models.py,sha256=qTGzxLdb2arsZSZK2HE4-MrqhraUhc2Ol5wBvlv_aWA,5008
28
31
  cobweb/equip/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
29
32
  cobweb/equip/single/launcher.py,sha256=KRsw7yxklvFM85cel-EyLsNPLyrC9Hd26BMSx6-4Hac,6785
30
- cobweb/equip/single/models.py,sha256=w3LQEhTrgqoYZn5v9TiEK2A68xuC7QH7suRP9OYnoOg,4813
33
+ cobweb/equip/single/models.py,sha256=pWALuEat5v9cNAvJ_PaAlDcIHUWuu38EQl0KFcpSJXA,5344
31
34
  cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
32
35
  cobweb/single/launcher.py,sha256=IoJbn87j7t7Pib_FxoWZmmX8asXOqNGb-9ospw6EYJI,7302
33
36
  cobweb/single/models.py,sha256=UXcxr_Quok91k82plaqbj4deB-UBCWo14WCo6SS5L_o,4247
34
37
  cobweb/single/nest.py,sha256=49K6KQ934INfPrWQsrq9rIFpQauLbLGOFbDaHvoQzOk,5015
35
- cobweb_launcher-0.1.15.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
36
- cobweb_launcher-0.1.15.dist-info/METADATA,sha256=xkxL2woC_kK4XxUfrCaf80SeA5SE9t_0vwMynXt90lc,1220
37
- cobweb_launcher-0.1.15.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
38
- cobweb_launcher-0.1.15.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
39
- cobweb_launcher-0.1.15.dist-info/RECORD,,
38
+ cobweb_launcher-0.1.16.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
39
+ cobweb_launcher-0.1.16.dist-info/METADATA,sha256=5LhIX1XNOr4pfnbotXN-lNWdwLUdBeb5Q3O157KORns,1220
40
+ cobweb_launcher-0.1.16.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
41
+ cobweb_launcher-0.1.16.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
42
+ cobweb_launcher-0.1.16.dist-info/RECORD,,