cobweb-launcher 0.1.2__tar.gz → 0.1.4__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (38)
  1. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/PKG-INFO +1 -1
  2. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/__init__.py +1 -0
  3. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/redis_db.py +5 -3
  4. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/launcher.py +14 -51
  5. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/models.py +7 -3
  6. cobweb-launcher-0.1.4/cobweb/single/launcher.py +231 -0
  7. cobweb-launcher-0.1.4/cobweb/single/models.py +136 -0
  8. cobweb-launcher-0.1.4/cobweb/utils.py +90 -0
  9. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/PKG-INFO +1 -1
  10. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/SOURCES.txt +1 -1
  11. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/setup.py +1 -1
  12. cobweb-launcher-0.1.2/cobweb/single/models.py +0 -104
  13. cobweb-launcher-0.1.2/cobweb/single/nest.py +0 -153
  14. cobweb-launcher-0.1.2/cobweb/utils.py +0 -88
  15. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/LICENSE +0 -0
  16. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/README.md +0 -0
  17. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/bbb.py +0 -0
  18. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/__init__.py +0 -0
  19. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/oss_db.py +0 -0
  20. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/__init__.py +0 -0
  21. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/default.py +0 -0
  22. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/textfile.py +0 -0
  23. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/__init__.py +0 -0
  24. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/console.py +0 -0
  25. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/loghub.py +0 -0
  26. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/redis.py +0 -0
  27. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/textfile.py +0 -0
  28. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/decorators.py +0 -0
  29. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/__init__.py +0 -0
  30. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/interface.py +0 -0
  31. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/log.py +0 -0
  32. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/setting.py +0 -0
  33. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/single/__init__.py +0 -0
  34. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/task.py +0 -0
  35. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  36. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/requires.txt +0 -0
  37. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/top_level.txt +0 -0
  38. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -5,6 +5,7 @@ from .interface import SchedulerInterface, StorerInterface
5
5
  from .db.redis_db import RedisDB
6
6
  from .db.oss_db import OssDB
7
7
  from .distributed.launcher import launcher
8
+ from .single.launcher import launcher as single_launcher
8
9
  from . import setting
9
10
 
10
11
 
@@ -151,17 +151,19 @@ class RedisDB:
151
151
  @check_redis_status
152
152
  def check_spider_queue(self, stop, storer_num):
153
153
  while not stop.is_set():
154
- # 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为15s
154
+ # 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
155
155
  if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
156
156
  heartbeat = True if self.client.exists(self.heartbeat_key) else False
157
- # 重启重制score值,否则获取n分钟前的分数值
157
+ # 重启重制score值,否则获取${rs_time}分钟前的分数值
158
158
  score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
159
159
 
160
160
  keys = self.client.keys(self.storer_key % "*")
161
+
161
162
  if keys and len(keys) >= storer_num:
162
163
  intersection_key = self.storer_key % "intersection"
163
164
  self.client.delete(intersection_key)
164
165
  self.client.zinterstore(intersection_key, keys)
166
+
165
167
  while True:
166
168
  members = self.client.zrange(intersection_key, 0, 1999)
167
169
  if not members:
@@ -192,7 +194,7 @@ class RedisDB:
192
194
  self.client.setex(self.heartbeat_key, 15, "")
193
195
 
194
196
  # self.client.delete(self.check_lock)
195
- time.sleep(3)
197
+ # time.sleep(3)
196
198
 
197
199
  @check_redis_status
198
200
  def set_heartbeat(self, stop):
@@ -1,50 +1,15 @@
1
1
  import time
2
2
  import threading
3
3
  from threading import Thread
4
- from importlib import import_module
5
4
 
6
- from cobweb import log, Queue, DBItem, RedisDB, OssDB, StorerInterface
7
- from cobweb.utils import struct_queue_name, restore_table_name
8
- from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
9
5
  from .models import Scheduler, Spider, Storer
10
-
11
-
12
- def get_scheduler_db(db):
13
- if isinstance(db, str):
14
- if "." in db:
15
- model_path = db.split(".")
16
- model = import_module(db)
17
- obj = getattr(model, db)
18
- else:
19
- model = import_module(f"cobweb.db.scheduler.{db.lower()}")
20
- obj = getattr(model, db.capitalize())
21
- return obj
22
- # if db.lower() in dir(StorerDB):
23
- # return getattr(StorerDB, db)
24
- # else:
25
- # pass
26
- elif issubclass(db, StorerInterface):
27
- return db
28
- raise TypeError()
29
-
30
-
31
- def get_storer_db(db):
32
- if isinstance(db, str):
33
- if "." in db:
34
- model_path = db.split(".")
35
- model = import_module(db)
36
- obj = getattr(model, db)
37
- else:
38
- model = import_module(f"cobweb.db.storer.{db.lower()}")
39
- obj = getattr(model, db.capitalize())
40
- return obj, db.lower()
41
- # if db.lower() in dir(StorerDB):
42
- # return getattr(StorerDB, db)
43
- # else:
44
- # pass
45
- elif issubclass(db, StorerInterface):
46
- return db, db.__name__.lower()
47
- raise TypeError()
6
+ from cobweb import log, Queue, DBItem, RedisDB
7
+ from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
8
+ from cobweb.utils import (
9
+ struct_queue_name as sqn,
10
+ restore_table_name as rtn,
11
+ parse_import_model as pim,
12
+ )
48
13
 
49
14
 
50
15
  def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
@@ -164,9 +129,9 @@ def launcher(task):
164
129
  size = task.scheduler_info.get("size")
165
130
  scheduler_config = task.scheduler_info.get("config")
166
131
  scheduler_db = task.scheduler_info.get("db", "default")
167
- DB = get_scheduler_db(scheduler_db)
132
+ DB, class_name = pim(scheduler_db, "scheduler")
168
133
  # SchedulerDB, table, sql, length, size, config = task.scheduler_info
169
- SchedulerTmp = type(DB.__name__, (Scheduler, DB), {})
134
+ SchedulerTmp = type(class_name, (Scheduler, DB), {})
170
135
 
171
136
  # 初始化调度器
172
137
  scheduler = SchedulerTmp(
@@ -185,18 +150,16 @@ def launcher(task):
185
150
  # new item
186
151
  item = type("Item", (object,), {"redis_client": redis_db.client})()
187
152
 
188
- if task.oss_config:
189
- item.oss = OssDB(**task.oss_config)
190
-
191
153
  for storer_info in storer_info_list:
192
154
  storer_db = storer_info["db"]
193
155
  fields = storer_info["fields"]
194
156
  storer_table = storer_info.get("table", "console")
195
157
  storer_config = storer_info.get("config")
196
158
 
197
- StorerDB, db_name = get_storer_db(storer_db)
198
- StorerTmp = type(StorerDB.__name__, (Storer, StorerDB), {})
159
+ StorerDB, class_name = pim(storer_db, "storer")
160
+ StorerTmp = type(class_name, (Storer, StorerDB), {})
199
161
 
162
+ db_name = class_name.lower()
200
163
  if not getattr(item, db_name, None):
201
164
  instance = type(db_name, (DBItem,), {})
202
165
  setattr(item, db_name, instance)
@@ -204,10 +167,10 @@ def launcher(task):
204
167
  storer_item_instance = getattr(item, db_name)
205
168
  storer_item_instance.init_item(storer_table, fields)
206
169
 
207
- storer_queue = struct_queue_name(db_name, storer_table)
170
+ storer_queue = sqn(db_name, storer_table)
208
171
  queue = getattr(storer_item_instance, storer_queue)
209
172
  # 初始话存储器
210
- table_name = restore_table_name(table_name=storer_table)
173
+ table_name = rtn(table_name=storer_table)
211
174
  storer = StorerTmp(
212
175
  table=table_name, fields=fields,
213
176
  length=task.storer_queue_length,
@@ -1,6 +1,8 @@
1
1
  import time
2
2
  from hashlib import md5
3
- from cobweb import log, Queue, Seed, StorerInterface, SchedulerInterface
3
+ from cobweb import log, Queue, Seed
4
+ from utils import issubclass_cobweb_interface
5
+
4
6
  # from pympler import asizeof
5
7
 
6
8
 
@@ -8,7 +10,8 @@ class Scheduler:
8
10
 
9
11
  def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
10
12
 
11
- if not issubclass(self.__class__, SchedulerInterface):
13
+ inf_name = "SchedulerInterface"
14
+ if not issubclass_cobweb_interface(self.__class__, inf_name):
12
15
  raise Exception("not have schedule function!")
13
16
 
14
17
  if self.__class__.__name__ == "Default":
@@ -103,7 +106,8 @@ class Storer:
103
106
 
104
107
  def store_task(self, stop, last, reset_seed, set_storer):
105
108
 
106
- if not issubclass(self.__class__, StorerInterface):
109
+ inf_name = "StorerInterface"
110
+ if not issubclass_cobweb_interface(self.__class__, inf_name):
107
111
  return None
108
112
 
109
113
  if not getattr(self, "store", None):
@@ -0,0 +1,231 @@
1
+ import time
2
+ import threading
3
+ from threading import Thread
4
+
5
+ from .models import Scheduler, Spider, Storer
6
+ from cobweb import log, Queue, DBItem, RedisDB
7
+ from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
8
+ from cobweb.utils import (
9
+ struct_queue_name as sqn,
10
+ restore_table_name as rtn,
11
+ parse_import_model as pim,
12
+ )
13
+
14
+
15
+ def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
16
+ log.info("run check thread after 30 seconds...")
17
+ time.sleep(30)
18
+ spider_info = """
19
+ ------------------- check: {0} ------------------
20
+ redis_spider_seed_length: {1}
21
+ redis_ready_seed_length: {2}
22
+ running_spider_thread_num: {3}
23
+ memory_seed_queue_length: {4}
24
+ storer_queue_length_info: {5}
25
+ ----------------------- end -----------------------"""
26
+ while True:
27
+ status = "running"
28
+ running_spider_thread_num = spider.spider_in_progress.length
29
+ redis_ready_seed_length = ready_seed_length()
30
+ redis_spider_seed_length = spider_queue_length()
31
+ memory_seed_queue_length = scheduler.queue.length
32
+ storer_upload_queue_length = storer.queue.length
33
+ if (
34
+ scheduler.stop and
35
+ # not redis_ready_seed_length and
36
+ not memory_seed_queue_length and
37
+ not running_spider_thread_num
38
+ ):
39
+ if not MODEL:
40
+ log.info("spider is done?")
41
+ last.set()
42
+ time.sleep(3)
43
+ storer_queue_empty = True
44
+ if storer.queue.length:
45
+ storer_queue_empty = False
46
+ storer_upload_queue_length = storer.queue.length
47
+ if (
48
+ storer_queue_empty and
49
+ not redis_ready_seed_length and
50
+ not redis_spider_seed_length
51
+ ):
52
+ if MODEL:
53
+ log.info("waiting for push seeds...")
54
+ status = "waiting"
55
+ time.sleep(30)
56
+ else:
57
+ log.info("spider done!")
58
+ break
59
+
60
+ last.clear()
61
+
62
+ log.info(spider_info.format(
63
+ status,
64
+ redis_spider_seed_length,
65
+ redis_ready_seed_length,
66
+ running_spider_thread_num,
67
+ memory_seed_queue_length,
68
+ storer_upload_queue_length
69
+ ))
70
+
71
+ time.sleep(3)
72
+ stop.set()
73
+
74
+
75
+ def launcher(task):
76
+ """
77
+ 任务启动装饰器
78
+ :param task: 任务配置信息
79
+ """
80
+ def decorator(func):
81
+ """
82
+ Item:
83
+ Textfile()
84
+ Loghub()
85
+ Console()
86
+ e.g.
87
+ task.fields = "a,b"
88
+ func(item, seed)
89
+ a = "a"
90
+ b = "b"
91
+ data = {"a": "a", "b": "b"}
92
+ yield item.Loghub(**data)
93
+ yield item.Loghub(a=a, b=b)
94
+ """
95
+ storer_list = []
96
+
97
+ # 程序结束事件
98
+ last = threading.Event()
99
+ # 停止采集事件
100
+ stop = threading.Event()
101
+
102
+ # 初始化redis信息
103
+ redis_db = RedisDB(
104
+ task.project, task.task_name, task.redis_info,
105
+ model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
106
+ )
107
+
108
+ # new item
109
+ item = type("Item", (object,), {"redis_client": redis_db.client})()
110
+
111
+ log.info("初始化cobweb!")
112
+
113
+ seed_queue = Queue()
114
+
115
+ scheduler_info = task.scheduler_info or dict()
116
+
117
+ # 调度器动态继承
118
+ sql = scheduler_info.get("sql")
119
+ table = scheduler_info.get("table")
120
+ size = scheduler_info.get("size")
121
+ scheduler_config = scheduler_info.get("config")
122
+ scheduler_db = scheduler_info.get("db", "default")
123
+ DB, class_name = pim(scheduler_db, "scheduler")
124
+ # SchedulerDB, table, sql, length, size, config = task.scheduler_info
125
+ SchedulerTmp = type(class_name, (Scheduler, DB), {})
126
+
127
+ # 初始化调度器
128
+ scheduler = SchedulerTmp(
129
+ table=table, sql=sql, size=size, queue=seed_queue,
130
+ length=task.scheduler_queue_length, config=scheduler_config
131
+ )
132
+
133
+ # 初始化采集器
134
+ spider = Spider(seed_queue, task.max_retries)
135
+
136
+ storer = None
137
+
138
+ # 解析存储器信息
139
+ storer_info = task.storer_info or dict()
140
+
141
+ # for storer_info in storer_info_list:
142
+ if storer_info:
143
+ storer_db = storer_info["db"]
144
+ fields = storer_info["fields"]
145
+ storer_table = storer_info.get("table", "console")
146
+ storer_config = storer_info.get("config")
147
+
148
+ StorerDB, class_name = pim(storer_db, "storer")
149
+ StorerTmp = type(class_name, (Storer, StorerDB), {})
150
+
151
+ db_name = class_name.lower()
152
+ if not getattr(item, db_name, None):
153
+ instance = type(db_name, (DBItem,), {})
154
+ setattr(item, db_name, instance)
155
+
156
+ storer_item_instance = getattr(item, db_name)
157
+ storer_item_instance.init_item(storer_table, fields)
158
+
159
+ storer_queue = sqn(db_name, storer_table)
160
+ queue = getattr(storer_item_instance, storer_queue)
161
+ # 初始话存储器
162
+ table_name = rtn(table_name=storer_table)
163
+ storer = StorerTmp(
164
+ table=table_name, fields=fields,
165
+ length=task.storer_queue_length,
166
+ queue=queue, config=storer_config
167
+ )
168
+
169
+ Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
170
+ Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
171
+
172
+ # 推送初始种子
173
+ # seeds = start_seeds(task.start_seed)
174
+ redis_db.add_seed(task.seeds)
175
+ # 启动调度器, 调度至redis队列
176
+ Thread(
177
+ # name="xxxx_schedule_seeds",
178
+ target=scheduler.schedule_seed,
179
+ args=(
180
+ redis_db.ready_seed_length,
181
+ redis_db.get_scheduler_lock,
182
+ redis_db.add_seed
183
+ )
184
+ ).start()
185
+
186
+ # 启动调度器, 调度任务队列
187
+ Thread(
188
+ # name="xxxx_schedule_task",
189
+ target=scheduler.schedule_task,
190
+ args=(
191
+ stop, redis_db.get_seed,
192
+ redis_db.ready_seed_length
193
+ )
194
+ ).start()
195
+
196
+ # 启动采集器
197
+ for index in range(task.spider_num):
198
+ Thread(
199
+ # name=f"xxxx_spider_task:{index}",
200
+ target=spider.spider_task,
201
+ args=(
202
+ stop, func, item,
203
+ redis_db.del_seed
204
+ )
205
+ ).start()
206
+
207
+ # 启动存储器
208
+ if storer:
209
+ Thread(
210
+ # name=f"xxxx_store_task:{storer.table}",
211
+ target=storer.store_task,
212
+ args=(
213
+ stop, last,
214
+ redis_db.reset_seed,
215
+ redis_db.set_storer
216
+ )
217
+ ).start()
218
+
219
+ Thread(
220
+ # name="check_spider",
221
+ target=check,
222
+ args=(
223
+ stop, last, spider,
224
+ scheduler, storer,
225
+ redis_db.ready_seed_length,
226
+ redis_db.spider_queue_length,
227
+ )
228
+ ).start()
229
+
230
+ return decorator
231
+
@@ -0,0 +1,136 @@
1
+ import time
2
+ from cobweb import log, Queue, Seed
3
+ from utils import issubclass_cobweb_interface
4
+
5
+ # from pympler import asizeof
6
+
7
+
8
+ class Scheduler:
9
+
10
+ def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
11
+
12
+ inf_name = "SchedulerInterface"
13
+ if not issubclass_cobweb_interface(self.__class__, inf_name):
14
+ raise Exception("not have schedule function!")
15
+
16
+ if self.__class__.__name__ == "Default":
17
+ self.stop = True
18
+ return None
19
+
20
+ while not self.stop:
21
+ length = ready_seed_length()
22
+ if length > self.size:
23
+ time.sleep(15)
24
+
25
+ elif get_scheduler_lock():
26
+ seeds = self.schedule()
27
+ add_seed(seeds)
28
+
29
+ log.info(f"close thread: schedule_seed")
30
+
31
+ def schedule_task(self, stop, get_seed, ready_seed_length):
32
+ time.sleep(3)
33
+ while not stop.is_set():
34
+
35
+ if not ready_seed_length():
36
+ time.sleep(15)
37
+ continue
38
+
39
+ if self.queue.length >= self.length:
40
+ time.sleep(3)
41
+ continue
42
+
43
+ seeds = get_seed(self.length)
44
+ self.queue.push(seeds)
45
+ log.info(f"close thread: schedule_task")
46
+
47
+
48
+ class Spider:
49
+
50
+ def __init__(self, queue, max_retries=5):
51
+ self.spider_in_progress = Queue()
52
+ self.max_retries = max_retries
53
+ self.queue = queue
54
+
55
+ def spider_task(self, stop, func, item, del_seed):
56
+ while not stop.is_set():
57
+ seed = self.queue.pop()
58
+ if not seed:
59
+ time.sleep(3)
60
+ continue
61
+ elif seed._retry >= self.max_retries:
62
+ del_seed(seed, spider_status=False)
63
+ continue
64
+ try:
65
+ self.spider_in_progress.push(1, direct_insertion=True)
66
+ # log.info("spider seed: " + str(seed))
67
+ status = None
68
+ for it in func(item, seed):
69
+ if getattr(it, "table_name", None):
70
+ store_queue = it.queue()
71
+ store_queue.push(
72
+ [seed, it.struct_data],
73
+ direct_insertion=True
74
+ )
75
+ elif isinstance(it, Seed):
76
+ self.queue.push(it)
77
+ elif any(isinstance(it, t) for t in (list, tuple)):
78
+ self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
79
+ elif isinstance(it, bool):
80
+ status = it
81
+ elif it is None:
82
+ status = False
83
+
84
+ if status is not None:
85
+ if status:
86
+ del_seed(seed, spider_status=True)
87
+ else:
88
+ seed._retry += 1
89
+ self.queue.push(seed)
90
+
91
+ except Exception as e:
92
+ seed._retry += 1
93
+ self.queue.push(seed)
94
+ log.info(f"{str(seed)} -> {str(e)}")
95
+ finally:
96
+ self.spider_in_progress.pop()
97
+ log.info(f"close thread: spider")
98
+
99
+
100
+ class Storer:
101
+
102
+ def store_task(self, stop, last, reset_seed, del_seed):
103
+
104
+ inf_name = "StorerInterface"
105
+ if not issubclass_cobweb_interface(self.__class__, inf_name):
106
+ return None
107
+
108
+ if not getattr(self, "store", None):
109
+ raise Exception("not have store function!")
110
+
111
+ storer_name = self.__class__.__name__ + self.table
112
+
113
+ while not stop.is_set():
114
+
115
+ if last.is_set() or self.queue.length >= self.length:
116
+ seeds, data_list = [], []
117
+
118
+ for _ in range(self.length):
119
+ items = self.queue.pop()
120
+ if not items:
121
+ break
122
+ seed, data = items
123
+ seeds.append(seed)
124
+ data_list.append(data)
125
+
126
+ if data_list:
127
+ if self.store(data_list):
128
+ del_seed(seeds)
129
+ else:
130
+ reset_seed(seeds)
131
+ log.info("reset seeds!")
132
+ continue
133
+
134
+ time.sleep(3)
135
+
136
+ log.info(f"close thread: {storer_name}")
@@ -0,0 +1,90 @@
1
+ import json
2
+ import re
3
+ import sys
4
+ from abc import ABC
5
+ from typing import Iterable
6
+ from importlib import import_module
7
+
8
+
9
+ def struct_table_name(table_name):
10
+ return table_name.replace(".", "__p__").replace(":", "__c__")
11
+
12
+
13
+ def restore_table_name(table_name):
14
+ return table_name.replace("__p__", ".").replace("__c__", ":")
15
+
16
+
17
+ def struct_queue_name(db_name, table_name):
18
+ return sys.intern(f"__{db_name}_{table_name}_queue__")
19
+
20
+
21
+ def parse_info(info):
22
+ if not info:
23
+ return info
24
+
25
+ if isinstance(info, dict):
26
+ return info
27
+
28
+ if isinstance(info, str):
29
+ return json.loads(info)
30
+
31
+ if isinstance(info, Iterable):
32
+ result = list()
33
+ for ii in info:
34
+ if isinstance(ii, str):
35
+ result.append(json.loads(ii))
36
+ elif isinstance(ii, dict):
37
+ result.append(ii)
38
+ else:
39
+ raise TypeError("must be in [str, dict]")
40
+
41
+ return result
42
+
43
+
44
+ def struct_start_seeds(seeds):
45
+ from .bbb import Seed
46
+ if not seeds:
47
+ return None
48
+ if any(isinstance(seeds, t) for t in (list, tuple)):
49
+ return [Seed(seed) for seed in seeds]
50
+ elif any(isinstance(seeds, t) for t in (str, dict)):
51
+ return Seed(seeds)
52
+
53
+
54
+ def issubclass_cobweb_interface(_class, inf_name):
55
+ for _c in _class.__mro__[1:]:
56
+ if _c.__name__ == inf_name:
57
+ return True
58
+ return False
59
+
60
+
61
+ def parse_import_model(model_info, model_type=None):
62
+ if model_type not in ["scheduler", "storer"]:
63
+ raise TypeError("model_type must be in scheduler, storer")
64
+ if isinstance(model_info, str):
65
+ if "import" in model_info:
66
+ model_path, class_name = re.search(
67
+ r"from (.*?) import (.*?)$", model_info
68
+ ).groups()
69
+ model = import_module(model_path)
70
+ class_object = getattr(model, class_name)
71
+ elif "." in model_info:
72
+ info_list = model_info.split(".")
73
+ class_name = info_list[-1]
74
+ model_path = ".".join(info_list[:-1])
75
+ model = import_module(model_path)
76
+ class_object = getattr(model, class_name)
77
+ else:
78
+ model_path = f"cobweb.db.{model_type}.{model_info.lower()}"
79
+ class_name = model_info.capitalize()
80
+ model = import_module(model_path)
81
+ class_object = getattr(model, class_name)
82
+ return class_object, class_name
83
+ elif issubclass(model_info, ABC):
84
+ inf_name = model_type.capitalize() + "Interface"
85
+ if issubclass_cobweb_interface(model_info, inf_name):
86
+ return model_info, model_info.__name__
87
+ raise ImportError()
88
+ raise TypeError()
89
+
90
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -24,8 +24,8 @@ cobweb/distributed/__init__.py
24
24
  cobweb/distributed/launcher.py
25
25
  cobweb/distributed/models.py
26
26
  cobweb/single/__init__.py
27
+ cobweb/single/launcher.py
27
28
  cobweb/single/models.py
28
- cobweb/single/nest.py
29
29
  cobweb_launcher.egg-info/PKG-INFO
30
30
  cobweb_launcher.egg-info/SOURCES.txt
31
31
  cobweb_launcher.egg-info/dependency_links.txt
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="cobweb-launcher",
8
- version="0.1.2",
8
+ version="0.1.4",
9
9
  packages=find_packages(),
10
10
  url="https://github.com/Juannie-PP/cobweb",
11
11
  license="MIT",
@@ -1,104 +0,0 @@
1
- import time
2
- # from pympler import asizeof
3
- from single.nest import Queue
4
- from single.nest import struct_queue_name
5
- from single.nest import SchedulerInterface, StorerInterface
6
-
7
-
8
- # class Transceiver:
9
- class Distributor:
10
-
11
- def __init__(self):
12
- self.seed_queue = Queue()
13
-
14
- @property
15
- def queue_names(self):
16
- return tuple(self.__dict__.keys())
17
-
18
- @property
19
- def used_memory(self):
20
- return asizeof.asizeof(self)
21
-
22
- def create_queue(self, queue_name: str):
23
- self.__setattr__(queue_name, Queue())
24
-
25
- def get_queue(self, queue_name: str):
26
- return self.__getattribute__(queue_name)
27
-
28
- def deal_item(self, item):
29
- icn = item.__class__.__name__
30
- if icn == "Seed":
31
- self.seed_queue.push(item)
32
- elif getattr(item, "table_name", None):
33
- queue_name = struct_queue_name(icn, item.table_name)
34
- getattr(self, queue_name).push(item.serialization)
35
-
36
- def distribute(self, callback, *args, **kwargs):
37
- iterable = callback(*args, **kwargs)
38
- if not iterable:
39
- return None
40
- for result in iterable:
41
- self.deal_item(result)
42
- return True
43
-
44
-
45
- class Scheduler:
46
-
47
- def schedule_task(self, distribute):
48
-
49
- if not issubclass(self.__class__, SchedulerInterface):
50
- return None
51
-
52
- if not getattr(self, "schedule", None):
53
- raise Exception("not have schedule function!")
54
-
55
- while not self.stop:
56
-
57
- if self.queue.length < self.length:
58
- distribute(self.schedule)
59
-
60
- else:
61
- print("------------")
62
- time.sleep(15)
63
-
64
-
65
- class Spider:
66
-
67
- def __init__(self, queue):
68
- self.queue = queue
69
- self.spider_in_progress = Queue()
70
-
71
- def spider_task(self, stop_event, distribute, func, item):
72
- while not stop_event.is_set():
73
- seed = self.queue.pop()
74
- if not seed:
75
- time.sleep(3)
76
- continue
77
- try:
78
- self.spider_in_progress.push(1)
79
- distribute(func, item, seed)
80
- except Exception as e:
81
- print(e)
82
- finally:
83
- self.spider_in_progress.pop()
84
-
85
-
86
- class Storer:
87
-
88
- def store_task(self, stop_event, last_event, distribute):
89
-
90
- if not issubclass(self.__class__, StorerInterface):
91
- return None
92
-
93
- if not getattr(self, "store", None):
94
- raise Exception("not have store function!")
95
-
96
- while not stop_event.is_set():
97
- if last_event.is_set() or self.queue.length > self.length:
98
- data_list = []
99
- data_length = min(self.queue.length, self.length)
100
- for _ in range(data_length):
101
- data = self.queue.pop()
102
- data_list.append(data)
103
- if data_list:
104
- distribute(self.store, data_list)
@@ -1,153 +0,0 @@
1
- import time
2
- import threading
3
-
4
- from single.nest import Seed, DBItem
5
- from single.nest import struct_queue_name, restore_table_name
6
- from single.nest import Distributor, Scheduler, Spider, Storer
7
-
8
-
9
- def init_task_seed(seeds):
10
- if not seeds:
11
- return None
12
- if isinstance(seeds, list) or isinstance(seeds, tuple):
13
- for seed in seeds:
14
- yield Seed(seed)
15
- elif isinstance(seeds, str) or isinstance(seeds, dict):
16
- yield Seed(seeds)
17
-
18
-
19
- def parse_storer_info(storer_info):
20
- storer_data = {}
21
- storer_info_list = []
22
- if storer_info.__class__.__name__ == 'StorerInfo':
23
- storer_info_list.append(storer_info)
24
- elif isinstance(storer_info, tuple) or isinstance(storer_info, list):
25
- storer_info_list = storer_info
26
- for info in storer_info_list:
27
- db_name = info.DB.__name__
28
- storer_data.setdefault(db_name, {"StorerDB": info.DB, "db_args_list": []})
29
- storer_data[db_name]["db_args_list"].append(info[1:])
30
- return storer_data
31
-
32
-
33
- def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
34
- while True:
35
- time.sleep(3)
36
- if (
37
- scheduler.stop and
38
- not distributor.seed_queue.length and
39
- not spider.spider_in_progress.length
40
- ):
41
- last_event.set()
42
- time.sleep(10)
43
- storer_queue_empty = True
44
- for storer in storer_list:
45
- if storer.queue.length:
46
- storer_queue_empty = False
47
- break
48
- if storer_queue_empty:
49
- break
50
- last_event.clear()
51
- stop_event.set()
52
-
53
-
54
- def cobweb(task):
55
- """
56
- 任务启动装饰器
57
- :param task: 任务配置信息
58
- """
59
- def decorator(func):
60
- """
61
- func(Item, seed)
62
- Item:
63
- Item.Textfile()
64
- Item.Console()
65
- """
66
- # project task_name start_seed spider_num queue_length scheduler_info storer_info
67
-
68
- storer_list = []
69
-
70
- # 程序结束事件
71
- last_event = threading.Event()
72
- # 暂停采集事件
73
- stop_event = threading.Event()
74
-
75
- # 创建分发器
76
- distributor = Distributor()
77
-
78
- # 调度器动态继承
79
- SchedulerDB, table, sql, length, size = task.SchedulerInfo
80
- SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})
81
-
82
- # 初始化调度器
83
- scheduler = SchedulerTmp(table=table, sql=sql, length=length, size=size, queue=distributor.seed_queue)
84
-
85
- # 初始化采集器
86
- spider = Spider(queue=distributor.seed_queue)
87
-
88
- # 解析存储器信息
89
- storer_data = parse_storer_info(task.storer_info)
90
-
91
- # sds
92
- item = type("item", (object,), {})
93
- for db_name in storer_data.keys():
94
- # 存储器动态继承
95
- StorerDB = storer_data[db_name]["StorerDB"]
96
- StorerTmp = type('Storer', (Storer, StorerDB), {})
97
- db_args_list = storer_data[db_name]["db_args_list"]
98
- for storer_db_args in db_args_list:
99
- table, fields, length = storer_db_args
100
- if not getattr(item, db_name, None):
101
- instance = type(db_name, (DBItem,), {})
102
- setattr(item, db_name, instance)
103
- # 创建存储xxx
104
- getattr(item, db_name).init_item(table, fields)
105
- # 创建存储队列
106
- storer_queue = struct_queue_name(db_name, table)
107
- distributor.create_queue(queue_name=storer_queue)
108
- queue = distributor.get_queue(queue_name=storer_queue)
109
- # 初始话存储器
110
- table_name = restore_table_name(table_name=table)
111
- storer = StorerTmp(table=table_name, fields=fields, length=length, queue=queue)
112
- storer_list.append(storer)
113
-
114
- # 推送初始种子
115
- distributor.distribute(init_task_seed, seeds=task.start_seed)
116
-
117
- # 启动调度器
118
- threading.Thread(
119
- target=scheduler.schedule_task,
120
- args=(distributor.distribute,),
121
- name="single_scheduler_task"
122
- ).start()
123
-
124
- # 启动采集器
125
- for index in range(task.spider_num):
126
- threading.Thread(
127
- target=spider.spider_task,
128
- args=(stop_event, distributor.distribute, func, item),
129
- name=f"single_spider_task:{index}"
130
- ).start()
131
-
132
- # 启动存储器
133
- for storer in storer_list:
134
- threading.Thread(
135
- target=storer.store_task,
136
- args=(stop_event, last_event, distributor.distribute),
137
- name=f"single_store_task:{storer.table}",
138
- ).start()
139
-
140
- threading.Thread(
141
- target=check, name="check",
142
- args=(
143
- stop_event, last_event, distributor,
144
- scheduler, spider, storer_list
145
- )
146
- ).start()
147
-
148
- # return starter(task, func)
149
- return decorator
150
-
151
-
152
-
153
-
@@ -1,88 +0,0 @@
1
- import json
2
- import sys
3
- from typing import Iterable
4
-
5
- import requests
6
-
7
-
8
- # from cobweb import Seed
9
-
10
-
11
- def struct_table_name(table_name):
12
- return table_name.replace(".", "__p__").replace(":", "__c__")
13
-
14
-
15
- def restore_table_name(table_name):
16
- return table_name.replace("__p__", ".").replace("__c__", ":")
17
-
18
-
19
- def struct_queue_name(db_name, table_name):
20
- return sys.intern(f"__{db_name}_{table_name}_queue__")
21
-
22
-
23
- # class StorerDB:
24
- #
25
- # @staticmethod
26
- # def console(self):
27
- # from db.storer.console import Console
28
- # table = struct_table_name(table)
29
- # return StorerInfo(DB=Console, table=table, length=length, config=None)
30
- #
31
- # @staticmethod
32
- # def textfile(table, length=200):
33
- # from db.storer.textfile import Textfile
34
- # table = struct_table_name(table)
35
- # return StorerInfo(DB=Textfile, table=table, length=length, config=None)
36
- #
37
- # @staticmethod
38
- # def loghub(table, length=200, config=None):
39
- # from db.storer.loghub import Loghub
40
- # table = struct_table_name(table)
41
- # return StorerInfo(DB=Loghub, table=table, length=length, config=config)
42
-
43
-
44
- def parse_info(info):
45
- if not info:
46
- return info
47
-
48
- if isinstance(info, dict):
49
- return info
50
-
51
- if isinstance(info, str):
52
- return json.loads(info)
53
-
54
- if isinstance(info, Iterable):
55
- result = list()
56
- for ii in info:
57
- if isinstance(ii, str):
58
- result.append(json.loads(ii))
59
- elif isinstance(ii, dict):
60
- result.append(ii)
61
- else:
62
- raise TypeError("must be in [str, dict]")
63
-
64
- return result
65
-
66
-
67
- def struct_start_seeds(seeds):
68
- from .bbb import Seed
69
- if not seeds:
70
- return None
71
- if any(isinstance(seeds, t) for t in (list, tuple)):
72
- return [Seed(seed) for seed in seeds]
73
- elif any(isinstance(seeds, t) for t in (str, dict)):
74
- return Seed(seeds)
75
-
76
-
77
- # def get_storer_db(db):
78
- #
79
- # if isinstance(db, str):
80
- # model = import_module(f" db.storer.{db.lower()}")
81
- #
82
- # # if db.lower() in dir(StorerDB):
83
- # # return getattr(StorerDB, db)
84
- # # else:
85
- # # pass
86
-
87
-
88
-
File without changes