cobweb-launcher 0.1.2__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/PKG-INFO +1 -1
  2. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/__init__.py +1 -0
  3. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/redis_db.py +5 -3
  4. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/launcher.py +14 -51
  5. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/models.py +7 -3
  6. cobweb-launcher-0.1.4/cobweb/single/launcher.py +231 -0
  7. cobweb-launcher-0.1.4/cobweb/single/models.py +136 -0
  8. cobweb-launcher-0.1.4/cobweb/utils.py +90 -0
  9. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/PKG-INFO +1 -1
  10. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/SOURCES.txt +1 -1
  11. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/setup.py +1 -1
  12. cobweb-launcher-0.1.2/cobweb/single/models.py +0 -104
  13. cobweb-launcher-0.1.2/cobweb/single/nest.py +0 -153
  14. cobweb-launcher-0.1.2/cobweb/utils.py +0 -88
  15. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/LICENSE +0 -0
  16. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/README.md +0 -0
  17. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/bbb.py +0 -0
  18. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/__init__.py +0 -0
  19. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/oss_db.py +0 -0
  20. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/__init__.py +0 -0
  21. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/default.py +0 -0
  22. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/textfile.py +0 -0
  23. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/__init__.py +0 -0
  24. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/console.py +0 -0
  25. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/loghub.py +0 -0
  26. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/redis.py +0 -0
  27. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/textfile.py +0 -0
  28. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/decorators.py +0 -0
  29. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/__init__.py +0 -0
  30. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/interface.py +0 -0
  31. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/log.py +0 -0
  32. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/setting.py +0 -0
  33. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/single/__init__.py +0 -0
  34. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/task.py +0 -0
  35. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  36. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/requires.txt +0 -0
  37. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/top_level.txt +0 -0
  38. {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -5,6 +5,7 @@ from .interface import SchedulerInterface, StorerInterface
5
5
  from .db.redis_db import RedisDB
6
6
  from .db.oss_db import OssDB
7
7
  from .distributed.launcher import launcher
8
+ from .single.launcher import launcher as single_launcher
8
9
  from . import setting
9
10
 
10
11
 
@@ -151,17 +151,19 @@ class RedisDB:
151
151
  @check_redis_status
152
152
  def check_spider_queue(self, stop, storer_num):
153
153
  while not stop.is_set():
154
- # 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为15s
154
+ # 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
155
155
  if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
156
156
  heartbeat = True if self.client.exists(self.heartbeat_key) else False
157
- # 重启重制score值,否则获取n分钟前的分数值
157
+ # 重启重制score值,否则获取${rs_time}分钟前的分数值
158
158
  score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
159
159
 
160
160
  keys = self.client.keys(self.storer_key % "*")
161
+
161
162
  if keys and len(keys) >= storer_num:
162
163
  intersection_key = self.storer_key % "intersection"
163
164
  self.client.delete(intersection_key)
164
165
  self.client.zinterstore(intersection_key, keys)
166
+
165
167
  while True:
166
168
  members = self.client.zrange(intersection_key, 0, 1999)
167
169
  if not members:
@@ -192,7 +194,7 @@ class RedisDB:
192
194
  self.client.setex(self.heartbeat_key, 15, "")
193
195
 
194
196
  # self.client.delete(self.check_lock)
195
- time.sleep(3)
197
+ # time.sleep(3)
196
198
 
197
199
  @check_redis_status
198
200
  def set_heartbeat(self, stop):
@@ -1,50 +1,15 @@
1
1
  import time
2
2
  import threading
3
3
  from threading import Thread
4
- from importlib import import_module
5
4
 
6
- from cobweb import log, Queue, DBItem, RedisDB, OssDB, StorerInterface
7
- from cobweb.utils import struct_queue_name, restore_table_name
8
- from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
9
5
  from .models import Scheduler, Spider, Storer
10
-
11
-
12
- def get_scheduler_db(db):
13
- if isinstance(db, str):
14
- if "." in db:
15
- model_path = db.split(".")
16
- model = import_module(db)
17
- obj = getattr(model, db)
18
- else:
19
- model = import_module(f"cobweb.db.scheduler.{db.lower()}")
20
- obj = getattr(model, db.capitalize())
21
- return obj
22
- # if db.lower() in dir(StorerDB):
23
- # return getattr(StorerDB, db)
24
- # else:
25
- # pass
26
- elif issubclass(db, StorerInterface):
27
- return db
28
- raise TypeError()
29
-
30
-
31
- def get_storer_db(db):
32
- if isinstance(db, str):
33
- if "." in db:
34
- model_path = db.split(".")
35
- model = import_module(db)
36
- obj = getattr(model, db)
37
- else:
38
- model = import_module(f"cobweb.db.storer.{db.lower()}")
39
- obj = getattr(model, db.capitalize())
40
- return obj, db.lower()
41
- # if db.lower() in dir(StorerDB):
42
- # return getattr(StorerDB, db)
43
- # else:
44
- # pass
45
- elif issubclass(db, StorerInterface):
46
- return db, db.__name__.lower()
47
- raise TypeError()
6
+ from cobweb import log, Queue, DBItem, RedisDB
7
+ from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
8
+ from cobweb.utils import (
9
+ struct_queue_name as sqn,
10
+ restore_table_name as rtn,
11
+ parse_import_model as pim,
12
+ )
48
13
 
49
14
 
50
15
  def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
@@ -164,9 +129,9 @@ def launcher(task):
164
129
  size = task.scheduler_info.get("size")
165
130
  scheduler_config = task.scheduler_info.get("config")
166
131
  scheduler_db = task.scheduler_info.get("db", "default")
167
- DB = get_scheduler_db(scheduler_db)
132
+ DB, class_name = pim(scheduler_db, "scheduler")
168
133
  # SchedulerDB, table, sql, length, size, config = task.scheduler_info
169
- SchedulerTmp = type(DB.__name__, (Scheduler, DB), {})
134
+ SchedulerTmp = type(class_name, (Scheduler, DB), {})
170
135
 
171
136
  # 初始化调度器
172
137
  scheduler = SchedulerTmp(
@@ -185,18 +150,16 @@ def launcher(task):
185
150
  # new item
186
151
  item = type("Item", (object,), {"redis_client": redis_db.client})()
187
152
 
188
- if task.oss_config:
189
- item.oss = OssDB(**task.oss_config)
190
-
191
153
  for storer_info in storer_info_list:
192
154
  storer_db = storer_info["db"]
193
155
  fields = storer_info["fields"]
194
156
  storer_table = storer_info.get("table", "console")
195
157
  storer_config = storer_info.get("config")
196
158
 
197
- StorerDB, db_name = get_storer_db(storer_db)
198
- StorerTmp = type(StorerDB.__name__, (Storer, StorerDB), {})
159
+ StorerDB, class_name = pim(storer_db, "storer")
160
+ StorerTmp = type(class_name, (Storer, StorerDB), {})
199
161
 
162
+ db_name = class_name.lower()
200
163
  if not getattr(item, db_name, None):
201
164
  instance = type(db_name, (DBItem,), {})
202
165
  setattr(item, db_name, instance)
@@ -204,10 +167,10 @@ def launcher(task):
204
167
  storer_item_instance = getattr(item, db_name)
205
168
  storer_item_instance.init_item(storer_table, fields)
206
169
 
207
- storer_queue = struct_queue_name(db_name, storer_table)
170
+ storer_queue = sqn(db_name, storer_table)
208
171
  queue = getattr(storer_item_instance, storer_queue)
209
172
  # 初始话存储器
210
- table_name = restore_table_name(table_name=storer_table)
173
+ table_name = rtn(table_name=storer_table)
211
174
  storer = StorerTmp(
212
175
  table=table_name, fields=fields,
213
176
  length=task.storer_queue_length,
@@ -1,6 +1,8 @@
1
1
  import time
2
2
  from hashlib import md5
3
- from cobweb import log, Queue, Seed, StorerInterface, SchedulerInterface
3
+ from cobweb import log, Queue, Seed
4
+ from utils import issubclass_cobweb_interface
5
+
4
6
  # from pympler import asizeof
5
7
 
6
8
 
@@ -8,7 +10,8 @@ class Scheduler:
8
10
 
9
11
  def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
10
12
 
11
- if not issubclass(self.__class__, SchedulerInterface):
13
+ inf_name = "SchedulerInterface"
14
+ if not issubclass_cobweb_interface(self.__class__, inf_name):
12
15
  raise Exception("not have schedule function!")
13
16
 
14
17
  if self.__class__.__name__ == "Default":
@@ -103,7 +106,8 @@ class Storer:
103
106
 
104
107
  def store_task(self, stop, last, reset_seed, set_storer):
105
108
 
106
- if not issubclass(self.__class__, StorerInterface):
109
+ inf_name = "StorerInterface"
110
+ if not issubclass_cobweb_interface(self.__class__, inf_name):
107
111
  return None
108
112
 
109
113
  if not getattr(self, "store", None):
@@ -0,0 +1,231 @@
1
+ import time
2
+ import threading
3
+ from threading import Thread
4
+
5
+ from .models import Scheduler, Spider, Storer
6
+ from cobweb import log, Queue, DBItem, RedisDB
7
+ from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
8
+ from cobweb.utils import (
9
+ struct_queue_name as sqn,
10
+ restore_table_name as rtn,
11
+ parse_import_model as pim,
12
+ )
13
+
14
+
15
def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
    """
    Monitor loop that decides when the crawl is finished and logs queue sizes.

    Every iteration samples the in-memory and redis queue lengths.  Once the
    scheduler has stopped and no seed is queued or in progress, ``last`` is
    set, and after a short pause the storer queue and the redis lengths
    sampled at the top of the iteration are checked.  If all are empty the
    loop either waits for new seeds (``MODEL`` truthy) or terminates
    (``MODEL`` falsy).  After the loop ends, ``stop`` is set.

    :param stop: ``threading.Event`` set once when the monitor loop exits
    :param last: ``threading.Event`` set while the final flush is in progress
    :param spider: object exposing ``spider_in_progress.length``
    :param scheduler: object exposing ``stop`` and ``queue.length``
    :param storer: object exposing ``queue.length``, or ``None`` when the task
        has no storer configured
    :param ready_seed_length: callable returning the redis ready-seed count
    :param spider_queue_length: callable returning the redis spider-seed count
    """

    def storer_queue_length():
        # ``launcher`` passes ``storer=None`` when ``task.storer_info`` is
        # empty; treat that as a permanently empty upload queue instead of
        # failing with AttributeError on ``None.queue``.
        return storer.queue.length if storer is not None else 0

    log.info("run check thread after 30 seconds...")
    time.sleep(30)
    spider_info = """
------------------- check: {0} ------------------
redis_spider_seed_length: {1}
redis_ready_seed_length: {2}
running_spider_thread_num: {3}
memory_seed_queue_length: {4}
storer_queue_length_info: {5}
----------------------- end -----------------------"""
    while True:
        status = "running"
        running_spider_thread_num = spider.spider_in_progress.length
        redis_ready_seed_length = ready_seed_length()
        redis_spider_seed_length = spider_queue_length()
        memory_seed_queue_length = scheduler.queue.length
        storer_upload_queue_length = storer_queue_length()
        if (
            scheduler.stop and
            # not redis_ready_seed_length and
            not memory_seed_queue_length and
            not running_spider_thread_num
        ):
            if not MODEL:
                log.info("spider is done?")
            # Signal the final flush, then give consumers a moment to drain.
            last.set()
            time.sleep(3)
            storer_upload_queue_length = storer_queue_length()
            storer_queue_empty = not storer_upload_queue_length
            if (
                storer_queue_empty and
                not redis_ready_seed_length and
                not redis_spider_seed_length
            ):
                if MODEL:
                    log.info("waiting for push seeds...")
                    status = "waiting"
                    time.sleep(30)
                else:
                    log.info("spider done!")
                    break

            last.clear()

        log.info(spider_info.format(
            status,
            redis_spider_seed_length,
            redis_ready_seed_length,
            running_spider_thread_num,
            memory_seed_queue_length,
            storer_upload_queue_length
        ))

        time.sleep(3)
    stop.set()
73
+
74
+
75
def launcher(task):
    """
    Task launch decorator.

    Returns a decorator that, when applied to a crawl callback, builds the
    redis client, scheduler, spider and (optionally) one storer from ``task``
    and starts all worker threads immediately.

    :param task: task configuration; this function reads ``project``,
        ``task_name``, ``redis_info``, ``scheduler_info``, ``storer_info``,
        ``scheduler_queue_length``, ``storer_queue_length``, ``max_retries``,
        ``spider_num`` and ``seeds`` from it
    :return: the ``decorator`` closure
    """
    def decorator(func):
        """
        Item:
            Textfile()
            Loghub()
            Console()
        e.g.
            task.fields = "a,b"
            func(item, seed)
                a = "a"
                b = "b"
                data = {"a": "a", "b": "b"}
                yield item.Loghub(**data)
                yield item.Loghub(a=a, b=b)

        NOTE(review): this function has no ``return`` statement, so the
        decorated name is bound to ``None`` - confirm that is intended.
        """
        # NOTE(review): nothing is appended to this list in this function, so
        # ``len(storer_list)`` below is always 0.
        storer_list = []

        # Event marking the final flush before the program ends
        last = threading.Event()
        # Event telling worker threads to stop crawling
        stop = threading.Event()

        # Initialise the redis connection info
        redis_db = RedisDB(
            task.project, task.task_name, task.redis_info,
            model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
        )

        # new item: an anonymous object carrying the redis client; storer
        # item classes are attached to it as attributes further down
        item = type("Item", (object,), {"redis_client": redis_db.client})()

        log.info("初始化cobweb!")

        seed_queue = Queue()

        scheduler_info = task.scheduler_info or dict()

        # Dynamic inheritance for the scheduler: combine the Scheduler mixin
        # with the configured scheduler DB class
        sql = scheduler_info.get("sql")
        table = scheduler_info.get("table")
        size = scheduler_info.get("size")
        scheduler_config = scheduler_info.get("config")
        scheduler_db = scheduler_info.get("db", "default")
        DB, class_name = pim(scheduler_db, "scheduler")
        # SchedulerDB, table, sql, length, size, config = task.scheduler_info
        SchedulerTmp = type(class_name, (Scheduler, DB), {})

        # Initialise the scheduler
        scheduler = SchedulerTmp(
            table=table, sql=sql, size=size, queue=seed_queue,
            length=task.scheduler_queue_length, config=scheduler_config
        )

        # Initialise the spider; it shares seed_queue with the scheduler
        spider = Spider(seed_queue, task.max_retries)

        # Stays None when the task has no storer configuration
        storer = None

        # Parse the storer configuration
        storer_info = task.storer_info or dict()

        # for storer_info in storer_info_list:
        if storer_info:
            storer_db = storer_info["db"]
            fields = storer_info["fields"]
            storer_table = storer_info.get("table", "console")
            storer_config = storer_info.get("config")

            StorerDB, class_name = pim(storer_db, "storer")
            StorerTmp = type(class_name, (Storer, StorerDB), {})

            # Attach a DBItem subclass named after the storer to ``item``
            # unless one is already present
            db_name = class_name.lower()
            if not getattr(item, db_name, None):
                instance = type(db_name, (DBItem,), {})
                setattr(item, db_name, instance)

            storer_item_instance = getattr(item, db_name)
            storer_item_instance.init_item(storer_table, fields)

            # Look up the queue attribute on the item class by its derived name
            storer_queue = sqn(db_name, storer_table)
            queue = getattr(storer_item_instance, storer_queue)
            # Initialise the storer
            table_name = rtn(table_name=storer_table)
            storer = StorerTmp(
                table=table_name, fields=fields,
                length=task.storer_queue_length,
                queue=queue, config=storer_config
            )

        Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
        Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

        # Push the initial seeds
        # seeds = start_seeds(task.start_seed)
        redis_db.add_seed(task.seeds)
        # Start the scheduler thread that schedules seeds into the redis queue
        Thread(
            # name="xxxx_schedule_seeds",
            target=scheduler.schedule_seed,
            args=(
                redis_db.ready_seed_length,
                redis_db.get_scheduler_lock,
                redis_db.add_seed
            )
        ).start()

        # Start the scheduler thread that feeds the in-memory task queue
        Thread(
            # name="xxxx_schedule_task",
            target=scheduler.schedule_task,
            args=(
                stop, redis_db.get_seed,
                redis_db.ready_seed_length
            )
        ).start()

        # Start the spider threads
        for index in range(task.spider_num):
            Thread(
                # name=f"xxxx_spider_task:{index}",
                target=spider.spider_task,
                args=(
                    stop, func, item,
                    redis_db.del_seed
                )
            ).start()

        # Start the storer thread (only when a storer was configured)
        if storer:
            Thread(
                # name=f"xxxx_store_task:{storer.table}",
                target=storer.store_task,
                args=(
                    stop, last,
                    redis_db.reset_seed,
                    redis_db.set_storer
                )
            ).start()

        # Start the monitor thread; note that ``storer`` may be None here
        Thread(
            # name="check_spider",
            target=check,
            args=(
                stop, last, spider,
                scheduler, storer,
                redis_db.ready_seed_length,
                redis_db.spider_queue_length,
            )
        ).start()

    return decorator
231
+
@@ -0,0 +1,136 @@
1
+ import time
2
+ from cobweb import log, Queue, Seed
3
+ from utils import issubclass_cobweb_interface
4
+
5
+ # from pympler import asizeof
6
+
7
+
8
class Scheduler:
    """
    Mixin providing the two scheduler worker loops.

    It is combined with a scheduler DB class at runtime; the loops read
    ``self.stop``, ``self.size``, ``self.length``, ``self.queue`` and call
    ``self.schedule()`` on the combined instance.
    """

    def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
        """
        Produce seeds with ``self.schedule()`` and hand them to ``add_seed``.

        :param ready_seed_length: callable returning the current ready-seed count
        :param get_scheduler_lock: callable; seeds are produced only when it returns truthy
        :param add_seed: callable receiving the seeds returned by ``self.schedule()``
        :raises Exception: when the class has no ``SchedulerInterface`` base
        """
        cls = type(self)
        if not issubclass_cobweb_interface(cls, "SchedulerInterface"):
            raise Exception("not have schedule function!")

        # The "Default" scheduler produces nothing: flag it stopped and leave.
        if cls.__name__ == "Default":
            self.stop = True
            return None

        while True:
            if self.stop:
                break
            # Back off while more seeds are ready than the configured size.
            if ready_seed_length() > self.size:
                time.sleep(15)
                continue
            if get_scheduler_lock():
                add_seed(self.schedule())

        log.info("close thread: schedule_seed")

    def schedule_task(self, stop, get_seed, ready_seed_length):
        """
        Move seeds from the ready pool into the in-memory queue until ``stop`` is set.

        :param stop: ``threading.Event`` ending the loop
        :param get_seed: callable taking a count and returning seeds
        :param ready_seed_length: callable returning the current ready-seed count
        """
        time.sleep(3)
        while not stop.is_set():
            if ready_seed_length():
                if self.queue.length < self.length:
                    self.queue.push(get_seed(self.length))
                else:
                    # In-memory queue is full; retry shortly.
                    time.sleep(3)
            else:
                # Nothing ready yet; poll at a slower pace.
                time.sleep(15)
        log.info("close thread: schedule_task")
46
+
47
+
48
class Spider:
    """Worker that pops seeds from a shared queue and runs the crawl callback on them."""

    def __init__(self, queue, max_retries=5):
        """
        :param queue: seed queue to consume from (and push follow-up seeds to)
        :param max_retries: retry count at which a seed is dropped as failed
        """
        self.queue = queue
        self.max_retries = max_retries
        # Holds one entry per seed currently being processed.
        self.spider_in_progress = Queue()

    def _route(self, seed, produced, outcome):
        """Dispatch one value yielded by the callback; return the updated status."""
        if getattr(produced, "table_name", None):
            # Storable item: forward it together with its originating seed.
            produced.queue().push(
                [seed, produced.struct_data],
                direct_insertion=True
            )
            return outcome
        if isinstance(produced, Seed):
            self.queue.push(produced)
            return outcome
        if isinstance(produced, (list, tuple)):
            self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in produced])
            return outcome
        if isinstance(produced, bool):
            return produced
        if produced is None:
            return False
        return outcome

    def spider_task(self, stop, func, item, del_seed):
        """
        Consume seeds until ``stop`` is set.

        :param stop: ``threading.Event`` ending the loop
        :param func: callback invoked as ``func(item, seed)``; its yielded values are dispatched by type
        :param item: object passed through to ``func``
        :param del_seed: callable invoked as ``del_seed(seed, spider_status=...)``
        """
        while not stop.is_set():
            seed = self.queue.pop()
            if not seed:
                time.sleep(3)
                continue
            if seed._retry >= self.max_retries:
                # Retry budget exhausted: report the seed as failed.
                del_seed(seed, spider_status=False)
                continue
            try:
                self.spider_in_progress.push(1, direct_insertion=True)
                # log.info("spider seed: " + str(seed))
                outcome = None
                for produced in func(item, seed):
                    outcome = self._route(seed, produced, outcome)

                if outcome is True:
                    del_seed(seed, spider_status=True)
                elif outcome is False:
                    seed._retry += 1
                    self.queue.push(seed)

            except Exception as exc:
                seed._retry += 1
                self.queue.push(seed)
                log.info(f"{seed!s} -> {exc!s}")
            finally:
                self.spider_in_progress.pop()
        log.info("close thread: spider")
98
+
99
+
100
class Storer:
    """
    Mixin providing the storer worker loop.

    It is combined with a storer DB class at runtime; the loop reads
    ``self.table``, ``self.length``, ``self.queue`` and calls ``self.store()``.
    """

    def _take_batch(self):
        """Pop up to ``self.length`` queued ``(seed, data)`` pairs; return two parallel lists."""
        seeds, data_list = [], []
        remaining = self.length
        while remaining > 0:
            remaining -= 1
            entry = self.queue.pop()
            if not entry:
                break
            seed, data = entry
            seeds.append(seed)
            data_list.append(data)
        return seeds, data_list

    def store_task(self, stop, last, reset_seed, del_seed):
        """
        Flush queued data in batches until ``stop`` is set.

        :param stop: ``threading.Event`` ending the loop
        :param last: ``threading.Event``; while set, batches are flushed even when not full
        :param reset_seed: callable receiving the seeds of a batch whose store failed
        :param del_seed: callable receiving the seeds of a batch that was stored
        :raises Exception: when the instance has no ``store`` attribute
        """
        cls = type(self)
        if not issubclass_cobweb_interface(cls, "StorerInterface"):
            return None

        if not getattr(self, "store", None):
            raise Exception("not have store function!")

        storer_name = cls.__name__ + self.table

        while not stop.is_set():
            flush_due = last.is_set() or self.queue.length >= self.length
            if flush_due:
                seeds, data_list = self._take_batch()
                if data_list:
                    if self.store(data_list):
                        del_seed(seeds)
                    else:
                        reset_seed(seeds)
                        log.info("reset seeds!")
                    # A batch was handled: loop again without sleeping.
                    continue

            time.sleep(3)

        log.info(f"close thread: {storer_name}")
@@ -0,0 +1,90 @@
1
+ import json
2
+ import re
3
+ import sys
4
+ from abc import ABC
5
+ from typing import Iterable
6
+ from importlib import import_module
7
+
8
+
9
def struct_table_name(table_name):
    """Encode a table name by replacing ``.`` with ``__p__`` and ``:`` with ``__c__``."""
    encoded = table_name
    for raw, token in ((".", "__p__"), (":", "__c__")):
        encoded = encoded.replace(raw, token)
    return encoded
11
+
12
+
13
def restore_table_name(table_name):
    """Decode a table name by replacing ``__p__`` with ``.`` and ``__c__`` with ``:``."""
    decoded = table_name
    for token, raw in (("__p__", "."), ("__c__", ":")):
        decoded = decoded.replace(token, raw)
    return decoded
15
+
16
+
17
def struct_queue_name(db_name, table_name):
    """Return the interned queue attribute name ``__<db_name>_<table_name>_queue__``."""
    queue_name = "__{}_{}_queue__".format(db_name, table_name)
    return sys.intern(queue_name)
19
+
20
+
21
def parse_info(info):
    """
    Normalise configuration given as a dict, a JSON string, or an iterable of those.

    :param info: falsy value, dict, JSON string, or iterable of dicts / JSON strings
    :return: ``info`` unchanged when falsy or a dict; the decoded JSON for a
        string; a list of dicts for an iterable; ``None`` for any other type
    :raises TypeError: when an iterable element is neither str nor dict
    """
    if not info or isinstance(info, dict):
        return info

    if isinstance(info, str):
        return json.loads(info)

    if not isinstance(info, Iterable):
        return None

    parsed = []
    for entry in info:
        if isinstance(entry, str):
            parsed.append(json.loads(entry))
        elif isinstance(entry, dict):
            parsed.append(entry)
        else:
            raise TypeError("must be in [str, dict]")
    return parsed
42
+
43
+
44
def struct_start_seeds(seeds):
    """
    Wrap the task's start seeds in ``Seed`` objects.

    :param seeds: falsy value, a single ``str``/``dict``, or a ``list``/``tuple`` of seed values
    :return: ``None`` for falsy or unsupported input, a list of ``Seed`` for a
        list/tuple, or a single ``Seed`` for a str/dict
    """
    if not seeds:
        return None

    # Imported lazily, and only once there is something to wrap, so that
    # empty input does not trigger the package import.
    from .bbb import Seed

    if isinstance(seeds, (list, tuple)):
        return [Seed(seed) for seed in seeds]
    if isinstance(seeds, (str, dict)):
        return Seed(seeds)
    return None
52
+
53
+
54
def issubclass_cobweb_interface(_class, inf_name):
    """
    Report whether any base class of ``_class`` is named ``inf_name``.

    The check is by class name over ``_class.__mro__`` excluding ``_class``
    itself, so no reference to the interface class is required.

    :param _class: class to inspect
    :param inf_name: name of the interface class to look for
    :return: ``True`` when a base with that name exists, else ``False``
    """
    ancestors = _class.__mro__[1:]
    return any(base.__name__ == inf_name for base in ancestors)
59
+
60
+
61
def parse_import_model(model_info, model_type=None):
    """
    Resolve a scheduler/storer class from a string spec or a class object.

    Accepted string forms:

    * ``"from pkg.mod import ClassName"``
    * ``"pkg.mod.ClassName"``
    * ``"name"`` - resolved as class ``Name`` in ``cobweb.db.<model_type>.<name>``

    A string is treated as an import statement only when it actually has the
    ``from ... import ...`` shape.  A dotted path that merely contains the
    substring "import" (e.g. ``importlib.abc.Loader``) is handled as a dotted
    path rather than failing on a missing regex match.

    :param model_info: string spec, or a class derived from ``ABC`` that has a
        base named ``<Model_type>Interface``
    :param model_type: ``"scheduler"`` or ``"storer"``
    :return: ``(class_object, class_name)``
    :raises TypeError: for an invalid ``model_type`` or unsupported ``model_info``
    :raises ImportError: when an ``ABC`` subclass lacks the expected interface
        base, or when the module named by a string spec cannot be imported
    :raises AttributeError: when the imported module has no such class
    """
    if model_type not in ("scheduler", "storer"):
        raise TypeError("model_type must be in scheduler, storer")

    if isinstance(model_info, str):
        spec = model_info.strip()
        statement = re.match(r"from\s+(\S+)\s+import\s+(\w+)$", spec)
        if statement:
            model_path, class_name = statement.groups()
        elif "." in spec:
            # Everything before the last dot is the module, the rest the class.
            model_path, _, class_name = spec.rpartition(".")
        else:
            model_path = f"cobweb.db.{model_type}.{spec.lower()}"
            class_name = spec.capitalize()
        class_object = getattr(import_module(model_path), class_name)
        return class_object, class_name

    # Guard with isinstance(..., type): issubclass() itself raises on
    # non-class arguments, which would hide the real problem.
    if isinstance(model_info, type) and issubclass(model_info, ABC):
        inf_name = model_type.capitalize() + "Interface"
        if issubclass_cobweb_interface(model_info, inf_name):
            return model_info, model_info.__name__
        raise ImportError(f"{model_info.__name__} does not derive from {inf_name}")

    raise TypeError("model_info must be an import string or an ABC subclass")
89
+
90
+
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -24,8 +24,8 @@ cobweb/distributed/__init__.py
24
24
  cobweb/distributed/launcher.py
25
25
  cobweb/distributed/models.py
26
26
  cobweb/single/__init__.py
27
+ cobweb/single/launcher.py
27
28
  cobweb/single/models.py
28
- cobweb/single/nest.py
29
29
  cobweb_launcher.egg-info/PKG-INFO
30
30
  cobweb_launcher.egg-info/SOURCES.txt
31
31
  cobweb_launcher.egg-info/dependency_links.txt
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="cobweb-launcher",
8
- version="0.1.2",
8
+ version="0.1.4",
9
9
  packages=find_packages(),
10
10
  url="https://github.com/Juannie-PP/cobweb",
11
11
  license="MIT",
@@ -1,104 +0,0 @@
1
- import time
2
- # from pympler import asizeof
3
- from single.nest import Queue
4
- from single.nest import struct_queue_name
5
- from single.nest import SchedulerInterface, StorerInterface
6
-
7
-
8
- # class Transceiver:
9
- class Distributor:
10
-
11
- def __init__(self):
12
- self.seed_queue = Queue()
13
-
14
- @property
15
- def queue_names(self):
16
- return tuple(self.__dict__.keys())
17
-
18
- @property
19
- def used_memory(self):
20
- return asizeof.asizeof(self)
21
-
22
- def create_queue(self, queue_name: str):
23
- self.__setattr__(queue_name, Queue())
24
-
25
- def get_queue(self, queue_name: str):
26
- return self.__getattribute__(queue_name)
27
-
28
- def deal_item(self, item):
29
- icn = item.__class__.__name__
30
- if icn == "Seed":
31
- self.seed_queue.push(item)
32
- elif getattr(item, "table_name", None):
33
- queue_name = struct_queue_name(icn, item.table_name)
34
- getattr(self, queue_name).push(item.serialization)
35
-
36
- def distribute(self, callback, *args, **kwargs):
37
- iterable = callback(*args, **kwargs)
38
- if not iterable:
39
- return None
40
- for result in iterable:
41
- self.deal_item(result)
42
- return True
43
-
44
-
45
- class Scheduler:
46
-
47
- def schedule_task(self, distribute):
48
-
49
- if not issubclass(self.__class__, SchedulerInterface):
50
- return None
51
-
52
- if not getattr(self, "schedule", None):
53
- raise Exception("not have schedule function!")
54
-
55
- while not self.stop:
56
-
57
- if self.queue.length < self.length:
58
- distribute(self.schedule)
59
-
60
- else:
61
- print("------------")
62
- time.sleep(15)
63
-
64
-
65
- class Spider:
66
-
67
- def __init__(self, queue):
68
- self.queue = queue
69
- self.spider_in_progress = Queue()
70
-
71
- def spider_task(self, stop_event, distribute, func, item):
72
- while not stop_event.is_set():
73
- seed = self.queue.pop()
74
- if not seed:
75
- time.sleep(3)
76
- continue
77
- try:
78
- self.spider_in_progress.push(1)
79
- distribute(func, item, seed)
80
- except Exception as e:
81
- print(e)
82
- finally:
83
- self.spider_in_progress.pop()
84
-
85
-
86
- class Storer:
87
-
88
- def store_task(self, stop_event, last_event, distribute):
89
-
90
- if not issubclass(self.__class__, StorerInterface):
91
- return None
92
-
93
- if not getattr(self, "store", None):
94
- raise Exception("not have store function!")
95
-
96
- while not stop_event.is_set():
97
- if last_event.is_set() or self.queue.length > self.length:
98
- data_list = []
99
- data_length = min(self.queue.length, self.length)
100
- for _ in range(data_length):
101
- data = self.queue.pop()
102
- data_list.append(data)
103
- if data_list:
104
- distribute(self.store, data_list)
@@ -1,153 +0,0 @@
1
- import time
2
- import threading
3
-
4
- from single.nest import Seed, DBItem
5
- from single.nest import struct_queue_name, restore_table_name
6
- from single.nest import Distributor, Scheduler, Spider, Storer
7
-
8
-
9
- def init_task_seed(seeds):
10
- if not seeds:
11
- return None
12
- if isinstance(seeds, list) or isinstance(seeds, tuple):
13
- for seed in seeds:
14
- yield Seed(seed)
15
- elif isinstance(seeds, str) or isinstance(seeds, dict):
16
- yield Seed(seeds)
17
-
18
-
19
- def parse_storer_info(storer_info):
20
- storer_data = {}
21
- storer_info_list = []
22
- if storer_info.__class__.__name__ == 'StorerInfo':
23
- storer_info_list.append(storer_info)
24
- elif isinstance(storer_info, tuple) or isinstance(storer_info, list):
25
- storer_info_list = storer_info
26
- for info in storer_info_list:
27
- db_name = info.DB.__name__
28
- storer_data.setdefault(db_name, {"StorerDB": info.DB, "db_args_list": []})
29
- storer_data[db_name]["db_args_list"].append(info[1:])
30
- return storer_data
31
-
32
-
33
- def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
34
- while True:
35
- time.sleep(3)
36
- if (
37
- scheduler.stop and
38
- not distributor.seed_queue.length and
39
- not spider.spider_in_progress.length
40
- ):
41
- last_event.set()
42
- time.sleep(10)
43
- storer_queue_empty = True
44
- for storer in storer_list:
45
- if storer.queue.length:
46
- storer_queue_empty = False
47
- break
48
- if storer_queue_empty:
49
- break
50
- last_event.clear()
51
- stop_event.set()
52
-
53
-
54
- def cobweb(task):
55
- """
56
- 任务启动装饰器
57
- :param task: 任务配置信息
58
- """
59
- def decorator(func):
60
- """
61
- func(Item, seed)
62
- Item:
63
- Item.Textfile()
64
- Item.Console()
65
- """
66
- # project task_name start_seed spider_num queue_length scheduler_info storer_info
67
-
68
- storer_list = []
69
-
70
- # 程序结束事件
71
- last_event = threading.Event()
72
- # 暂停采集事件
73
- stop_event = threading.Event()
74
-
75
- # 创建分发器
76
- distributor = Distributor()
77
-
78
- # 调度器动态继承
79
- SchedulerDB, table, sql, length, size = task.SchedulerInfo
80
- SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})
81
-
82
- # 初始化调度器
83
- scheduler = SchedulerTmp(table=table, sql=sql, length=length, size=size, queue=distributor.seed_queue)
84
-
85
- # 初始化采集器
86
- spider = Spider(queue=distributor.seed_queue)
87
-
88
- # 解析存储器信息
89
- storer_data = parse_storer_info(task.storer_info)
90
-
91
- # sds
92
- item = type("item", (object,), {})
93
- for db_name in storer_data.keys():
94
- # 存储器动态继承
95
- StorerDB = storer_data[db_name]["StorerDB"]
96
- StorerTmp = type('Storer', (Storer, StorerDB), {})
97
- db_args_list = storer_data[db_name]["db_args_list"]
98
- for storer_db_args in db_args_list:
99
- table, fields, length = storer_db_args
100
- if not getattr(item, db_name, None):
101
- instance = type(db_name, (DBItem,), {})
102
- setattr(item, db_name, instance)
103
- # 创建存储xxx
104
- getattr(item, db_name).init_item(table, fields)
105
- # 创建存储队列
106
- storer_queue = struct_queue_name(db_name, table)
107
- distributor.create_queue(queue_name=storer_queue)
108
- queue = distributor.get_queue(queue_name=storer_queue)
109
- # 初始话存储器
110
- table_name = restore_table_name(table_name=table)
111
- storer = StorerTmp(table=table_name, fields=fields, length=length, queue=queue)
112
- storer_list.append(storer)
113
-
114
- # 推送初始种子
115
- distributor.distribute(init_task_seed, seeds=task.start_seed)
116
-
117
- # 启动调度器
118
- threading.Thread(
119
- target=scheduler.schedule_task,
120
- args=(distributor.distribute,),
121
- name="single_scheduler_task"
122
- ).start()
123
-
124
- # 启动采集器
125
- for index in range(task.spider_num):
126
- threading.Thread(
127
- target=spider.spider_task,
128
- args=(stop_event, distributor.distribute, func, item),
129
- name=f"single_spider_task:{index}"
130
- ).start()
131
-
132
- # 启动存储器
133
- for storer in storer_list:
134
- threading.Thread(
135
- target=storer.store_task,
136
- args=(stop_event, last_event, distributor.distribute),
137
- name=f"single_store_task:{storer.table}",
138
- ).start()
139
-
140
- threading.Thread(
141
- target=check, name="check",
142
- args=(
143
- stop_event, last_event, distributor,
144
- scheduler, spider, storer_list
145
- )
146
- ).start()
147
-
148
- # return starter(task, func)
149
- return decorator
150
-
151
-
152
-
153
-
@@ -1,88 +0,0 @@
1
- import json
2
- import sys
3
- from typing import Iterable
4
-
5
- import requests
6
-
7
-
8
- # from cobweb import Seed
9
-
10
-
11
- def struct_table_name(table_name):
12
- return table_name.replace(".", "__p__").replace(":", "__c__")
13
-
14
-
15
- def restore_table_name(table_name):
16
- return table_name.replace("__p__", ".").replace("__c__", ":")
17
-
18
-
19
- def struct_queue_name(db_name, table_name):
20
- return sys.intern(f"__{db_name}_{table_name}_queue__")
21
-
22
-
23
- # class StorerDB:
24
- #
25
- # @staticmethod
26
- # def console(self):
27
- # from db.storer.console import Console
28
- # table = struct_table_name(table)
29
- # return StorerInfo(DB=Console, table=table, length=length, config=None)
30
- #
31
- # @staticmethod
32
- # def textfile(table, length=200):
33
- # from db.storer.textfile import Textfile
34
- # table = struct_table_name(table)
35
- # return StorerInfo(DB=Textfile, table=table, length=length, config=None)
36
- #
37
- # @staticmethod
38
- # def loghub(table, length=200, config=None):
39
- # from db.storer.loghub import Loghub
40
- # table = struct_table_name(table)
41
- # return StorerInfo(DB=Loghub, table=table, length=length, config=config)
42
-
43
-
44
- def parse_info(info):
45
- if not info:
46
- return info
47
-
48
- if isinstance(info, dict):
49
- return info
50
-
51
- if isinstance(info, str):
52
- return json.loads(info)
53
-
54
- if isinstance(info, Iterable):
55
- result = list()
56
- for ii in info:
57
- if isinstance(ii, str):
58
- result.append(json.loads(ii))
59
- elif isinstance(ii, dict):
60
- result.append(ii)
61
- else:
62
- raise TypeError("must be in [str, dict]")
63
-
64
- return result
65
-
66
-
67
- def struct_start_seeds(seeds):
68
- from .bbb import Seed
69
- if not seeds:
70
- return None
71
- if any(isinstance(seeds, t) for t in (list, tuple)):
72
- return [Seed(seed) for seed in seeds]
73
- elif any(isinstance(seeds, t) for t in (str, dict)):
74
- return Seed(seeds)
75
-
76
-
77
- # def get_storer_db(db):
78
- #
79
- # if isinstance(db, str):
80
- # model = import_module(f" db.storer.{db.lower()}")
81
- #
82
- # # if db.lower() in dir(StorerDB):
83
- # # return getattr(StorerDB, db)
84
- # # else:
85
- # # pass
86
-
87
-
88
-
File without changes