cobweb-launcher 0.1.8__tar.gz → 0.1.9__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (42) hide show
  1. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/PKG-INFO +5 -1
  2. cobweb-launcher-0.1.9/cobweb/__init__.py +7 -0
  3. cobweb-launcher-0.1.9/cobweb/constant.py +24 -0
  4. cobweb-launcher-0.1.9/cobweb/db/__init__.py +3 -0
  5. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/db/redis_db.py +22 -32
  6. cobweb-launcher-0.1.9/cobweb/db/scheduler/__init__.py +1 -0
  7. cobweb-launcher-0.1.9/cobweb/db/scheduler/default.py +8 -0
  8. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/db/scheduler/textfile.py +2 -2
  9. cobweb-launcher-0.1.9/cobweb/db/storer/__init__.py +1 -0
  10. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/db/storer/console.py +2 -2
  11. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/db/storer/loghub.py +2 -2
  12. cobweb-launcher-0.1.8/cobweb/db/storer/redis.py → cobweb-launcher-0.1.9/cobweb/db/storer/textfile.py +2 -2
  13. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/decorators.py +1 -1
  14. cobweb-launcher-0.1.9/cobweb/equip/__init__.py +8 -0
  15. {cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.9/cobweb/equip}/distributed/launcher.py +15 -39
  16. {cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.9/cobweb/equip}/distributed/models.py +34 -25
  17. {cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.9/cobweb/equip}/single/launcher.py +18 -49
  18. {cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.9/cobweb/equip}/single/models.py +35 -25
  19. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/task.py +10 -3
  20. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/PKG-INFO +5 -1
  21. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/SOURCES.txt +8 -8
  22. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/setup.py +1 -1
  23. cobweb-launcher-0.1.8/cobweb/__init__.py +0 -11
  24. cobweb-launcher-0.1.8/cobweb/db/__init__.py +0 -2
  25. cobweb-launcher-0.1.8/cobweb/db/scheduler/default.py +0 -8
  26. cobweb-launcher-0.1.8/cobweb/db/storer/textfile.py +0 -15
  27. cobweb-launcher-0.1.8/cobweb/distributed/__init__.py +0 -0
  28. cobweb-launcher-0.1.8/cobweb/setting.py +0 -13
  29. cobweb-launcher-0.1.8/cobweb/single/__init__.py +0 -0
  30. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/LICENSE +0 -0
  31. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/README.md +0 -0
  32. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/bbb.py +0 -0
  33. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/db/oss_db.py +0 -0
  34. {cobweb-launcher-0.1.8/cobweb/db/scheduler → cobweb-launcher-0.1.9/cobweb/equip/distributed}/__init__.py +0 -0
  35. {cobweb-launcher-0.1.8/cobweb/db/storer → cobweb-launcher-0.1.9/cobweb/equip/single}/__init__.py +0 -0
  36. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/interface.py +0 -0
  37. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/log.py +0 -0
  38. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb/utils.py +0 -0
  39. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  40. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/requires.txt +0 -0
  41. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/cobweb_launcher.egg-info/top_level.txt +0 -0
  42. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.9}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -11,6 +11,10 @@ Classifier: Programming Language :: Python :: 3
11
11
  Requires-Python: >=3.7
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
+ Requires-Dist: requests>=2.19.1
15
+ Requires-Dist: oss2>=2.18.1
16
+ Requires-Dist: redis>=4.4.4
17
+ Requires-Dist: aliyun-log-python-sdk
14
18
 
15
19
  # cobweb
16
20
 
@@ -0,0 +1,7 @@
1
+ from .bbb import Seed, Queue, DBItem
2
+ from .task import Task
3
+ from .log import log
4
+ from .db.redis_db import RedisDB
5
+
6
+ from .equip.distributed.launcher import launcher
7
+ from .equip.single.launcher import launcher as single_launcher
@@ -0,0 +1,24 @@
1
+
2
+
3
+ class LauncherModel:
4
+ task = "launcher model: task"
5
+ resident = "launcher model: resident"
6
+
7
+
8
+ class LogModel:
9
+ simple = "log model: simple"
10
+ common = "log model: common"
11
+ detailed = "log model: detailed"
12
+
13
+
14
+ class DealModel:
15
+ failure = "deal model: failure"
16
+ success = "deal model: success"
17
+ polling = "deal model: polling"
18
+
19
+
20
+ class Setting:
21
+ RESET_SCORE = None
22
+ CHECK_LOCK_TIME = None
23
+ DEAL_MODEL = None
24
+ LAUNCHER_MODEL = None
@@ -0,0 +1,3 @@
1
+ from .. import log, Seed, decorators
2
+ from ..constant import Setting, DealModel
3
+ from ..interface import SchedulerInterface, StorerInterface
@@ -1,7 +1,8 @@
1
1
  import time
2
2
  import redis
3
- from cobweb import Seed, log
4
- from cobweb.decorators import check_redis_status
3
+ from . import log, decorators, Seed, Setting, DealModel
4
+ # from cobweb.decorators import decorators.check_redis_status
5
+ # from cobweb.constant import Setting, DealModel
5
6
 
6
7
 
7
8
  class RedisDB:
@@ -11,9 +12,6 @@ class RedisDB:
11
12
  project: str,
12
13
  task_name: str,
13
14
  config: dict,
14
- model: int,
15
- cs_lct: int,
16
- rs_time: int,
17
15
  ):
18
16
  pool = redis.ConnectionPool(**config)
19
17
  self.heartbeat_key = f"{project}:{task_name}:heartbeat" # redis type string
@@ -25,11 +23,8 @@ class RedisDB:
25
23
  self.check_lock = f"{project}:{task_name}:check_seed_lock" # redis type string
26
24
  self.scheduler_lock = f"{project}:{task_name}:scheduler_lock" # redis type string
27
25
  self.client = redis.Redis(connection_pool=pool)
28
- self.model = model
29
- self.cs_lct = cs_lct
30
- self.rs_time = rs_time
31
26
 
32
- @check_redis_status
27
+ @decorators.check_redis_status
33
28
  def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
34
29
  begin_time = int(time.time())
35
30
  while True:
@@ -55,7 +50,7 @@ class RedisDB:
55
50
  log.info("ttl: " + str(ttl))
56
51
  return False
57
52
 
58
- @check_redis_status
53
+ @decorators.check_redis_status
59
54
  def _deal_seed(self, seeds, is_add: bool):
60
55
  if not seeds:
61
56
  return None
@@ -73,15 +68,15 @@ class RedisDB:
73
68
  if item_info:
74
69
  self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)
75
70
 
76
- @check_redis_status
71
+ @decorators.check_redis_status
77
72
  def add_seed(self, seeds):
78
73
  self._deal_seed(seeds, is_add=True)
79
74
 
80
- @check_redis_status
75
+ @decorators.check_redis_status
81
76
  def reset_seed(self, seeds):
82
77
  self._deal_seed(seeds, is_add=False)
83
78
 
84
- @check_redis_status
79
+ @decorators.check_redis_status
85
80
  def del_seed(self, seeds, spider_status: bool = True):
86
81
  if not seeds:
87
82
  return None
@@ -92,18 +87,16 @@ class RedisDB:
92
87
  seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]
93
88
 
94
89
  if seeds:
95
- # redis_key = self.succeed_key if spider_status else self.failed_key
96
90
  redis_key = None
97
- if spider_status:
98
- if isinstance(self.model, int) and self.model == 2:
99
- redis_key = self.succeed_key
100
- else:
91
+ if spider_status and Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
92
+ redis_key = self.succeed_key
93
+ elif not spider_status:
101
94
  redis_key = self.failed_key
102
95
  if redis_key:
103
96
  self.client.sadd(redis_key, *(str(seed) for seed in seeds))
104
97
  self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))
105
98
 
106
- @check_redis_status
99
+ @decorators.check_redis_status
107
100
  def set_storer(self, key, seeds):
108
101
  if not seeds:
109
102
  return None
@@ -122,7 +115,7 @@ class RedisDB:
122
115
  self.client.zadd(self.storer_key % key, mapping=item_info)
123
116
  log.info(f"zadd storer key: length {len(item_info.keys())}")
124
117
 
125
- @check_redis_status
118
+ @decorators.check_redis_status
126
119
  def get_seed(self, length: int = 200):
127
120
  cs = time.time()
128
121
 
@@ -148,14 +141,14 @@ class RedisDB:
148
141
  log.info("push seeds into queue time: " + str(time.time() - cs))
149
142
  return result
150
143
 
151
- @check_redis_status
144
+ @decorators.check_redis_status
152
145
  def check_spider_queue(self, stop, storer_num):
153
146
  while not stop.is_set():
154
147
  # 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
155
- if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
148
+ if self._get_lock(key=self.check_lock, t=Setting.CHECK_LOCK_TIME, timeout=600, sleep_time=3):
156
149
  heartbeat = True if self.client.exists(self.heartbeat_key) else False
157
150
  # 重启重制score值,否则获取${rs_time}分钟前的分数值
158
- score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
151
+ score = -int(time.time()) + Setting.RESET_SCORE if heartbeat else "-inf"
159
152
 
160
153
  keys = self.client.keys(self.storer_key % "*")
161
154
 
@@ -170,7 +163,7 @@ class RedisDB:
170
163
  break
171
164
  for key in keys:
172
165
  self.client.zrem(key, *members)
173
- if self.model == 2:
166
+ if Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
174
167
  self.client.sadd(self.succeed_key, *members)
175
168
  self.client.zrem(self.spider_key, *members)
176
169
  self.client.zrem(intersection_key, *members)
@@ -193,31 +186,28 @@ class RedisDB:
193
186
  if not heartbeat:
194
187
  self.client.setex(self.heartbeat_key, 15, "")
195
188
 
196
- # self.client.delete(self.check_lock)
197
- # time.sleep(3)
198
-
199
- @check_redis_status
189
+ @decorators.check_redis_status
200
190
  def set_heartbeat(self, stop):
201
191
  time.sleep(5)
202
192
  while not stop.is_set():
203
193
  self.client.setex(self.heartbeat_key, 5, "")
204
194
  time.sleep(3)
205
195
 
206
- # @check_redis_status
196
+ # @decorators.check_redis_status
207
197
  # def heartbeat(self):
208
198
  # """
209
199
  # 返回心跳key剩余存活时间
210
200
  # """
211
201
  # return self.client.ttl(self.heartbeat_key)
212
202
 
213
- @check_redis_status
203
+ @decorators.check_redis_status
214
204
  def spider_queue_length(self):
215
205
  return self.client.zcard(self.spider_key)
216
206
 
217
- @check_redis_status
207
+ @decorators.check_redis_status
218
208
  def ready_seed_length(self):
219
209
  return self.client.zcount(self.spider_key, min=0, max="+inf")
220
210
 
221
- @check_redis_status
211
+ @decorators.check_redis_status
222
212
  def get_scheduler_lock(self):
223
213
  return self._get_lock(self.scheduler_lock)
@@ -0,0 +1 @@
1
+ from .. import log, Seed, SchedulerInterface as Inf
@@ -0,0 +1,8 @@
1
+ from . import Inf
2
+
3
+
4
+ class Default(Inf):
5
+
6
+ def schedule(self):
7
+ pass
8
+
@@ -1,7 +1,7 @@
1
- from cobweb import log, Seed, SchedulerInterface
1
+ from . import Inf, log, Seed
2
2
 
3
3
 
4
- class Textfile(SchedulerInterface):
4
+ class Textfile(Inf):
5
5
 
6
6
  index = None
7
7
 
@@ -0,0 +1 @@
1
+ from .. import log, Seed, StorerInterface as Inf
@@ -1,7 +1,7 @@
1
- from cobweb import log, StorerInterface
1
+ from . import Inf, log
2
2
 
3
3
 
4
- class Console(StorerInterface):
4
+ class Console(Inf):
5
5
 
6
6
  def store(self, data_list):
7
7
  for item in data_list:
@@ -1,9 +1,9 @@
1
1
  import json
2
+ from . import Inf, log
2
3
  from aliyun.log import LogClient, LogItem, PutLogsRequest
3
- from cobweb import log, StorerInterface
4
4
 
5
5
 
6
- class Loghub(StorerInterface):
6
+ class Loghub(Inf):
7
7
 
8
8
  def __init__(self, **kwargs):
9
9
  super().__init__(**kwargs)
@@ -1,7 +1,7 @@
1
- from cobweb import log, StorerInterface
1
+ from . import Inf, log
2
2
 
3
3
 
4
- class Redis(StorerInterface):
4
+ class Textfile(Inf):
5
5
 
6
6
  def store(self, data_list):
7
7
  try:
@@ -1,5 +1,5 @@
1
1
  from functools import wraps
2
- from cobweb import log
2
+ from .log import log
3
3
 
4
4
 
5
5
  def check_redis_status(func):
@@ -0,0 +1,8 @@
1
+ from .. import Queue, DBItem, RedisDB, Seed, log
2
+ from ..constant import Setting, DealModel
3
+ from utils import (
4
+ struct_queue_name as sqn,
5
+ restore_table_name as rtn,
6
+ parse_import_model as pim,
7
+ issubclass_cobweb_inf as ici
8
+ )
@@ -1,15 +1,9 @@
1
1
  import time
2
2
  import threading
3
- from threading import Thread
4
3
 
4
+ from .. import log, sqn, rtn, pim
5
+ from .. import Queue, DBItem, RedisDB, Setting
5
6
  from .models import Scheduler, Spider, Storer
6
- from cobweb import log, Queue, DBItem, RedisDB
7
- from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
8
- from cobweb.utils import (
9
- struct_queue_name as sqn,
10
- restore_table_name as rtn,
11
- parse_import_model as pim,
12
- )
13
7
 
14
8
 
15
9
  def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
@@ -37,11 +31,10 @@ def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_
37
31
  )
38
32
  if (
39
33
  scheduler.stop and
40
- # not redis_ready_seed_length and
41
34
  not memory_seed_queue_length and
42
35
  not running_spider_thread_num
43
36
  ):
44
- if not MODEL:
37
+ if not Setting.LAUNCHER_MODEL:
45
38
  log.info("spider is done?")
46
39
  last.set()
47
40
  time.sleep(3)
@@ -58,7 +51,7 @@ def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_
58
51
  not redis_ready_seed_length and
59
52
  not redis_spider_seed_length
60
53
  ):
61
- if MODEL:
54
+ if Setting.LAUNCHER_MODEL:
62
55
  log.info("waiting for push seeds...")
63
56
  status = "waiting"
64
57
  time.sleep(30)
@@ -89,20 +82,6 @@ def launcher(task):
89
82
  :param task: 任务配置信息
90
83
  """
91
84
  def decorator(func):
92
- """
93
- Item:
94
- Textfile()
95
- Loghub()
96
- Console()
97
- e.g.
98
- task.fields = "a,b"
99
- func(item, seed)
100
- a = "a"
101
- b = "b"
102
- data = {"a": "a", "b": "b"}
103
- yield item.Loghub(**data)
104
- yield item.Loghub(a=a, b=b)
105
- """
106
85
  storer_list = []
107
86
 
108
87
  # 程序结束事件
@@ -111,10 +90,7 @@ def launcher(task):
111
90
  stop = threading.Event()
112
91
 
113
92
  # 初始化redis信息
114
- redis_db = RedisDB(
115
- task.project, task.task_name, task.redis_info,
116
- model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
117
- )
93
+ redis_db = RedisDB(task.project, task.task_name, task.redis_info)
118
94
 
119
95
  log.info("初始化cobweb!")
120
96
 
@@ -139,9 +115,6 @@ def launcher(task):
139
115
  length=task.scheduler_queue_length, config=scheduler_config
140
116
  )
141
117
 
142
- # 初始化采集器
143
- spider = Spider(seed_queue, task.max_retries)
144
-
145
118
  # 解析存储器信息
146
119
  storer_info_list = task.storer_info or []
147
120
  if not isinstance(storer_info_list, list):
@@ -178,14 +151,17 @@ def launcher(task):
178
151
  )
179
152
  storer_list.append(storer)
180
153
 
181
- Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
182
- Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
154
+ # 初始化采集器
155
+ spider = Spider(seed_queue, storer_list and True, task.max_retries)
156
+
157
+ threading.Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
158
+ threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
183
159
 
184
160
  # 推送初始种子
185
161
  # seeds = start_seeds(task.start_seed)
186
162
  redis_db.add_seed(task.seeds)
187
163
  # 启动调度器, 调度至redis队列
188
- Thread(
164
+ threading.Thread(
189
165
  # name="xxxx_schedule_seeds",
190
166
  target=scheduler.schedule_seed,
191
167
  args=(
@@ -196,7 +172,7 @@ def launcher(task):
196
172
  ).start()
197
173
 
198
174
  # 启动调度器, 调度任务队列
199
- Thread(
175
+ threading.Thread(
200
176
  # name="xxxx_schedule_task",
201
177
  target=scheduler.schedule_task,
202
178
  args=(
@@ -207,7 +183,7 @@ def launcher(task):
207
183
 
208
184
  # 启动采集器
209
185
  for index in range(task.spider_num):
210
- Thread(
186
+ threading.Thread(
211
187
  # name=f"xxxx_spider_task:{index}",
212
188
  target=spider.spider_task,
213
189
  args=(
@@ -218,7 +194,7 @@ def launcher(task):
218
194
 
219
195
  # 启动存储器
220
196
  for storer in storer_list:
221
- Thread(
197
+ threading.Thread(
222
198
  # name=f"xxxx_store_task:{storer.table}",
223
199
  target=storer.store_task,
224
200
  args=(
@@ -228,7 +204,7 @@ def launcher(task):
228
204
  )
229
205
  ).start()
230
206
 
231
- Thread(
207
+ threading.Thread(
232
208
  # name="check_spider",
233
209
  target=check,
234
210
  args=(
@@ -1,8 +1,9 @@
1
1
  import time
2
2
  from hashlib import md5
3
- from cobweb import log, Queue, Seed
4
- from cobweb.utils import issubclass_cobweb_inf
3
+ from inspect import isgenerator
5
4
 
5
+ from .. import log, ici
6
+ from .. import DealModel, Queue, Seed
6
7
  # from pympler import asizeof
7
8
 
8
9
 
@@ -11,7 +12,7 @@ class Scheduler:
11
12
  def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
12
13
 
13
14
  inf_name = "SchedulerInterface"
14
- if not issubclass_cobweb_inf(self.__class__, inf_name):
15
+ if not ici(self.__class__, inf_name):
15
16
  raise Exception("not have schedule function!")
16
17
 
17
18
  if self.__class__.__name__ == "Default":
@@ -48,9 +49,10 @@ class Scheduler:
48
49
 
49
50
  class Spider:
50
51
 
51
- def __init__(self, queue, max_retries=5):
52
+ def __init__(self, queue, storage, max_retries=5):
52
53
  self.spider_in_progress = Queue()
53
54
  self.max_retries = max_retries
55
+ self.storage = storage
54
56
  self.queue = queue
55
57
 
56
58
  def spider_task(self, stop, func, item, del_seed):
@@ -65,33 +67,42 @@ class Spider:
65
67
  try:
66
68
  self.spider_in_progress.push(1, direct_insertion=True)
67
69
  # log.info("spider seed: " + str(seed))
68
- ret_count = 0
69
- status = None
70
+
70
71
  store_queue = None
71
72
  store_data = list()
72
- for it in func(item, seed):
73
- ret_count += 1
73
+
74
+ iterators = func(item, seed)
75
+
76
+ if not isgenerator(iterators):
77
+ if not self.storage:
78
+ del_seed(seed, spider_status=True)
79
+ continue
80
+ raise TypeError(f"{func.__name__} isn't a generator")
81
+
82
+ for it in iterators:
74
83
  if getattr(it, "table_name", None):
75
84
  if not store_queue:
76
85
  store_queue = it.queue()
77
86
  store_data.append(it.struct_data)
78
87
  elif isinstance(it, Seed):
79
88
  self.queue.push(it)
80
- elif any(isinstance(it, t) for t in (list, tuple)):
81
- self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
82
- elif isinstance(it, bool):
83
- status = it
89
+
90
+ elif isinstance(it, str) and it == DealModel.polling:
91
+ self.queue.push(seed)
92
+ break
93
+ elif isinstance(it, str) and it == DealModel.success:
94
+ del_seed(seed, spider_status=True)
95
+ break
96
+ elif isinstance(it, str) and it == DealModel.failure:
97
+ del_seed(seed, spider_status=False)
98
+ break
99
+ else:
100
+ raise TypeError("yield value type error!")
84
101
 
85
102
  if store_queue and store_data:
86
103
  store_data.append(seed)
87
104
  store_queue.push(store_data)
88
105
 
89
- if status:
90
- del_seed(seed, spider_status=True)
91
- elif not ret_count or status is False:
92
- seed._retry += 1
93
- self.queue.push(seed)
94
-
95
106
  except Exception as e:
96
107
  seed._retry += 1
97
108
  self.queue.push(seed)
@@ -106,7 +117,7 @@ class Storer:
106
117
  def store_task(self, stop, last, reset_seed, set_storer):
107
118
 
108
119
  inf_name = "StorerInterface"
109
- if not issubclass_cobweb_inf(self.__class__, inf_name):
120
+ if not ici(self.__class__, inf_name):
110
121
  return None
111
122
 
112
123
  if not getattr(self, "store", None):
@@ -131,12 +142,10 @@ class Storer:
131
142
  continue
132
143
  data_list.append(data)
133
144
 
134
- if data_list:
135
- if self.store(data_list):
136
- set_storer(store_key_id, seeds)
137
- else:
138
- reset_seed(seeds)
139
- continue
145
+ if self.store(data_list):
146
+ set_storer(store_key_id, seeds)
147
+ else:
148
+ reset_seed(seeds)
140
149
 
141
150
  time.sleep(3)
142
151
 
@@ -1,15 +1,9 @@
1
1
  import time
2
2
  import threading
3
- from threading import Thread
4
3
 
4
+ from .. import log, sqn, rtn, pim
5
+ from .. import Queue, DBItem, RedisDB, Setting
5
6
  from .models import Scheduler, Spider, Storer
6
- from cobweb import log, Queue, DBItem, RedisDB
7
- from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
8
- from cobweb.utils import (
9
- struct_queue_name as sqn,
10
- restore_table_name as rtn,
11
- parse_import_model as pim,
12
- )
13
7
 
14
8
 
15
9
  def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
@@ -29,27 +23,26 @@ def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue
29
23
  redis_ready_seed_length = ready_seed_length()
30
24
  redis_spider_seed_length = spider_queue_length()
31
25
  memory_seed_queue_length = scheduler.queue.length
32
- storer_upload_queue_length = storer.queue.length
26
+ storer_upload_queue_length = storer.queue.length if storer else None
33
27
  if (
34
28
  scheduler.stop and
35
- # not redis_ready_seed_length and
36
29
  not memory_seed_queue_length and
37
30
  not running_spider_thread_num
38
31
  ):
39
- if not MODEL:
32
+ if not Setting.LAUNCHER_MODEL:
40
33
  log.info("spider is done?")
41
34
  last.set()
42
35
  time.sleep(3)
43
36
  storer_queue_empty = True
44
- if storer.queue.length:
37
+ if storer and storer.queue.length:
45
38
  storer_queue_empty = False
46
- storer_upload_queue_length = storer.queue.length
39
+ storer_upload_queue_length = storer.queue.length if storer else None
47
40
  if (
48
41
  storer_queue_empty and
49
42
  not redis_ready_seed_length and
50
43
  not redis_spider_seed_length
51
44
  ):
52
- if MODEL:
45
+ if Setting.LAUNCHER_MODEL:
53
46
  log.info("waiting for push seeds...")
54
47
  status = "waiting"
55
48
  time.sleep(30)
@@ -78,32 +71,13 @@ def launcher(task):
78
71
  :param task: 任务配置信息
79
72
  """
80
73
  def decorator(func):
81
- """
82
- Item:
83
- Textfile()
84
- Loghub()
85
- Console()
86
- e.g.
87
- task.fields = "a,b"
88
- func(item, seed)
89
- a = "a"
90
- b = "b"
91
- data = {"a": "a", "b": "b"}
92
- yield item.Loghub(**data)
93
- yield item.Loghub(a=a, b=b)
94
- """
95
- storer_list = []
96
-
97
74
  # 程序结束事件
98
75
  last = threading.Event()
99
76
  # 停止采集事件
100
77
  stop = threading.Event()
101
78
 
102
79
  # 初始化redis信息
103
- redis_db = RedisDB(
104
- task.project, task.task_name, task.redis_info,
105
- model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
106
- )
80
+ redis_db = RedisDB(task.project, task.task_name, task.redis_info)
107
81
 
108
82
  # new item
109
83
  item = type("Item", (object,), {"redis_client": redis_db.client})()
@@ -113,7 +87,6 @@ def launcher(task):
113
87
  seed_queue = Queue()
114
88
 
115
89
  scheduler_info = task.scheduler_info or dict()
116
-
117
90
  # 调度器动态继承
118
91
  sql = scheduler_info.get("sql")
119
92
  table = scheduler_info.get("table")
@@ -123,22 +96,15 @@ def launcher(task):
123
96
  DB, class_name = pim(scheduler_db, "scheduler")
124
97
  # SchedulerDB, table, sql, length, size, config = task.scheduler_info
125
98
  SchedulerTmp = type(class_name, (Scheduler, DB), {})
126
-
127
99
  # 初始化调度器
128
100
  scheduler = SchedulerTmp(
129
101
  table=table, sql=sql, size=size, queue=seed_queue,
130
102
  length=task.scheduler_queue_length, config=scheduler_config
131
103
  )
132
104
 
133
- # 初始化采集器
134
- spider = Spider(seed_queue, task.max_retries)
135
-
136
105
  storer = None
137
-
138
- # 解析存储器信息
139
106
  storer_info = task.storer_info or dict()
140
107
 
141
- # for storer_info in storer_info_list:
142
108
  if storer_info:
143
109
  storer_db = storer_info["db"]
144
110
  fields = storer_info["fields"]
@@ -166,14 +132,17 @@ def launcher(task):
166
132
  queue=queue, config=storer_config
167
133
  )
168
134
 
169
- Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
170
- Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
135
+ # 初始化采集器
136
+ spider = Spider(seed_queue, storer and True, task.max_retries)
137
+
138
+ threading.Thread(target=redis_db.check_spider_queue, args=(stop, 0)).start()
139
+ threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
171
140
 
172
141
  # 推送初始种子
173
142
  # seeds = start_seeds(task.start_seed)
174
143
  redis_db.add_seed(task.seeds)
175
144
  # 启动调度器, 调度至redis队列
176
- Thread(
145
+ threading.Thread(
177
146
  # name="xxxx_schedule_seeds",
178
147
  target=scheduler.schedule_seed,
179
148
  args=(
@@ -184,7 +153,7 @@ def launcher(task):
184
153
  ).start()
185
154
 
186
155
  # 启动调度器, 调度任务队列
187
- Thread(
156
+ threading.Thread(
188
157
  # name="xxxx_schedule_task",
189
158
  target=scheduler.schedule_task,
190
159
  args=(
@@ -195,7 +164,7 @@ def launcher(task):
195
164
 
196
165
  # 启动采集器
197
166
  for index in range(task.spider_num):
198
- Thread(
167
+ threading.Thread(
199
168
  # name=f"xxxx_spider_task:{index}",
200
169
  target=spider.spider_task,
201
170
  args=(
@@ -206,7 +175,7 @@ def launcher(task):
206
175
 
207
176
  # 启动存储器
208
177
  if storer:
209
- Thread(
178
+ threading.Thread(
210
179
  # name=f"xxxx_store_task:{storer.table}",
211
180
  target=storer.store_task,
212
181
  args=(
@@ -216,7 +185,7 @@ def launcher(task):
216
185
  )
217
186
  ).start()
218
187
 
219
- Thread(
188
+ threading.Thread(
220
189
  # name="check_spider",
221
190
  target=check,
222
191
  args=(
@@ -1,7 +1,8 @@
1
1
  import time
2
- from cobweb import log, Queue, Seed
3
- from cobweb.utils import issubclass_cobweb_inf
2
+ from inspect import isgenerator
4
3
  # from pympler import asizeof
4
+ from .. import log, ici
5
+ from .. import DealModel, Queue, Seed
5
6
 
6
7
 
7
8
  class Scheduler:
@@ -9,7 +10,7 @@ class Scheduler:
9
10
  def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
10
11
 
11
12
  inf_name = "SchedulerInterface"
12
- if not issubclass_cobweb_inf(self.__class__, inf_name):
13
+ if not ici(self.__class__, inf_name):
13
14
  raise Exception("not have schedule function!")
14
15
 
15
16
  if self.__class__.__name__ == "Default":
@@ -46,27 +47,37 @@ class Scheduler:
46
47
 
47
48
  class Spider:
48
49
 
49
- def __init__(self, queue, max_retries=5):
50
+ def __init__(self, queue, storage, max_retries=5):
50
51
  self.spider_in_progress = Queue()
51
52
  self.max_retries = max_retries
53
+ self.storage = storage
52
54
  self.queue = queue
53
55
 
54
56
  def spider_task(self, stop, func, item, del_seed):
55
57
  while not stop.is_set():
58
+
56
59
  seed = self.queue.pop()
60
+
57
61
  if not seed:
58
62
  time.sleep(3)
59
63
  continue
64
+
60
65
  elif seed._retry >= self.max_retries:
61
66
  del_seed(seed, spider_status=False)
62
67
  continue
68
+
63
69
  try:
64
70
  self.spider_in_progress.push(1, direct_insertion=True)
65
71
  # log.info("spider seed: " + str(seed))
66
- ret_count = 0
67
- status = None
68
- for it in func(item, seed):
69
- ret_count += 1
72
+ iterators = func(item, seed)
73
+
74
+ if not isgenerator(iterators):
75
+ if not self.storage:
76
+ del_seed(seed, spider_status=True)
77
+ continue
78
+ raise TypeError(f"{func.__name__} isn't a generator")
79
+
80
+ for it in iterators:
70
81
  if getattr(it, "table_name", None):
71
82
  store_queue = it.queue()
72
83
  store_queue.push(
@@ -75,16 +86,18 @@ class Spider:
75
86
  )
76
87
  elif isinstance(it, Seed):
77
88
  self.queue.push(it)
78
- elif any(isinstance(it, t) for t in (list, tuple)):
79
- self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
80
- elif isinstance(it, bool):
81
- status = it
82
89
 
83
- if status:
84
- del_seed(seed, spider_status=True)
85
- elif not ret_count or status is False:
86
- seed._retry += 1
87
- self.queue.push(seed)
90
+ elif isinstance(it, str) and it == DealModel.polling:
91
+ self.queue.push(seed)
92
+ break
93
+ elif isinstance(it, str) and it == DealModel.success:
94
+ del_seed(seed, spider_status=True)
95
+ break
96
+ elif isinstance(it, str) and it == DealModel.failure:
97
+ del_seed(seed, spider_status=False)
98
+ break
99
+ else:
100
+ raise TypeError("yield value type error!")
88
101
 
89
102
  except Exception as e:
90
103
  seed._retry += 1
@@ -100,7 +113,7 @@ class Storer:
100
113
  def store_task(self, stop, last, reset_seed, del_seed):
101
114
 
102
115
  inf_name = "StorerInterface"
103
- if not issubclass_cobweb_inf(self.__class__, inf_name):
116
+ if not ici(self.__class__, inf_name):
104
117
  return None
105
118
 
106
119
  if not getattr(self, "store", None):
@@ -121,13 +134,10 @@ class Storer:
121
134
  seeds.append(seed)
122
135
  data_list.append(data)
123
136
 
124
- if data_list:
125
- if self.store(data_list):
126
- del_seed(seeds)
127
- else:
128
- reset_seed(seeds)
129
- log.info("reset seeds!")
130
- continue
137
+ if self.store(data_list):
138
+ del_seed(seeds)
139
+ else:
140
+ reset_seed(seeds)
131
141
 
132
142
  time.sleep(3)
133
143
 
@@ -1,11 +1,19 @@
1
+ import os
2
+ from .constant import *
1
3
  from .utils import parse_info, struct_start_seeds
2
4
 
3
5
 
6
+ def init_task_env():
7
+ Setting.RESET_SCORE = int(os.getenv("RESET_SCORE", 600))
8
+ Setting.CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
9
+ Setting.DEAL_MODEL = os.getenv("DEAL_MODEL", DealModel.failure)
10
+ Setting.LAUNCHER_MODEL = os.getenv("LAUNCHER_MODEL", LauncherModel.task)
11
+
12
+
4
13
  class Task:
5
14
 
6
15
  def __init__(
7
16
  self,
8
- # model=None,
9
17
  seeds=None,
10
18
  project=None,
11
19
  task_name=None,
@@ -31,8 +39,7 @@ class Task:
31
39
  :param storer_queue_length:
32
40
  :param scheduler_queue_length:
33
41
  """
34
- # self.model = model
35
-
42
+ init_task_env()
36
43
  self.seeds = struct_start_seeds(seeds)
37
44
  self.project = project or "test"
38
45
  self.task_name = task_name or "spider"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 0.1.8
3
+ Version: 0.1.9
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -11,6 +11,10 @@ Classifier: Programming Language :: Python :: 3
11
11
  Requires-Python: >=3.7
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
+ Requires-Dist: requests>=2.19.1
15
+ Requires-Dist: oss2>=2.18.1
16
+ Requires-Dist: redis>=4.4.4
17
+ Requires-Dist: aliyun-log-python-sdk
14
18
 
15
19
  # cobweb
16
20
 
@@ -3,10 +3,10 @@ README.md
3
3
  setup.py
4
4
  cobweb/__init__.py
5
5
  cobweb/bbb.py
6
+ cobweb/constant.py
6
7
  cobweb/decorators.py
7
8
  cobweb/interface.py
8
9
  cobweb/log.py
9
- cobweb/setting.py
10
10
  cobweb/task.py
11
11
  cobweb/utils.py
12
12
  cobweb/db/__init__.py
@@ -18,14 +18,14 @@ cobweb/db/scheduler/textfile.py
18
18
  cobweb/db/storer/__init__.py
19
19
  cobweb/db/storer/console.py
20
20
  cobweb/db/storer/loghub.py
21
- cobweb/db/storer/redis.py
22
21
  cobweb/db/storer/textfile.py
23
- cobweb/distributed/__init__.py
24
- cobweb/distributed/launcher.py
25
- cobweb/distributed/models.py
26
- cobweb/single/__init__.py
27
- cobweb/single/launcher.py
28
- cobweb/single/models.py
22
+ cobweb/equip/__init__.py
23
+ cobweb/equip/distributed/__init__.py
24
+ cobweb/equip/distributed/launcher.py
25
+ cobweb/equip/distributed/models.py
26
+ cobweb/equip/single/__init__.py
27
+ cobweb/equip/single/launcher.py
28
+ cobweb/equip/single/models.py
29
29
  cobweb_launcher.egg-info/PKG-INFO
30
30
  cobweb_launcher.egg-info/SOURCES.txt
31
31
  cobweb_launcher.egg-info/dependency_links.txt
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="cobweb-launcher",
8
- version="0.1.8",
8
+ version="0.1.9",
9
9
  packages=find_packages(),
10
10
  url="https://github.com/Juannie-PP/cobweb",
11
11
  license="MIT",
@@ -1,11 +0,0 @@
1
- from .bbb import Seed, Queue, DBItem
2
- from .task import Task
3
- from .log import log
4
- from .interface import SchedulerInterface, StorerInterface
5
- from .db.redis_db import RedisDB
6
- from .db.oss_db import OssDB
7
- from .distributed.launcher import launcher
8
- from .single.launcher import launcher as single_launcher
9
- from . import setting
10
-
11
-
@@ -1,2 +0,0 @@
1
- from . import oss_db, redis_db
2
- from . import scheduler, storer
@@ -1,8 +0,0 @@
1
- from cobweb import SchedulerInterface
2
-
3
-
4
- class Default(SchedulerInterface):
5
-
6
- def schedule(self):
7
- pass
8
-
@@ -1,15 +0,0 @@
1
- from cobweb import log, StorerInterface
2
-
3
-
4
- class Textfile(StorerInterface):
5
-
6
- def store(self, data_list):
7
- try:
8
- data_str = "\n".join(str(data) for data in data_list)
9
- with open(self.table, "a") as fp:
10
- fp.write(data_str)
11
- log.info(f"save data, data length: {len(data_list)}")
12
- return True
13
- except Exception as e:
14
- return False
15
-
File without changes
@@ -1,13 +0,0 @@
1
- import os
2
-
3
-
4
- # model: 0, 1, 2
5
- MODEL = int(os.getenv("MODEL", "0"))
6
-
7
- # 重制score值的等待时间, 默认10分钟
8
- RESET_SCORE = int(os.getenv("RESET_SCORE", "600"))
9
-
10
- # 默认设置检查spider queue队列锁的存活时间为30s
11
- CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
12
-
13
-
File without changes
File without changes