cobweb-launcher 0.1.8__tar.gz → 0.1.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic.

Files changed (42)
  1. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/PKG-INFO +5 -1
  2. cobweb-launcher-0.1.10/cobweb/__init__.py +7 -0
  3. cobweb-launcher-0.1.10/cobweb/constant.py +24 -0
  4. cobweb-launcher-0.1.10/cobweb/db/__init__.py +3 -0
  5. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/redis_db.py +22 -32
  6. cobweb-launcher-0.1.10/cobweb/db/scheduler/__init__.py +1 -0
  7. cobweb-launcher-0.1.10/cobweb/db/scheduler/default.py +8 -0
  8. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/scheduler/textfile.py +2 -2
  9. cobweb-launcher-0.1.10/cobweb/db/storer/__init__.py +1 -0
  10. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/storer/console.py +2 -2
  11. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/storer/loghub.py +2 -2
  12. cobweb-launcher-0.1.8/cobweb/db/storer/redis.py → cobweb-launcher-0.1.10/cobweb/db/storer/textfile.py +2 -2
  13. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/decorators.py +1 -1
  14. cobweb-launcher-0.1.10/cobweb/equip/__init__.py +8 -0
  15. {cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.10/cobweb/equip}/distributed/launcher.py +15 -39
  16. {cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.10/cobweb/equip}/distributed/models.py +34 -25
  17. {cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.10/cobweb/equip}/single/launcher.py +18 -49
  18. {cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.10/cobweb/equip}/single/models.py +35 -25
  19. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/task.py +10 -3
  20. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb_launcher.egg-info/PKG-INFO +5 -1
  21. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb_launcher.egg-info/SOURCES.txt +8 -8
  22. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/setup.py +1 -1
  23. cobweb-launcher-0.1.8/cobweb/__init__.py +0 -11
  24. cobweb-launcher-0.1.8/cobweb/db/__init__.py +0 -2
  25. cobweb-launcher-0.1.8/cobweb/db/scheduler/default.py +0 -8
  26. cobweb-launcher-0.1.8/cobweb/db/storer/textfile.py +0 -15
  27. cobweb-launcher-0.1.8/cobweb/distributed/__init__.py +0 -0
  28. cobweb-launcher-0.1.8/cobweb/setting.py +0 -13
  29. cobweb-launcher-0.1.8/cobweb/single/__init__.py +0 -0
  30. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/LICENSE +0 -0
  31. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/README.md +0 -0
  32. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/bbb.py +0 -0
  33. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/oss_db.py +0 -0
  34. {cobweb-launcher-0.1.8/cobweb/db/scheduler → cobweb-launcher-0.1.10/cobweb/equip/distributed}/__init__.py +0 -0
  35. {cobweb-launcher-0.1.8/cobweb/db/storer → cobweb-launcher-0.1.10/cobweb/equip/single}/__init__.py +0 -0
  36. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/interface.py +0 -0
  37. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/log.py +0 -0
  38. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/utils.py +0 -0
  39. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  40. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb_launcher.egg-info/requires.txt +0 -0
  41. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb_launcher.egg-info/top_level.txt +0 -0
  42. {cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/setup.cfg +0 -0

{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cobweb-launcher
- Version: 0.1.8
+ Version: 0.1.10
  Summary: spider_hole
  Home-page: https://github.com/Juannie-PP/cobweb
  Author: Juannie-PP
@@ -11,6 +11,10 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.7
  Description-Content-Type: text/markdown
  License-File: LICENSE
+ Requires-Dist: requests>=2.19.1
+ Requires-Dist: oss2>=2.18.1
+ Requires-Dist: redis>=4.4.4
+ Requires-Dist: aliyun-log-python-sdk

  # cobweb


cobweb-launcher-0.1.10/cobweb/__init__.py (new file)

@@ -0,0 +1,7 @@
+ from .bbb import Seed, Queue, DBItem
+ from .task import Task
+ from .log import log
+ from .db.redis_db import RedisDB
+
+ from .equip.distributed.launcher import launcher
+ from .equip.single.launcher import launcher as single_launcher
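
For orientation, a minimal usage sketch of the names the new cobweb/__init__.py re-exports. The Task keyword arguments come from the Task signature later in this diff; the project/task name values, the seed URL and the crawl function are illustrative only, not part of the package.

from cobweb import Task, launcher, single_launcher

# hypothetical values; Task's signature appears in the cobweb/task.py hunk below
task = Task(seeds="https://example.com", project="demo", task_name="spider")

@launcher(task)              # or @single_launcher(task) for the single-process variant
def crawl(item, seed):
    ...
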

cobweb-launcher-0.1.10/cobweb/constant.py (new file)

@@ -0,0 +1,24 @@
+
+
+ class LauncherModel:
+     task = "launcher model: task"
+     resident = "launcher model: resident"
+
+
+ class LogModel:
+     simple = "log model: simple"
+     common = "log model: common"
+     detailed = "log model: detailed"
+
+
+ class DealModel:
+     failure = "deal model: failure"
+     success = "deal model: success"
+     polling = "deal model: polling"
+
+
+ class Setting:
+     RESET_SCORE = None
+     CHECK_LOCK_TIME = None
+     DEAL_MODEL = None
+     LAUNCHER_MODEL = None

cobweb-launcher-0.1.10/cobweb/db/__init__.py (new file)

@@ -0,0 +1,3 @@
+ from .. import log, Seed, decorators
+ from ..constant import Setting, DealModel
+ from ..interface import SchedulerInterface, StorerInterface

{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/redis_db.py

@@ -1,7 +1,8 @@
  import time
  import redis
- from cobweb import Seed, log
- from cobweb.decorators import check_redis_status
+ from . import log, decorators, Seed, Setting, DealModel
+ # from cobweb.decorators import decorators.check_redis_status
+ # from cobweb.constant import Setting, DealModel


  class RedisDB:
@@ -11,9 +12,6 @@ class RedisDB:
  project: str,
  task_name: str,
  config: dict,
- model: int,
- cs_lct: int,
- rs_time: int,
  ):
  pool = redis.ConnectionPool(**config)
  self.heartbeat_key = f"{project}:{task_name}:heartbeat" # redis type string
@@ -25,11 +23,8 @@ class RedisDB:
  self.check_lock = f"{project}:{task_name}:check_seed_lock" # redis type string
  self.scheduler_lock = f"{project}:{task_name}:scheduler_lock" # redis type string
  self.client = redis.Redis(connection_pool=pool)
- self.model = model
- self.cs_lct = cs_lct
- self.rs_time = rs_time

- @check_redis_status
+ @decorators.check_redis_status
  def _get_lock(self, key, t=15, timeout=3, sleep_time=0.1):
  begin_time = int(time.time())
  while True:
@@ -55,7 +50,7 @@ class RedisDB:
  log.info("ttl: " + str(ttl))
  return False

- @check_redis_status
+ @decorators.check_redis_status
  def _deal_seed(self, seeds, is_add: bool):
  if not seeds:
  return None
@@ -73,15 +68,15 @@ class RedisDB:
  if item_info:
  self.client.zadd(self.spider_key, mapping=item_info, nx=is_add, xx=not is_add)

- @check_redis_status
+ @decorators.check_redis_status
  def add_seed(self, seeds):
  self._deal_seed(seeds, is_add=True)

- @check_redis_status
+ @decorators.check_redis_status
  def reset_seed(self, seeds):
  self._deal_seed(seeds, is_add=False)

- @check_redis_status
+ @decorators.check_redis_status
  def del_seed(self, seeds, spider_status: bool = True):
  if not seeds:
  return None
@@ -92,18 +87,16 @@ class RedisDB:
  seeds = [seed if isinstance(seed, Seed) else Seed(seed) for seed in seeds]

  if seeds:
- # redis_key = self.succeed_key if spider_status else self.failed_key
  redis_key = None
- if spider_status:
- if isinstance(self.model, int) and self.model == 2:
- redis_key = self.succeed_key
- else:
+ if spider_status and Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
+ redis_key = self.succeed_key
+ elif not spider_status:
  redis_key = self.failed_key
  if redis_key:
  self.client.sadd(redis_key, *(str(seed) for seed in seeds))
  self.client.zrem(self.spider_key, *(seed.format_seed for seed in seeds))

- @check_redis_status
+ @decorators.check_redis_status
  def set_storer(self, key, seeds):
  if not seeds:
  return None
@@ -122,7 +115,7 @@ class RedisDB:
  self.client.zadd(self.storer_key % key, mapping=item_info)
  log.info(f"zadd storer key: length {len(item_info.keys())}")

- @check_redis_status
+ @decorators.check_redis_status
  def get_seed(self, length: int = 200):
  cs = time.time()

@@ -148,14 +141,14 @@ class RedisDB:
  log.info("push seeds into queue time: " + str(time.time() - cs))
  return result

- @check_redis_status
+ @decorators.check_redis_status
  def check_spider_queue(self, stop, storer_num):
  while not stop.is_set():
  # acquire the check lock every 15s, retrying for up to 600s; once acquired, set the lock TTL to ${cs_lct}s
- if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
+ if self._get_lock(key=self.check_lock, t=Setting.CHECK_LOCK_TIME, timeout=600, sleep_time=3):
  heartbeat = True if self.client.exists(self.heartbeat_key) else False
  # reset the score on restart, otherwise fetch scores from ${rs_time} minutes ago
- score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
+ score = -int(time.time()) + Setting.RESET_SCORE if heartbeat else "-inf"

  keys = self.client.keys(self.storer_key % "*")

@@ -170,7 +163,7 @@ class RedisDB:
  break
  for key in keys:
  self.client.zrem(key, *members)
- if self.model == 2:
+ if Setting.DEAL_MODEL in [DealModel.success, DealModel.polling]:
  self.client.sadd(self.succeed_key, *members)
  self.client.zrem(self.spider_key, *members)
  self.client.zrem(intersection_key, *members)
@@ -193,31 +186,28 @@ class RedisDB:
  if not heartbeat:
  self.client.setex(self.heartbeat_key, 15, "")

- # self.client.delete(self.check_lock)
- # time.sleep(3)
-
- @check_redis_status
+ @decorators.check_redis_status
  def set_heartbeat(self, stop):
  time.sleep(5)
  while not stop.is_set():
  self.client.setex(self.heartbeat_key, 5, "")
  time.sleep(3)

- # @check_redis_status
+ # @decorators.check_redis_status
  # def heartbeat(self):
  # """
  # return the remaining TTL of the heartbeat key
  # """
  # return self.client.ttl(self.heartbeat_key)

- @check_redis_status
+ @decorators.check_redis_status
  def spider_queue_length(self):
  return self.client.zcard(self.spider_key)

- @check_redis_status
+ @decorators.check_redis_status
  def ready_seed_length(self):
  return self.client.zcount(self.spider_key, min=0, max="+inf")

- @check_redis_status
+ @decorators.check_redis_status
  def get_scheduler_lock(self):
  return self._get_lock(self.scheduler_lock)
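
With model, cs_lct and rs_time gone from the constructor, RedisDB is now built only from the project, the task name and a redis connection dict, while the removed knobs are read from Setting (populated by init_task_env() in the cobweb/task.py hunk further down). A hedged construction sketch; the connection values are placeholders and the seed is illustrative:

from cobweb import Seed
from cobweb.db.redis_db import RedisDB

# placeholder connection values, passed straight into redis.ConnectionPool(**config)
redis_config = {"host": "127.0.0.1", "port": 6379, "db": 0}
db = RedisDB(project="demo", task_name="spider", config=redis_config)
db.add_seed([Seed("https://example.com")])   # Seed(...) construction mirrors del_seed above
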

cobweb-launcher-0.1.10/cobweb/db/scheduler/__init__.py (new file)

@@ -0,0 +1 @@
+ from .. import log, Seed, SchedulerInterface as Inf

cobweb-launcher-0.1.10/cobweb/db/scheduler/default.py (new file)

@@ -0,0 +1,8 @@
+ from . import Inf
+
+
+ class Default(Inf):
+
+     def schedule(self):
+         pass
+

{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/scheduler/textfile.py

@@ -1,7 +1,7 @@
- from cobweb import log, Seed, SchedulerInterface
+ from . import Inf, log, Seed


- class Textfile(SchedulerInterface):
+ class Textfile(Inf):

  index = None


cobweb-launcher-0.1.10/cobweb/db/storer/__init__.py (new file)

@@ -0,0 +1 @@
+ from .. import log, Seed, StorerInterface as Inf

{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/storer/console.py

@@ -1,7 +1,7 @@
- from cobweb import log, StorerInterface
+ from . import Inf, log


- class Console(StorerInterface):
+ class Console(Inf):

  def store(self, data_list):
  for item in data_list:

{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/db/storer/loghub.py

@@ -1,9 +1,9 @@
  import json
+ from . import Inf, log
  from aliyun.log import LogClient, LogItem, PutLogsRequest
- from cobweb import log, StorerInterface


- class Loghub(StorerInterface):
+ class Loghub(Inf):

  def __init__(self, **kwargs):
  super().__init__(**kwargs)

cobweb-launcher-0.1.8/cobweb/db/storer/redis.py → cobweb-launcher-0.1.10/cobweb/db/storer/textfile.py

@@ -1,7 +1,7 @@
- from cobweb import log, StorerInterface
+ from . import Inf, log


- class Redis(StorerInterface):
+ class Textfile(Inf):

  def store(self, data_list):
  try:

{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/decorators.py

@@ -1,5 +1,5 @@
  from functools import wraps
- from cobweb import log
+ from .log import log


  def check_redis_status(func):

cobweb-launcher-0.1.10/cobweb/equip/__init__.py (new file)

@@ -0,0 +1,8 @@
+ from .. import Queue, DBItem, RedisDB, Seed, log
+ from ..constant import Setting, DealModel
+ from ..utils import (
+     struct_queue_name as sqn,
+     restore_table_name as rtn,
+     parse_import_model as pim,
+     issubclass_cobweb_inf as ici
+ )

{cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.10/cobweb/equip}/distributed/launcher.py

@@ -1,15 +1,9 @@
  import time
  import threading
- from threading import Thread

+ from .. import log, sqn, rtn, pim
+ from .. import Queue, DBItem, RedisDB, Setting
  from .models import Scheduler, Spider, Storer
- from cobweb import log, Queue, DBItem, RedisDB
- from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
- from cobweb.utils import (
- struct_queue_name as sqn,
- restore_table_name as rtn,
- parse_import_model as pim,
- )


  def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
@@ -37,11 +31,10 @@ def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_
  )
  if (
  scheduler.stop and
- # not redis_ready_seed_length and
  not memory_seed_queue_length and
  not running_spider_thread_num
  ):
- if not MODEL:
+ if not Setting.LAUNCHER_MODEL:
  log.info("spider is done?")
  last.set()
  time.sleep(3)
@@ -58,7 +51,7 @@ def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_
  not redis_ready_seed_length and
  not redis_spider_seed_length
  ):
- if MODEL:
+ if Setting.LAUNCHER_MODEL:
  log.info("waiting for push seeds...")
  status = "waiting"
  time.sleep(30)
@@ -89,20 +82,6 @@ def launcher(task):
  :param task: task configuration info
  """
  def decorator(func):
- """
- Item:
- Textfile()
- Loghub()
- Console()
- e.g.
- task.fields = "a,b"
- func(item, seed)
- a = "a"
- b = "b"
- data = {"a": "a", "b": "b"}
- yield item.Loghub(**data)
- yield item.Loghub(a=a, b=b)
- """
  storer_list = []

  # program finished event
@@ -111,10 +90,7 @@ def launcher(task):
  stop = threading.Event()

  # initialize redis info
- redis_db = RedisDB(
- task.project, task.task_name, task.redis_info,
- model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
- )
+ redis_db = RedisDB(task.project, task.task_name, task.redis_info)

  log.info("初始化cobweb!")

@@ -139,9 +115,6 @@ def launcher(task):
  length=task.scheduler_queue_length, config=scheduler_config
  )

- # initialize the spider
- spider = Spider(seed_queue, task.max_retries)
-
  # parse storer info
  storer_info_list = task.storer_info or []
  if not isinstance(storer_info_list, list):
@@ -178,14 +151,17 @@ def launcher(task):
  )
  storer_list.append(storer)

- Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
- Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
+ # initialize the spider
+ spider = Spider(seed_queue, storer_list and True, task.max_retries)
+
+ threading.Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
+ threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

  # push initial seeds
  # seeds = start_seeds(task.start_seed)
  redis_db.add_seed(task.seeds)
  # start the scheduler, dispatch seeds to the redis queue
- Thread(
+ threading.Thread(
  # name="xxxx_schedule_seeds",
  target=scheduler.schedule_seed,
  args=(
@@ -196,7 +172,7 @@ def launcher(task):
  ).start()

  # start the scheduler, dispatch the task queue
- Thread(
+ threading.Thread(
  # name="xxxx_schedule_task",
  target=scheduler.schedule_task,
  args=(
@@ -207,7 +183,7 @@ def launcher(task):

  # start spiders
  for index in range(task.spider_num):
- Thread(
+ threading.Thread(
  # name=f"xxxx_spider_task:{index}",
  target=spider.spider_task,
  args=(
@@ -218,7 +194,7 @@ def launcher(task):

  # start storers
  for storer in storer_list:
- Thread(
+ threading.Thread(
  # name=f"xxxx_store_task:{storer.table}",
  target=storer.store_task,
  args=(
@@ -228,7 +204,7 @@ def launcher(task):
  )
  ).start()

- Thread(
+ threading.Thread(
  # name="check_spider",
  target=check,
  args=(

{cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.10/cobweb/equip}/distributed/models.py

@@ -1,8 +1,9 @@
  import time
  from hashlib import md5
- from cobweb import log, Queue, Seed
- from cobweb.utils import issubclass_cobweb_inf
+ from inspect import isgenerator

+ from .. import log, ici
+ from .. import DealModel, Queue, Seed
  # from pympler import asizeof


@@ -11,7 +12,7 @@ class Scheduler:
  def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):

  inf_name = "SchedulerInterface"
- if not issubclass_cobweb_inf(self.__class__, inf_name):
+ if not ici(self.__class__, inf_name):
  raise Exception("not have schedule function!")

  if self.__class__.__name__ == "Default":
@@ -48,9 +49,10 @@

  class Spider:

- def __init__(self, queue, max_retries=5):
+ def __init__(self, queue, storage, max_retries=5):
  self.spider_in_progress = Queue()
  self.max_retries = max_retries
+ self.storage = storage
  self.queue = queue

  def spider_task(self, stop, func, item, del_seed):
@@ -65,33 +67,42 @@
  try:
  self.spider_in_progress.push(1, direct_insertion=True)
  # log.info("spider seed: " + str(seed))
- ret_count = 0
- status = None
+
  store_queue = None
  store_data = list()
- for it in func(item, seed):
- ret_count += 1
+
+ iterators = func(item, seed)
+
+ if not isgenerator(iterators):
+ if not self.storage:
+ del_seed(seed, spider_status=True)
+ continue
+ raise TypeError(f"{func.__name__} isn't a generator")
+
+ for it in iterators:
  if getattr(it, "table_name", None):
  if not store_queue:
  store_queue = it.queue()
  store_data.append(it.struct_data)
  elif isinstance(it, Seed):
  self.queue.push(it)
- elif any(isinstance(it, t) for t in (list, tuple)):
- self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
- elif isinstance(it, bool):
- status = it
+
+ elif isinstance(it, str) and it == DealModel.polling:
+ self.queue.push(seed)
+ break
+ elif isinstance(it, str) and it == DealModel.success:
+ del_seed(seed, spider_status=True)
+ break
+ elif isinstance(it, str) and it == DealModel.failure:
+ del_seed(seed, spider_status=False)
+ break
+ else:
+ raise TypeError("yield value type error!")

  if store_queue and store_data:
  store_data.append(seed)
  store_queue.push(store_data)

- if status:
- del_seed(seed, spider_status=True)
- elif not ret_count or status is False:
- seed._retry += 1
- self.queue.push(seed)
-
  except Exception as e:
  seed._retry += 1
  self.queue.push(seed)
@@ -106,7 +117,7 @@ class Storer:
  def store_task(self, stop, last, reset_seed, set_storer):

  inf_name = "StorerInterface"
- if not issubclass_cobweb_inf(self.__class__, inf_name):
+ if not ici(self.__class__, inf_name):
  return None

  if not getattr(self, "store", None):
@@ -131,12 +142,10 @@ class Storer:
  continue
  data_list.append(data)

- if data_list:
- if self.store(data_list):
- set_storer(store_key_id, seeds)
- else:
- reset_seed(seeds)
- continue
+ if self.store(data_list):
+ set_storer(store_key_id, seeds)
+ else:
+ reset_seed(seeds)

  time.sleep(3)
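
The reworked spider_task drops the old bool/ret_count status handling: the decorated spider function must now be a generator, and it settles each seed by yielding one of the DealModel strings (storage items carrying a table_name and new Seed objects may be yielded before that; lists of seeds are no longer accepted). A hedged sketch of a spider function under this contract; the Loghub item and its fields are illustrative, borrowed from the docstring removed in the launcher hunks above:

from cobweb import Seed
from cobweb.constant import DealModel

def crawl(item, seed):
    # enqueue a follow-up seed
    yield Seed("https://example.com/page/2")
    # hand a record to a configured storer (field names are made up)
    yield item.Loghub(a="a", b="b")
    # finally tell the framework how this seed was dealt with;
    # DealModel.polling re-queues it, DealModel.failure marks it failed
    yield DealModel.success
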
 

{cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.10/cobweb/equip}/single/launcher.py

@@ -1,15 +1,9 @@
  import time
  import threading
- from threading import Thread

+ from .. import log, sqn, rtn, pim
+ from .. import Queue, DBItem, RedisDB, Setting
  from .models import Scheduler, Spider, Storer
- from cobweb import log, Queue, DBItem, RedisDB
- from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
- from cobweb.utils import (
- struct_queue_name as sqn,
- restore_table_name as rtn,
- parse_import_model as pim,
- )


  def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
@@ -29,27 +23,26 @@ def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue
  redis_ready_seed_length = ready_seed_length()
  redis_spider_seed_length = spider_queue_length()
  memory_seed_queue_length = scheduler.queue.length
- storer_upload_queue_length = storer.queue.length
+ storer_upload_queue_length = storer.queue.length if storer else None
  if (
  scheduler.stop and
- # not redis_ready_seed_length and
  not memory_seed_queue_length and
  not running_spider_thread_num
  ):
- if not MODEL:
+ if not Setting.LAUNCHER_MODEL:
  log.info("spider is done?")
  last.set()
  time.sleep(3)
  storer_queue_empty = True
- if storer.queue.length:
+ if storer and storer.queue.length:
  storer_queue_empty = False
- storer_upload_queue_length = storer.queue.length
+ storer_upload_queue_length = storer.queue.length if storer else None
  if (
  storer_queue_empty and
  not redis_ready_seed_length and
  not redis_spider_seed_length
  ):
- if MODEL:
+ if Setting.LAUNCHER_MODEL:
  log.info("waiting for push seeds...")
  status = "waiting"
  time.sleep(30)
@@ -78,32 +71,13 @@ def launcher(task):
  :param task: task configuration info
  """
  def decorator(func):
- """
- Item:
- Textfile()
- Loghub()
- Console()
- e.g.
- task.fields = "a,b"
- func(item, seed)
- a = "a"
- b = "b"
- data = {"a": "a", "b": "b"}
- yield item.Loghub(**data)
- yield item.Loghub(a=a, b=b)
- """
- storer_list = []
-
  # program finished event
  last = threading.Event()
  # stop-crawling event
  stop = threading.Event()

  # initialize redis info
- redis_db = RedisDB(
- task.project, task.task_name, task.redis_info,
- model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
- )
+ redis_db = RedisDB(task.project, task.task_name, task.redis_info)

  # new item
  item = type("Item", (object,), {"redis_client": redis_db.client})()
@@ -113,7 +87,6 @@ def launcher(task):
  seed_queue = Queue()

  scheduler_info = task.scheduler_info or dict()
-
  # dynamically subclass the scheduler
  sql = scheduler_info.get("sql")
  table = scheduler_info.get("table")
@@ -123,22 +96,15 @@ def launcher(task):
  DB, class_name = pim(scheduler_db, "scheduler")
  # SchedulerDB, table, sql, length, size, config = task.scheduler_info
  SchedulerTmp = type(class_name, (Scheduler, DB), {})
-
  # initialize the scheduler
  scheduler = SchedulerTmp(
  table=table, sql=sql, size=size, queue=seed_queue,
  length=task.scheduler_queue_length, config=scheduler_config
  )

- # initialize the spider
- spider = Spider(seed_queue, task.max_retries)
-
  storer = None
-
- # parse storer info
  storer_info = task.storer_info or dict()

- # for storer_info in storer_info_list:
  if storer_info:
  storer_db = storer_info["db"]
  fields = storer_info["fields"]
@@ -166,14 +132,17 @@ def launcher(task):
  queue=queue, config=storer_config
  )

- Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
- Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
+ # initialize the spider
+ spider = Spider(seed_queue, storer and True, task.max_retries)
+
+ threading.Thread(target=redis_db.check_spider_queue, args=(stop, 0)).start()
+ threading.Thread(target=redis_db.set_heartbeat, args=(stop,)).start()

  # push initial seeds
  # seeds = start_seeds(task.start_seed)
  redis_db.add_seed(task.seeds)
  # start the scheduler, dispatch seeds to the redis queue
- Thread(
+ threading.Thread(
  # name="xxxx_schedule_seeds",
  target=scheduler.schedule_seed,
  args=(
@@ -184,7 +153,7 @@ def launcher(task):
  ).start()

  # start the scheduler, dispatch the task queue
- Thread(
+ threading.Thread(
  # name="xxxx_schedule_task",
  target=scheduler.schedule_task,
  args=(
@@ -195,7 +164,7 @@ def launcher(task):

  # start spiders
  for index in range(task.spider_num):
- Thread(
+ threading.Thread(
  # name=f"xxxx_spider_task:{index}",
  target=spider.spider_task,
  args=(
@@ -206,7 +175,7 @@ def launcher(task):

  # start the storer
  if storer:
- Thread(
+ threading.Thread(
  # name=f"xxxx_store_task:{storer.table}",
  target=storer.store_task,
  args=(
@@ -216,7 +185,7 @@ def launcher(task):
  )
  ).start()

- Thread(
+ threading.Thread(
  # name="check_spider",
  target=check,
  args=(

{cobweb-launcher-0.1.8/cobweb → cobweb-launcher-0.1.10/cobweb/equip}/single/models.py

@@ -1,7 +1,8 @@
  import time
- from cobweb import log, Queue, Seed
- from cobweb.utils import issubclass_cobweb_inf
+ from inspect import isgenerator
  # from pympler import asizeof
+ from .. import log, ici
+ from .. import DealModel, Queue, Seed


  class Scheduler:
@@ -9,7 +10,7 @@ class Scheduler:
  def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):

  inf_name = "SchedulerInterface"
- if not issubclass_cobweb_inf(self.__class__, inf_name):
+ if not ici(self.__class__, inf_name):
  raise Exception("not have schedule function!")

  if self.__class__.__name__ == "Default":
@@ -46,27 +47,37 @@

  class Spider:

- def __init__(self, queue, max_retries=5):
+ def __init__(self, queue, storage, max_retries=5):
  self.spider_in_progress = Queue()
  self.max_retries = max_retries
+ self.storage = storage
  self.queue = queue

  def spider_task(self, stop, func, item, del_seed):
  while not stop.is_set():
+
  seed = self.queue.pop()
+
  if not seed:
  time.sleep(3)
  continue
+
  elif seed._retry >= self.max_retries:
  del_seed(seed, spider_status=False)
  continue
+
  try:
  self.spider_in_progress.push(1, direct_insertion=True)
  # log.info("spider seed: " + str(seed))
- ret_count = 0
- status = None
- for it in func(item, seed):
- ret_count += 1
+ iterators = func(item, seed)
+
+ if not isgenerator(iterators):
+ if not self.storage:
+ del_seed(seed, spider_status=True)
+ continue
+ raise TypeError(f"{func.__name__} isn't a generator")
+
+ for it in iterators:
  if getattr(it, "table_name", None):
  store_queue = it.queue()
  store_queue.push(
@@ -75,16 +86,18 @@ class Spider:
  )
  elif isinstance(it, Seed):
  self.queue.push(it)
- elif any(isinstance(it, t) for t in (list, tuple)):
- self.queue.push([s if isinstance(s, Seed) else Seed(s) for s in it])
- elif isinstance(it, bool):
- status = it

- if status:
- del_seed(seed, spider_status=True)
- elif not ret_count or status is False:
- seed._retry += 1
- self.queue.push(seed)
+ elif isinstance(it, str) and it == DealModel.polling:
+ self.queue.push(seed)
+ break
+ elif isinstance(it, str) and it == DealModel.success:
+ del_seed(seed, spider_status=True)
+ break
+ elif isinstance(it, str) and it == DealModel.failure:
+ del_seed(seed, spider_status=False)
+ break
+ else:
+ raise TypeError("yield value type error!")

  except Exception as e:
  seed._retry += 1
@@ -100,7 +113,7 @@ class Storer:
  def store_task(self, stop, last, reset_seed, del_seed):

  inf_name = "StorerInterface"
- if not issubclass_cobweb_inf(self.__class__, inf_name):
+ if not ici(self.__class__, inf_name):
  return None

  if not getattr(self, "store", None):
@@ -121,13 +134,10 @@ class Storer:
  seeds.append(seed)
  data_list.append(data)

- if data_list:
- if self.store(data_list):
- del_seed(seeds)
- else:
- reset_seed(seeds)
- log.info("reset seeds!")
- continue
+ if self.store(data_list):
+ del_seed(seeds)
+ else:
+ reset_seed(seeds)

  time.sleep(3)


{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb/task.py

@@ -1,11 +1,19 @@
+ import os
+ from .constant import *
  from .utils import parse_info, struct_start_seeds


+ def init_task_env():
+ Setting.RESET_SCORE = int(os.getenv("RESET_SCORE", 600))
+ Setting.CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
+ Setting.DEAL_MODEL = os.getenv("DEAL_MODEL", DealModel.failure)
+ Setting.LAUNCHER_MODEL = os.getenv("LAUNCHER_MODEL", LauncherModel.task)
+
+
  class Task:

  def __init__(
  self,
- # model=None,
  seeds=None,
  project=None,
  task_name=None,
@@ -31,8 +39,7 @@ class Task:
  :param storer_queue_length:
  :param scheduler_queue_length:
  """
- # self.model = model
-
+ init_task_env()
  self.seeds = struct_start_seeds(seeds)
  self.project = project or "test"
  self.task_name = task_name or "spider"
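
Since Task.__init__ now calls init_task_env(), the knobs that lived in the deleted cobweb/setting.py are read from environment variables at Task construction time. A sketch of overriding them; the values are examples, and the constant strings come from cobweb/constant.py above:

import os

# override before constructing Task; defaults are 600, 30,
# DealModel.failure and LauncherModel.task respectively
os.environ["RESET_SCORE"] = "300"
os.environ["CHECK_LOCK_TIME"] = "60"
os.environ["DEAL_MODEL"] = "deal model: success"           # DealModel.success
os.environ["LAUNCHER_MODEL"] = "launcher model: resident"  # LauncherModel.resident

from cobweb import Task
task = Task(project="demo", task_name="spider")
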

{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb_launcher.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: cobweb-launcher
- Version: 0.1.8
+ Version: 0.1.10
  Summary: spider_hole
  Home-page: https://github.com/Juannie-PP/cobweb
  Author: Juannie-PP
@@ -11,6 +11,10 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.7
  Description-Content-Type: text/markdown
  License-File: LICENSE
+ Requires-Dist: requests>=2.19.1
+ Requires-Dist: oss2>=2.18.1
+ Requires-Dist: redis>=4.4.4
+ Requires-Dist: aliyun-log-python-sdk

  # cobweb


{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/cobweb_launcher.egg-info/SOURCES.txt

@@ -3,10 +3,10 @@ README.md
  setup.py
  cobweb/__init__.py
  cobweb/bbb.py
+ cobweb/constant.py
  cobweb/decorators.py
  cobweb/interface.py
  cobweb/log.py
- cobweb/setting.py
  cobweb/task.py
  cobweb/utils.py
  cobweb/db/__init__.py
@@ -18,14 +18,14 @@ cobweb/db/scheduler/textfile.py
  cobweb/db/storer/__init__.py
  cobweb/db/storer/console.py
  cobweb/db/storer/loghub.py
- cobweb/db/storer/redis.py
  cobweb/db/storer/textfile.py
- cobweb/distributed/__init__.py
- cobweb/distributed/launcher.py
- cobweb/distributed/models.py
- cobweb/single/__init__.py
- cobweb/single/launcher.py
- cobweb/single/models.py
+ cobweb/equip/__init__.py
+ cobweb/equip/distributed/__init__.py
+ cobweb/equip/distributed/launcher.py
+ cobweb/equip/distributed/models.py
+ cobweb/equip/single/__init__.py
+ cobweb/equip/single/launcher.py
+ cobweb/equip/single/models.py
  cobweb_launcher.egg-info/PKG-INFO
  cobweb_launcher.egg-info/SOURCES.txt
  cobweb_launcher.egg-info/dependency_links.txt

{cobweb-launcher-0.1.8 → cobweb-launcher-0.1.10}/setup.py

@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

  setup(
  name="cobweb-launcher",
- version="0.1.8",
+ version="0.1.10",
  packages=find_packages(),
  url="https://github.com/Juannie-PP/cobweb",
  license="MIT",

cobweb-launcher-0.1.8/cobweb/__init__.py (deleted)

@@ -1,11 +0,0 @@
- from .bbb import Seed, Queue, DBItem
- from .task import Task
- from .log import log
- from .interface import SchedulerInterface, StorerInterface
- from .db.redis_db import RedisDB
- from .db.oss_db import OssDB
- from .distributed.launcher import launcher
- from .single.launcher import launcher as single_launcher
- from . import setting
-
-

cobweb-launcher-0.1.8/cobweb/db/__init__.py (deleted)

@@ -1,2 +0,0 @@
- from . import oss_db, redis_db
- from . import scheduler, storer

cobweb-launcher-0.1.8/cobweb/db/scheduler/default.py (deleted)

@@ -1,8 +0,0 @@
- from cobweb import SchedulerInterface
-
-
- class Default(SchedulerInterface):
-
-     def schedule(self):
-         pass
-

cobweb-launcher-0.1.8/cobweb/db/storer/textfile.py (deleted)

@@ -1,15 +0,0 @@
- from cobweb import log, StorerInterface
-
-
- class Textfile(StorerInterface):
-
-     def store(self, data_list):
-         try:
-             data_str = "\n".join(str(data) for data in data_list)
-             with open(self.table, "a") as fp:
-                 fp.write(data_str)
-             log.info(f"save data, data length: {len(data_list)}")
-             return True
-         except Exception as e:
-             return False
-
cobweb-launcher-0.1.8/cobweb/distributed/__init__.py: file without changes

cobweb-launcher-0.1.8/cobweb/setting.py (deleted)

@@ -1,13 +0,0 @@
- import os
-
-
- # model: 0, 1, 2
- MODEL = int(os.getenv("MODEL", "0"))
-
- # waiting time before resetting the score, 10 minutes by default
- RESET_SCORE = int(os.getenv("RESET_SCORE", "600"))
-
- # default TTL of the spider-queue check lock: 30s
- CHECK_LOCK_TIME = int(os.getenv("CHECK_LOCK_TIME", 30))
-
-
cobweb-launcher-0.1.8/cobweb/single/__init__.py: file without changes