cobweb-launcher 1.3.12.tar.gz → 1.3.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {cobweb-launcher-1.3.12/cobweb_launcher.egg-info → cobweb-launcher-1.3.13}/PKG-INFO +1 -1
  2. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/__init__.py +9 -8
  3. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/basic.py +2 -2
  4. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/crawlers/crawler.py +5 -10
  5. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher.py +12 -7
  6. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher_pro.py +5 -5
  7. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/pipelines/pipeline.py +2 -0
  8. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13/cobweb_launcher.egg-info}/PKG-INFO +1 -1
  9. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/setup.py +1 -1
  10. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/LICENSE +0 -0
  11. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/README.md +0 -0
  12. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/__init__.py +0 -0
  13. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/common_queue.py +0 -0
  14. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/dotting.py +0 -0
  15. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/item.py +0 -0
  16. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/log.py +0 -0
  17. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/request.py +0 -0
  18. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/response.py +0 -0
  19. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/seed.py +0 -0
  20. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/constant.py +0 -0
  21. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/crawlers/__init__.py +0 -0
  22. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/db/__init__.py +0 -0
  23. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/db/api_db.py +0 -0
  24. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/db/redis_db.py +0 -0
  25. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/exceptions/__init__.py +0 -0
  26. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/exceptions/oss_db_exception.py +0 -0
  27. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/launchers/__init__.py +0 -0
  28. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher_air.py +0 -0
  29. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher_api.py +0 -0
  30. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/pipelines/__init__.py +0 -0
  31. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/pipelines/pipeline_console.py +0 -0
  32. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/pipelines/pipeline_loghub.py +0 -0
  33. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/schedulers/__init__.py +0 -0
  34. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/schedulers/scheduler_api.py +0 -0
  35. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/schedulers/scheduler_redis.py +0 -0
  36. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/setting.py +0 -0
  37. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/utils/__init__.py +0 -0
  38. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/utils/bloom.py +0 -0
  39. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/utils/oss.py +0 -0
  40. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/utils/tools.py +0 -0
  41. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
  42. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  43. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/requires.txt +0 -0
  44. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb_launcher.egg-info/top_level.txt +0 -0
  45. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/setup.cfg +0 -0
  46. {cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/test/test.py +0 -0
{cobweb-launcher-1.3.12/cobweb_launcher.egg-info → cobweb-launcher-1.3.13}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.3.12
+Version: 1.3.13
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP

{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/__init__.py
@@ -100,14 +100,15 @@ class Decorators:
     def pause(func):
         @wraps(func)
         def wrapper(self, *args, **kwargs):
-            while not self.pause.is_set():
-                try:
-                    func(self)
-                except Exception as e:
-                    logger.info(f"{func.__name__}: " + str(e))
-                finally:
-                    time.sleep(0.1)
-            logger.info(f"{func.__name__}: close!")
+            while not self.stop.is_set():
+                while not self.pause.is_set():
+                    try:
+                        func(self)
+                    except Exception as e:
+                        logger.info(f"{func.__name__}: " + str(e))
+                    finally:
+                        time.sleep(0.1)
+            # logger.info(f"{func.__name__}: close!")

         return wrapper

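Note: the rewritten Decorators.pause wrapper changes the pause semantics. In 1.3.12, setting the pause event ended the decorated worker loop for good; in 1.3.13 the loop is nested inside a stop loop, so pause only suspends the worker and stop shuts it down. As written, the outer stop check is only reached while pause is set, so a shutdown presumably sets pause before stop. A minimal stand-alone sketch of the new semantics (not the cobweb code itself; the idle sleep while paused is added here for illustration):

import threading
import time

stop = threading.Event()
pause = threading.Event()

def worker():
    while not stop.is_set():          # outer loop: alive until shutdown
        while not pause.is_set():     # inner loop: works until paused
            print("working")
            time.sleep(0.1)
        time.sleep(0.1)               # idle while paused (sketch only)

t = threading.Thread(target=worker, daemon=True)
t.start()
time.sleep(0.3)
pause.set()        # suspends the worker; the thread stays alive
time.sleep(0.3)
pause.clear()      # the same thread resumes, no restart needed
time.sleep(0.3)
pause.set()        # the inner loop only exits while paused...
stop.set()         # ...so pause first, then stop, ends the worker
t.join(timeout=1)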

{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/base/basic.py
@@ -142,7 +142,7 @@ class Request:

     def __init__(
         self,
-        url,
+        # url,
         seed,
         random_ua=True,
         check_status_code=True,
@@ -152,7 +152,7 @@ class Request:
         status=None,
         **kwargs
     ):
-        self.url = url
+        # self.url = url
         self.check_status_code = check_status_code
         self.request_setting = {}


{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/crawlers/crawler.py
@@ -13,15 +13,15 @@ from cobweb.base import (
     ConsoleItem,
     Decorators,
     TaskQueue,
-    logger
 )
 from cobweb.constant import DealModel


 class Crawler(threading.Thread):

-    def __init__(self, pause, custom_func: Union[Mapping[str, Callable]]):
+    def __init__(self, stop, pause, custom_func: Union[Mapping[str, Callable]]):
         super().__init__()
+        self.stop = stop
         self.pause = pause
         for func_name, _callable in custom_func.items():
             if isinstance(_callable, Callable):
@@ -57,12 +57,11 @@ class Crawler(threading.Thread):
         # member, priority = seed_info
         # seed = Seed(member, priority=priority)
         if seed.params.retry > self.spider_max_retries:
-            # seed.params.seed_status = DealModel.fail
             TaskQueue.DOT.build(
                 topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
-                # cost_time=round(time.time() - start_time, 2),
                 process_task_type=seed.__class__.__name__,
                 seed_status=DealModel.fail,
+                retries=seed.params.retry,
                 **seed.to_dict
             )
         else:
@@ -75,17 +74,12 @@ class Crawler(threading.Thread):
         thread_sleep = 0.1
         if TaskQueue.RESPONSE.length >= self.download_queue_size:
             thread_sleep = 5
-            # logger.info(f"download queue is full, sleep {thread_sleep}s")
         elif request_item := TaskQueue.DOWNLOAD.pop():
-            # member, priority = request_info
-            #
-            # request_setting = json.loads(member)
-            # request_item = Request(seed=member, **request_setting)
             if request_item.params.retry > self.spider_max_retries:
                 TaskQueue.DOT.build(
                     topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
-                    # cost_time=round(time.time() - start_time, 2),
                     process_task_type=request_item.__class__.__name__,
+                    retries=request_item.params.retry,
                     seed_status=DealModel.fail,
                     **request_item.to_dict
                 )
@@ -105,6 +99,7 @@ class Crawler(threading.Thread):
                 topic=f"{os.getenv('PROJECT')}:{os.getenv('TASK')}",
                 process_task_type=response_item.__class__.__name__,
                 seed_status=DealModel.fail,
+                retries=response_item.params.retry,
                 **response_item.to_dict
             )
             TaskQueue.DONE.push(response_item.seed)
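Note: all three TaskQueue.DOT.build(...) call sites above now attach retries=<item>.params.retry when a seed, request, or response has exceeded spider_max_retries, so the failure record carries the number of attempts spent. A minimal sketch of that guard pattern (handle, emit, and process are stand-in names, not the cobweb API):

from types import SimpleNamespace

def handle(item, max_retries, emit, process):
    # Mirrors the hunks above: exhausted items are reported as failures
    # along with their retry count; everything else keeps processing.
    if item.params.retry > max_retries:
        emit(
            process_task_type=type(item).__name__,
            seed_status="fail",         # DealModel.fail in the real code
            retries=item.params.retry,  # field added in 1.3.13
        )
    else:
        process(item)

seed = SimpleNamespace(params=SimpleNamespace(retry=6))
handle(seed, max_retries=5,
       emit=lambda **fields: print("dot:", fields),
       process=lambda item: print("processing", item))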

{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher.py
@@ -72,7 +72,7 @@ class Launcher(threading.Thread):
         self.crawler_path = setting.CRAWLER
         self.pipeline_path = setting.PIPELINE

-        self._threads = []
+        self._thread_info = {}

         self._task_info = dict(todo={}, download={})

@@ -131,9 +131,12 @@
     def check_alive(self):
         while not self.stop.is_set():
             if not self.pause.is_set():
-                for thread in self._threads:
-                    if not thread.is_alive():
-                        thread.start()
+                for name, thread_info in self._thread_info.items():
+                    instance = thread_info['instance']
+                    if not instance.is_alive():
+                        instance = threading.Thread(name=name, target=thread_info['func'], args=())
+                        self._thread_info[name] = dict(instance=instance, func=thread_info['func'])
+                        instance.start()
             time.sleep(1)

     def _add_thread(self, func, num=1, obj=None, name=None, args=()):
@@ -141,7 +144,9 @@
         name = obj.__class__.__name__ + ":" + (name or func.__name__)
         for i in range(num):
             func_name = name + "_" + str(i) if num > 1 else name
-            self._threads.append(threading.Thread(name=func_name, target=func, args=()))
+            instance = threading.Thread(name=func_name, target=func, args=())
+            self._thread_info[func_name] = dict(instance=instance, func=func)
+            instance.start()

     @Decorators.stop
     def _polling(self):
@@ -188,8 +193,8 @@
         Crawler = dynamic_load_class(self.crawler_path)
         Pipeline = dynamic_load_class(self.pipeline_path)

-        crawler = Crawler(pause=self.pause, custom_func=self.custom_func)
-        pipeline = Pipeline(pause=self.pause)
+        crawler = Crawler(stop=self.stop, pause=self.pause, custom_func=self.custom_func)
+        pipeline = Pipeline(stop=self.stop, pause=self.pause)

         self._add_thread(obj=crawler, func=crawler.build_request_item)
         self._add_thread(obj=crawler, func=crawler.build_download_item, num=self.spider_thread_num)
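Note: the move from a _threads list to a _thread_info dict fixes worker revival. A finished threading.Thread cannot be started again (start() raises RuntimeError), so the old check_alive could start each worker once but never restart one that had died. Keeping the target callable next to each Thread instance lets check_alive build a fresh thread when one dies, and _add_thread now starts threads immediately instead of leaving that to check_alive. A stand-alone sketch of the pattern (simplified, not the launcher code):

import threading

thread_info = {}

def add_thread(name, func):
    instance = threading.Thread(name=name, target=func)
    thread_info[name] = dict(instance=instance, func=func)
    instance.start()

def revive_dead():
    # Thread objects are single-use, so a dead worker gets a brand-new
    # instance built from the stored target, as check_alive does above.
    for name, info in list(thread_info.items()):
        if not info["instance"].is_alive():
            fresh = threading.Thread(name=name, target=info["func"])
            thread_info[name] = dict(instance=fresh, func=info["func"])
            fresh.start()

add_thread("demo", lambda: None)
old = thread_info["demo"]["instance"]
old.join()      # the worker has finished; old.start() would now raise
revive_dead()   # a fresh Thread object transparently replaces it
print(thread_info["demo"]["instance"] is not old)  # True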

{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/launchers/launcher_pro.py
@@ -1,6 +1,6 @@
 import time

-from cobweb.base import TaskQueue, Decorators
+from cobweb.base import TaskQueue, Decorators, Seed, Request
 from cobweb.schedulers import RedisScheduler
 from .launcher import Launcher

@@ -16,13 +16,13 @@ class LauncherPro(Launcher):
     @Decorators.stop
     def _schedule(self):
         thread_sleep = self.scheduling_wait_time
-        for q, key, size, item_info in [
-            (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"]),
-            (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"]),
+        for q, key, size, item_info, Cls in [
+            (TaskQueue.TODO, self._redis_todo, self.todo_queue_size, self._task_info["todo"], Seed),
+            (TaskQueue.DOWNLOAD, self._redis_download, self.download_queue_size, self._task_info["download"], Request),
         ]:
             if q.length < size:
                 for member, priority in self._scheduler.schedule(key, self.scheduling_size):
-                    q.push((member, priority), direct_insertion=True)
+                    q.push(Cls(member, priority=priority))
                     self.add_working_item(key.split(":")[-1], member, priority)
                     thread_sleep = 0.1
         time.sleep(thread_sleep)
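Note: _schedule previously pushed raw (member, priority) tuples with direct_insertion=True; it now wraps each scheduled member in the queue's item type (Seed for TODO, Request for DOWNLOAD) before pushing, so consumers receive typed objects. A simplified sketch of the shape of this change (toy classes and plain lists in place of the cobweb types):

class Seed:
    def __init__(self, member, priority=0):
        self.member, self.priority = member, priority

class Request(Seed):
    pass

def schedule(queue, scheduled, cls):
    for member, priority in scheduled:
        # 1.3.12: queue.append((member, priority))
        queue.append(cls(member, priority=priority))  # 1.3.13: typed item

todo, download = [], []
schedule(todo, [("seed-1", 100)], Seed)
schedule(download, [("req-1", 50)], Request)
print(type(todo[0]).__name__, type(download[0]).__name__)  # Seed Request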

{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/cobweb/pipelines/pipeline.py
@@ -10,9 +10,11 @@ class Pipeline(ABC):

     def __init__(
         self,
+        stop: threading.Event,
         pause: threading.Event,
     ):
         super().__init__()
+        self.stop = stop
         self.pause = pause
         self.upload_queue_size = setting.UPLOAD_QUEUE_SIZE
         self.upload_wait_time = setting.UPLOAD_WAIT_TIME

{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13/cobweb_launcher.egg-info}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: cobweb-launcher
-Version: 1.3.12
+Version: 1.3.13
 Summary: spider_hole
 Home-page: https://github.com/Juannie-PP/cobweb
 Author: Juannie-PP

{cobweb-launcher-1.3.12 → cobweb-launcher-1.3.13}/setup.py
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

 setup(
     name="cobweb-launcher",
-    version="1.3.12",
+    version="1.3.13",
     packages=find_packages(),
     url="https://github.com/Juannie-PP/cobweb",
     license="MIT",