cobweb-launcher 1.0.0__py3-none-any.whl → 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of cobweb-launcher might be problematic. Click here for more details.

Files changed (45) hide show
  1. cobweb/__init__.py +1 -1
  2. cobweb/launchers/launcher.py +1 -2
  3. {cobweb_launcher-1.0.0.dist-info → cobweb_launcher-1.0.2.dist-info}/METADATA +1 -1
  4. cobweb_launcher-1.0.2.dist-info/RECORD +32 -0
  5. cobweb/bbb.py +0 -191
  6. cobweb/config.py +0 -164
  7. cobweb/db/oss_db.py +0 -128
  8. cobweb/db/scheduler/__init__.py +0 -1
  9. cobweb/db/scheduler/default.py +0 -8
  10. cobweb/db/scheduler/textfile.py +0 -27
  11. cobweb/db/storer/__init__.py +0 -1
  12. cobweb/db/storer/console.py +0 -9
  13. cobweb/db/storer/loghub.py +0 -54
  14. cobweb/db/storer/redis.py +0 -15
  15. cobweb/db/storer/textfile.py +0 -15
  16. cobweb/decorators.py +0 -16
  17. cobweb/distributed/__init__.py +0 -0
  18. cobweb/distributed/launcher.py +0 -243
  19. cobweb/distributed/models.py +0 -143
  20. cobweb/equip/__init__.py +0 -8
  21. cobweb/equip/dev/__init__.py +0 -0
  22. cobweb/equip/dev/launcher.py +0 -202
  23. cobweb/equip/dev/models.py +0 -156
  24. cobweb/equip/distributed/__init__.py +0 -0
  25. cobweb/equip/distributed/launcher.py +0 -219
  26. cobweb/equip/distributed/models.py +0 -158
  27. cobweb/equip/download/__init__.py +0 -0
  28. cobweb/equip/download/launcher.py +0 -203
  29. cobweb/equip/download/models.py +0 -169
  30. cobweb/equip/single/__init__.py +0 -0
  31. cobweb/equip/single/launcher.py +0 -203
  32. cobweb/equip/single/models.py +0 -173
  33. cobweb/interface.py +0 -34
  34. cobweb/log.py +0 -96
  35. cobweb/new.py +0 -20
  36. cobweb/single/__init__.py +0 -0
  37. cobweb/single/launcher.py +0 -231
  38. cobweb/single/models.py +0 -134
  39. cobweb/single/nest.py +0 -153
  40. cobweb/task.py +0 -61
  41. cobweb/utils.py +0 -90
  42. cobweb_launcher-1.0.0.dist-info/RECORD +0 -69
  43. {cobweb_launcher-1.0.0.dist-info → cobweb_launcher-1.0.2.dist-info}/LICENSE +0 -0
  44. {cobweb_launcher-1.0.0.dist-info → cobweb_launcher-1.0.2.dist-info}/WHEEL +0 -0
  45. {cobweb_launcher-1.0.0.dist-info → cobweb_launcher-1.0.2.dist-info}/top_level.txt +0 -0
cobweb/single/nest.py DELETED
@@ -1,153 +0,0 @@
1
- import time
2
- import threading
3
-
4
- from equip.single import Seed, DBItem
5
- from equip.single import struct_queue_name, restore_table_name
6
- from equip.single import Distributor, Scheduler, Spider, Storer
7
-
8
-
9
def init_task_seed(seeds):
    """Yield a ``Seed`` for each start seed that was supplied.

    A list or tuple produces one ``Seed`` per element; a single str or dict
    produces exactly one ``Seed``. Falsy input and any other type yield
    nothing.

    :param seeds: a list/tuple of seed values, a single str/dict, or nothing
    :return: generator of ``Seed`` objects
    """
    if seeds:
        if isinstance(seeds, (list, tuple)):
            for entry in seeds:
                yield Seed(entry)
        elif isinstance(seeds, (str, dict)):
            yield Seed(seeds)
17
-
18
-
19
def parse_storer_info(storer_info):
    """Group storer descriptors by the name of their DB class.

    :param storer_info: a single object whose class is named ``StorerInfo``,
        or a tuple/list of such objects; anything else yields an empty result
    :return: dict mapping ``DB.__name__`` to
        ``{"StorerDB": <DB class>, "db_args_list": [<descriptor[1:]>, ...]}``
    """
    if storer_info.__class__.__name__ == 'StorerInfo':
        descriptors = [storer_info]
    elif isinstance(storer_info, (tuple, list)):
        descriptors = storer_info
    else:
        descriptors = []

    grouped = {}
    for descriptor in descriptors:
        backend = descriptor.DB
        bucket = grouped.setdefault(
            backend.__name__, {"StorerDB": backend, "db_args_list": []}
        )
        # Everything after the first field is kept as the storer's arguments.
        bucket["db_args_list"].append(descriptor[1:])
    return grouped
31
-
32
-
33
def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
    """Watch the pipeline and signal shutdown once everything has drained.

    Every 3 seconds the scheduler's ``stop`` flag, the seed queue length and
    the spider's in-progress length are inspected. When the scheduler has
    stopped and both lengths are zero, ``last_event`` is set and, after a
    further 10 seconds, the storer queues are checked. If any storer queue
    still has items, ``last_event`` is cleared and polling resumes; otherwise
    the loop ends (leaving ``last_event`` set) and ``stop_event`` is set.

    :param stop_event: event set when monitoring finishes
    :param last_event: event set while the storers are asked to drain
    :param distributor: object exposing ``seed_queue.length``
    :param scheduler: object exposing ``stop``
    :param spider: object exposing ``spider_in_progress.length``
    :param storer_list: storers, each exposing ``queue.length``
    """
    drained = False
    while not drained:
        time.sleep(3)
        upstream_busy = (
            not scheduler.stop
            or distributor.seed_queue.length
            or spider.spider_in_progress.length
        )
        if upstream_busy:
            continue

        last_event.set()
        time.sleep(10)
        if any(storer.queue.length for storer in storer_list):
            # Storers still hold data: withdraw the signal and keep polling.
            last_event.clear()
        else:
            drained = True
    stop_event.set()
52
-
53
-
54
def cobweb(task):
    """
    Task launch decorator.

    Applying the returned decorator to a crawl callback builds the
    distributor, scheduler, spider and storers described by ``task`` and
    starts their worker threads straight away.

    :param task: task configuration; ``SchedulerInfo``, ``storer_info``,
        ``start_seed`` and ``spider_num`` are read from it
    :return: a decorator that accepts the crawl callback
    """
    def decorator(func):
        """
        Start the pipeline around ``func`` and return ``func`` unchanged.

        ``func`` is handed to every spider thread together with ``item``, a
        namespace class holding one ``DBItem`` subclass per storer backend.
        Intended callback shape (per the original notes)::

            func(Item, seed)
            Item:
                Item.Textfile()
                Item.Console()

        :param func: crawl callback supplied by the user
        :return: ``func`` itself, so the decorated name stays usable
        """
        # project task_name start_seed spider_num queue_length scheduler_info storer_info

        storer_list = []

        # Event marking the final phase of the run
        last_event = threading.Event()
        # Event used to pause/stop crawling
        stop_event = threading.Event()

        # Create the distributor
        distributor = Distributor()

        # Build the scheduler class dynamically from Scheduler + the configured DB
        SchedulerDB, table, sql, length, size = task.SchedulerInfo
        SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})

        # Initialise the scheduler
        scheduler = SchedulerTmp(
            table=table, sql=sql, length=length, size=size,
            queue=distributor.seed_queue,
        )

        # Initialise the spider
        spider = Spider(queue=distributor.seed_queue)

        # Parse the storer configuration
        storer_data = parse_storer_info(task.storer_info)

        # Namespace class that collects one item class per storer backend
        item = type("item", (object,), {})
        for db_name, storer_conf in storer_data.items():
            # Build the storer class dynamically from Storer + the backend DB
            StorerDB = storer_conf["StorerDB"]
            StorerTmp = type('Storer', (Storer, StorerDB), {})
            for storer_db_args in storer_conf["db_args_list"]:
                table, fields, length = storer_db_args
                if not getattr(item, db_name, None):
                    instance = type(db_name, (DBItem,), {})
                    setattr(item, db_name, instance)
                # Register this table and its fields on the backend's item class
                getattr(item, db_name).init_item(table, fields)
                # Create the storage queue
                storer_queue = struct_queue_name(db_name, table)
                distributor.create_queue(queue_name=storer_queue)
                queue = distributor.get_queue(queue_name=storer_queue)
                # Initialise the storer
                table_name = restore_table_name(table_name=table)
                storer = StorerTmp(table=table_name, fields=fields, length=length, queue=queue)
                storer_list.append(storer)

        # Push the initial seeds
        distributor.distribute(init_task_seed, seeds=task.start_seed)

        # Start the scheduler
        threading.Thread(
            target=scheduler.schedule_task,
            args=(distributor.distribute,),
            name="single_scheduler_task"
        ).start()

        # Start the spiders
        for index in range(task.spider_num):
            threading.Thread(
                target=spider.spider_task,
                args=(stop_event, distributor.distribute, func, item),
                name=f"single_spider_task:{index}"
            ).start()

        # Start the storers
        for storer in storer_list:
            threading.Thread(
                target=storer.store_task,
                args=(stop_event, last_event, distributor.distribute),
                name=f"single_store_task:{storer.table}",
            ).start()

        # Start the monitor that decides when the run is finished
        threading.Thread(
            target=check, name="check",
            args=(
                stop_event, last_event, distributor,
                scheduler, spider, storer_list
            )
        ).start()

        # Without this the decorated name would be rebound to None.
        return func

    return decorator
150
-
151
-
152
-
153
-
cobweb/task.py DELETED
@@ -1,61 +0,0 @@
1
- import os
2
- from .constant import *
3
- from .utils import parse_info, struct_start_seeds
4
-
5
-
6
def init_task_env():
    """Populate ``Setting`` from environment variables, with defaults.

    Integer and float settings are converted from the environment value (or
    the default when the variable is unset); the two model settings are
    stored as returned by ``os.getenv``.
    """
    def env_int(key, fallback):
        return int(os.getenv(key, fallback))

    def env_float(key, fallback):
        return float(os.getenv(key, fallback))

    Setting.RESET_SCORE = env_int("RESET_SCORE", 600)
    Setting.CHECK_LOCK_TIME = env_int("CHECK_LOCK_TIME", 30)
    Setting.DEAL_MODEL = os.getenv("DEAL_MODEL", DealModel.failure)
    Setting.LAUNCHER_MODEL = os.getenv("LAUNCHER_MODEL", LauncherModel.task)
    Setting.SCHEDULER_WAIT_TIME = env_float("SCHEDULER_WAIT_TIME", 5)
    Setting.SCHEDULER_BLOCK_TIME = env_float("SCHEDULER_BLOCK_TIME", 3)
    Setting.SPIDER_WAIT_TIME = env_float("SPIDER_WAIT_TIME", 3)
    Setting.SPIDER_SLEEP_TIME = env_float("SPIDER_SLEEP_TIME", 0.5)
15
-
16
-
17
class Task:
    """Container for the settings that describe one crawl task."""

    def __init__(
            self,
            seeds=None,
            project=None,
            task_name=None,
            oss_config=None,
            redis_info=None,
            storer_info=None,
            scheduler_info=None,
            spider_num=None,
            max_retries=None,
            storer_queue_length=None,
            scheduler_queue_length=None,
    ):
        """
        Load the environment-driven settings, then normalise and store the
        arguments. Falsy values fall back to the defaults listed below.

        :param seeds: start seeds, converted by ``struct_start_seeds``
        :param project: project name, default ``"test"``
        :param task_name: task name, default ``"spider"``
        :param oss_config: stored as given
        :param redis_info: normalised by ``parse_info``
        :param storer_info: normalised by ``parse_info``
        :param scheduler_info: dict(DB="", table="", size="", config=""),
            normalised by ``parse_info``
        :param spider_num: default 1
        :param max_retries: default 5
        :param storer_queue_length: default 100
        :param scheduler_queue_length: default 100
        """
        init_task_env()

        self.seeds = struct_start_seeds(seeds)
        self.project = project if project else "test"
        self.task_name = task_name if task_name else "spider"

        self.oss_config = oss_config

        self.redis_info = parse_info(redis_info)
        self.storer_info = parse_info(storer_info)
        self.scheduler_info = parse_info(scheduler_info)

        self.spider_num = spider_num if spider_num else 1
        self.max_retries = max_retries if max_retries else 5
        self.storer_queue_length = (
            storer_queue_length if storer_queue_length else 100
        )
        self.scheduler_queue_length = (
            scheduler_queue_length if scheduler_queue_length else 100
        )
61
-
cobweb/utils.py DELETED
@@ -1,90 +0,0 @@
1
- import json
2
- import re
3
- import sys
4
- from abc import ABC
5
- from typing import Iterable
6
- from importlib import import_module
7
-
8
-
9
def struct_table_name(table_name):
    """Encode a table name: ``.`` becomes ``__p__`` and ``:`` becomes ``__c__``."""
    without_dots = table_name.replace(".", "__p__")
    encoded = without_dots.replace(":", "__c__")
    return encoded
11
-
12
-
13
def restore_table_name(table_name):
    """Decode a table name: ``__p__`` becomes ``.`` and ``__c__`` becomes ``:``.

    The ``__p__`` pass runs first and the ``__c__`` pass second; the order is
    significant for inputs where the two markers overlap.
    """
    with_dots = table_name.replace("__p__", ".")
    decoded = with_dots.replace("__c__", ":")
    return decoded
15
-
16
-
17
def struct_queue_name(db_name, table_name):
    """Return the interned queue name ``__<db_name>_<table_name>_queue__``."""
    queue_name = "__{}_{}_queue__".format(db_name, table_name)
    return sys.intern(queue_name)
19
-
20
-
21
def parse_info(info):
    """Normalise configuration info into dicts.

    :param info: a dict, a JSON string, or an iterable of dicts / JSON strings
    :return: falsy input and dicts are returned unchanged; a string is decoded
        with ``json.loads``; an iterable becomes a list in which every string
        element has been decoded; any other type yields ``None``
    :raises TypeError: if an element of the iterable is neither str nor dict
    """
    def decode(entry):
        if isinstance(entry, str):
            return json.loads(entry)
        if isinstance(entry, dict):
            return entry
        raise TypeError("must be in [str, dict]")

    if not info or isinstance(info, dict):
        return info
    if isinstance(info, str):
        return json.loads(info)
    if isinstance(info, Iterable):
        return [decode(entry) for entry in info]
    return None
42
-
43
-
44
def struct_start_seeds(seeds):
    """Wrap the start seeds in ``Seed`` objects.

    :param seeds: a list/tuple of seed values, or a single str/dict
    :return: a list of ``Seed`` for a list/tuple, a single ``Seed`` for a
        str/dict, and ``None`` for falsy input or any other type
    """
    from .bbb import Seed

    if not seeds:
        return None
    if isinstance(seeds, (list, tuple)):
        return [Seed(entry) for entry in seeds]
    if isinstance(seeds, (str, dict)):
        return Seed(seeds)
    return None
52
-
53
-
54
def issubclass_cobweb_inf(_class, inf_name):
    """Tell whether any ancestor of ``_class`` is named ``inf_name``.

    Only the classes after ``_class`` itself in its MRO are compared, and the
    comparison is by class name, not identity.

    :param _class: class to inspect
    :param inf_name: class name to look for among the ancestors
    :return: ``True`` if an ancestor has that name, else ``False``
    """
    ancestors = _class.__mro__[1:]
    return any(base.__name__ == inf_name for base in ancestors)
59
-
60
-
61
def parse_import_model(model_info, model_type=None):
    """Resolve a scheduler/storer model reference to a class.

    ``model_info`` may be:

    * an import statement, ``"from package.module import ClassName"``;
    * a dotted path, ``"package.module.ClassName"``;
    * a bare name such as ``"redis"``, resolved to
      ``cobweb.db.<model_type>.<name lower-cased>`` with the class name being
      the capitalised bare name;
    * a class derived from ``ABC`` that has an ancestor named
      ``SchedulerInterface`` / ``StorerInterface`` (matching ``model_type``).

    :param model_info: string reference or class, as described above
    :param model_type: ``"scheduler"`` or ``"storer"``
    :return: tuple ``(class_object, class_name)``
    :raises TypeError: if ``model_type`` is invalid or ``model_info`` is of an
        unsupported kind
    :raises ImportError: if a module cannot be imported, or a class does not
        derive from the expected interface
    """
    if model_type not in ["scheduler", "storer"]:
        raise TypeError("model_type must be in scheduler, storer")

    if isinstance(model_info, str):
        # Match the statement form first. Testing for the bare substring
        # "import" would also catch dotted paths such as "pkg.importer.Cls"
        # and then crash on a failed regex match.
        statement = re.search(r"from (.*?) import (.*?)$", model_info)
        if statement:
            model_path, class_name = statement.groups()
        elif "." in model_info:
            model_path, _, class_name = model_info.rpartition(".")
        else:
            model_path = f"cobweb.db.{model_type}.{model_info.lower()}"
            class_name = model_info.capitalize()
        model = import_module(model_path)
        class_object = getattr(model, class_name)
        return class_object, class_name
    elif issubclass(model_info, ABC):
        inf_name = model_type.capitalize() + "Interface"
        if issubclass_cobweb_inf(model_info, inf_name):
            return model_info, model_info.__name__
        raise ImportError(
            f"{model_info.__name__} does not derive from {inf_name}"
        )
    raise TypeError("model_info must be a str or a class derived from ABC")
89
-
90
-
@@ -1,69 +0,0 @@
1
- cobweb/__init__.py,sha256=BJu97N8hZsP9aBkMFMhbLFMuFv7JOJvGKnE9NIvcFJo,44
2
- cobweb/bbb.py,sha256=zKeCeBVFQfhEv6M8CCktUTM7tXDZmAu6ZN0-ET44pUY,5707
3
- cobweb/config.py,sha256=iWrep4vW9UyUi3hvgUWb4RL2IpEwpo_ttY0EUDKWN4g,5362
4
- cobweb/constant.py,sha256=Aw2ES_nohVRLTWylZp6WMiUAlgyw4kLbae7LpwdZ5y4,1867
5
- cobweb/decorators.py,sha256=eYQI9rddPVJihAlomLTmbtQhIOzPw8dCrOFpxAq2pLY,318
6
- cobweb/interface.py,sha256=um_k2AAQl1HTOvfUlq914DjkpfZVwt2m1B65EpPKrmE,802
7
- cobweb/log.py,sha256=Gb3_y4IzTo5pJohTggBCU9rK6-ZN3hgTOHkoXHyN6CU,2384
8
- cobweb/new.py,sha256=-ZHoLZE4ZGRMBU5YWmWHMfLn2ipuTKEaCTO1pU2fX5o,303
9
- cobweb/setting.py,sha256=T693DAwLFLs9P6ZEvugP99UzXn-8PLeMEgdxRmL6cb4,1955
10
- cobweb/task.py,sha256=SyWC43C7hqQAqH-1HECXEEgp_6L6lwDhYe1kZNnHUvA,2006
11
- cobweb/utils.py,sha256=ivmRqJJNtwdOKYT4G7qQCWnL8ar9c-shxeDZzGB2E9c,2651
12
- cobweb/base/__init__.py,sha256=diiK5MygQaWjlWNLbW6eUIg-93O6glMGC9WLNM5jyOc,209
13
- cobweb/base/common_queue.py,sha256=W7PPZZFl52j3Mc916T0imHj7oAUelA6aKJwW-FecDPE,872
14
- cobweb/base/decorators.py,sha256=wDCaQ94aAZGxks9Ljc0aXq6omDXT1_yzFy83ZW6VbVI,930
15
- cobweb/base/item.py,sha256=pMriHStzUXtSvIf5Z3KXsP-bCvjlG1gM3z33wWeuoH8,966
16
- cobweb/base/log.py,sha256=L01hXdk3L2qEm9X1FOXQ9VmWIoHSELe0cyZvrdAN61A,2003
17
- cobweb/base/request.py,sha256=b08AtUSZjlFLEFIEw5uGS__FjU6QSldL20-UjZD0LbI,2128
18
- cobweb/base/response.py,sha256=7h9TwCNqRlwM_fvNmid9zOoRfHbKB8ABSU0eaVUJdVo,405
19
- cobweb/base/seed.py,sha256=XswH16eEd6iwIBpt71E2S_AsV5UVCcOEOBFoP0r5QRo,2900
20
- cobweb/crawlers/__init__.py,sha256=1sMhQ0-NJxiff3IqF2aMCXkSXcJFzzoCKIayQ5go4aI,71
21
- cobweb/crawlers/base_crawler.py,sha256=ZIdmlvL4f41yV7T77F9IhBBxRt1FH-LFm0BmIXAvP8I,4881
22
- cobweb/crawlers/file_crawler.py,sha256=VVOZ38qNAUPyNDspu3P7-zzDtrUtqefYLjOMnb_-JOw,9685
23
- cobweb/db/__init__.py,sha256=ut0iEyBLjcJL06WNG_5_d4hO5PJWvDrKWMkDOdmgh2M,30
24
- cobweb/db/oss_db.py,sha256=59HwMMfoGUr6l_BI4p84YHYoQwEn22cCM_1GZK34Uo4,4244
25
- cobweb/db/redis_db.py,sha256=MahFsAyub9OFG-xuZU6Qq0AfWV1lASFAI34g2FRGpq8,4235
26
- cobweb/db/scheduler/__init__.py,sha256=w5uIGEB1wLJ-H9RqGpzRwOEWW-BBVSk6Cc7FxZIlWCs,51
27
- cobweb/db/scheduler/default.py,sha256=XDtxNyu5KTpVAbfCOW8mR1zNFNHiMuaQ4sAhZuIYBoM,79
28
- cobweb/db/scheduler/textfile.py,sha256=P5pk75DUnbXbLNPOaMIbHh2lbwBGBlv0mitX58yK-MU,786
29
- cobweb/db/storer/__init__.py,sha256=yWUVyq8JLpuUDPnUC0igw3P8Kkw_FqNi0aAoxkMkRmc,49
30
- cobweb/db/storer/console.py,sha256=096JTALYuB_I3Qy5TjN40yEPeugO_pmqHN9VJu7wD7Y,153
31
- cobweb/db/storer/loghub.py,sha256=4ImSIpHPNU7Djp72HlUGOd2h5c9gIxGzBKL1jJ3KPkM,1702
32
- cobweb/db/storer/redis.py,sha256=7Q2XEQwBL6X_M1uvxzzuSBt6iw9piKw-_FWKm2INZDQ,412
33
- cobweb/db/storer/textfile.py,sha256=auoXGXLbIbEhMoeYIhy58qw22N2r0fQTtzVjHCjqVGA,386
34
- cobweb/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
35
- cobweb/distributed/launcher.py,sha256=jTtBXBmna_6yFdj6gyGQiiEtg8I0g5uI5h8kbHWt454,7998
36
- cobweb/distributed/models.py,sha256=PUQokXMGD-H4A99nX7qYA395Ul6IsWGruMTVa05nswY,4568
37
- cobweb/equip/__init__.py,sha256=LWhbrTnG9kD1et0D40EzLISPuE0PIS-5WD3y3CLDaWk,247
38
- cobweb/equip/dev/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
- cobweb/equip/dev/launcher.py,sha256=KRsw7yxklvFM85cel-EyLsNPLyrC9Hd26BMSx6-4Hac,6785
40
- cobweb/equip/dev/models.py,sha256=w3LQEhTrgqoYZn5v9TiEK2A68xuC7QH7suRP9OYnoOg,4813
41
- cobweb/equip/distributed/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- cobweb/equip/distributed/launcher.py,sha256=1LzxibGXWR20XpXawakiRpEMaa9yfaj2rFSKnmEwjFc,7475
43
- cobweb/equip/distributed/models.py,sha256=qTGzxLdb2arsZSZK2HE4-MrqhraUhc2Ol5wBvlv_aWA,5008
44
- cobweb/equip/download/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
- cobweb/equip/download/launcher.py,sha256=lZt4WNar0_QQjUSDhaJnxvHDiuQReXeFxquVgJjI5T4,6824
46
- cobweb/equip/download/models.py,sha256=USwa48PUZ9_J8qwmEoGu4Oq7-oo-xWxL0k5j8q8PoDw,5305
47
- cobweb/equip/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
48
- cobweb/equip/single/launcher.py,sha256=lZt4WNar0_QQjUSDhaJnxvHDiuQReXeFxquVgJjI5T4,6824
49
- cobweb/equip/single/models.py,sha256=mfSmYBUkX-QuuyuajkIw6EI0NYhlbqrCM88YGP7dHUw,5590
50
- cobweb/exceptions/__init__.py,sha256=E9SHnJBbhD7fOgPFMswqyOf8SKRDrI_i25L0bSpohvk,32
51
- cobweb/exceptions/oss_db_exception.py,sha256=iP_AImjNHT3-Iv49zCFQ3rdLnlvuHa3h2BXApgrOYpA,636
52
- cobweb/launchers/__init__.py,sha256=qwlkEJVri7dvCgi45aX3lqAmQS0HrPicAipDvH75kew,69
53
- cobweb/launchers/launcher.py,sha256=vqRqaBEg04mX6F-EGuoAdeCA163sSZk45mNmmQ2LWUA,5654
54
- cobweb/launchers/launcher_pro.py,sha256=GlDpyP1XAY2bX5SuSBn3920D5OKNigQgLnJfu6QOmPw,6760
55
- cobweb/pipelines/__init__.py,sha256=xanY-Z1d7zRR5JhCdW2htzrAywnKBkigiaUlTFa6of0,80
56
- cobweb/pipelines/base_pipeline.py,sha256=fYnWf79GmhufXpcnMa3te18SbmnVeYLwxfyo-zLd9CY,1577
57
- cobweb/pipelines/loghub_pipeline.py,sha256=roQ8gLunvuPc4KOMIATN1nKvjOXrc_RSyzXq8YY9ZBU,1015
58
- cobweb/single/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
59
- cobweb/single/launcher.py,sha256=IoJbn87j7t7Pib_FxoWZmmX8asXOqNGb-9ospw6EYJI,7302
60
- cobweb/single/models.py,sha256=wIEV35666lxdzqjDqBHPjjh-r6zD0x24rtQYz7d4Oxw,4332
61
- cobweb/single/nest.py,sha256=49K6KQ934INfPrWQsrq9rIFpQauLbLGOFbDaHvoQzOk,5015
62
- cobweb/utils/__init__.py,sha256=JTE4sBfHnKHhD6w9Auk0MIT7O9BMOamCeryhlHNx3Zg,47
63
- cobweb/utils/oss.py,sha256=8QlVVhXv3GBk53_616MOjbDxgD7c0ooJ0X28BE9pw-M,3220
64
- cobweb/utils/tools.py,sha256=8oLxkzwaYcDqKXXuLI3A_lNabyLBr7HSPgTF6x4xbnk,1239
65
- cobweb_launcher-1.0.0.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
66
- cobweb_launcher-1.0.0.dist-info/METADATA,sha256=aLZS3GZb0OACP69bVQXj1mOkTT3fRYNxiEB2e2G6GrE,1245
67
- cobweb_launcher-1.0.0.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
68
- cobweb_launcher-1.0.0.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
69
- cobweb_launcher-1.0.0.dist-info/RECORD,,