cobweb-launcher 0.1.2__tar.gz → 0.1.4__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/PKG-INFO +1 -1
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/__init__.py +1 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/redis_db.py +5 -3
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/launcher.py +14 -51
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/models.py +7 -3
- cobweb-launcher-0.1.4/cobweb/single/launcher.py +231 -0
- cobweb-launcher-0.1.4/cobweb/single/models.py +136 -0
- cobweb-launcher-0.1.4/cobweb/utils.py +90 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/PKG-INFO +1 -1
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/SOURCES.txt +1 -1
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/setup.py +1 -1
- cobweb-launcher-0.1.2/cobweb/single/models.py +0 -104
- cobweb-launcher-0.1.2/cobweb/single/nest.py +0 -153
- cobweb-launcher-0.1.2/cobweb/utils.py +0 -88
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/LICENSE +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/README.md +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/bbb.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/oss_db.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/default.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/scheduler/textfile.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/console.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/loghub.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/redis.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/db/storer/textfile.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/decorators.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/distributed/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/interface.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/log.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/setting.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/single/__init__.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb/task.py +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/requires.txt +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/top_level.txt +0 -0
- {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/setup.cfg +0 -0
| @@ -151,17 +151,19 @@ class RedisDB: | |
| 151 151 | 
             
                @check_redis_status
         | 
| 152 152 | 
             
                def check_spider_queue(self, stop, storer_num):
         | 
| 153 153 | 
             
                    while not stop.is_set():
         | 
| 154 | 
            -
                        # 每15s获取check锁,等待600s | 
| 154 | 
            +
                        # 每15s获取check锁,等待600s后仍获取不到锁则重试;获取到锁后,设置锁的存活时间为${cs_lct}s
         | 
| 155 155 | 
             
                        if self._get_lock(key=self.check_lock, t=self.cs_lct, timeout=600, sleep_time=3):
         | 
| 156 156 | 
             
                            heartbeat = True if self.client.exists(self.heartbeat_key) else False
         | 
| 157 | 
            -
                            # 重启重制score | 
| 157 | 
            +
                            # 重启重制score值,否则获取${rs_time}分钟前的分数值
         | 
| 158 158 | 
             
                            score = -int(time.time()) + self.rs_time if heartbeat else "-inf"
         | 
| 159 159 |  | 
| 160 160 | 
             
                            keys = self.client.keys(self.storer_key % "*")
         | 
| 161 | 
            +
             | 
| 161 162 | 
             
                            if keys and len(keys) >= storer_num:
         | 
| 162 163 | 
             
                                intersection_key = self.storer_key % "intersection"
         | 
| 163 164 | 
             
                                self.client.delete(intersection_key)
         | 
| 164 165 | 
             
                                self.client.zinterstore(intersection_key, keys)
         | 
| 166 | 
            +
             | 
| 165 167 | 
             
                                while True:
         | 
| 166 168 | 
             
                                    members = self.client.zrange(intersection_key, 0, 1999)
         | 
| 167 169 | 
             
                                    if not members:
         | 
| @@ -192,7 +194,7 @@ class RedisDB: | |
| 192 194 | 
             
                                self.client.setex(self.heartbeat_key, 15, "")
         | 
| 193 195 |  | 
| 194 196 | 
             
                            # self.client.delete(self.check_lock)
         | 
| 195 | 
            -
                            time.sleep(3)
         | 
| 197 | 
            +
                            # time.sleep(3)
         | 
| 196 198 |  | 
| 197 199 | 
             
                @check_redis_status
         | 
| 198 200 | 
             
                def set_heartbeat(self, stop):
         | 
| @@ -1,50 +1,15 @@ | |
| 1 1 | 
             
            import time
         | 
| 2 2 | 
             
            import threading
         | 
| 3 3 | 
             
            from threading import Thread
         | 
| 4 | 
            -
            from importlib import import_module
         | 
| 5 4 |  | 
| 6 | 
            -
            from cobweb import log, Queue, DBItem, RedisDB, OssDB, StorerInterface
         | 
| 7 | 
            -
            from cobweb.utils import struct_queue_name, restore_table_name
         | 
| 8 | 
            -
            from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
         | 
| 9 5 | 
             
            from .models import Scheduler, Spider, Storer
         | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 12 | 
            -
             | 
| 13 | 
            -
                 | 
| 14 | 
            -
             | 
| 15 | 
            -
             | 
| 16 | 
            -
             | 
| 17 | 
            -
                        obj = getattr(model, db)
         | 
| 18 | 
            -
                    else:
         | 
| 19 | 
            -
                        model = import_module(f"cobweb.db.scheduler.{db.lower()}")
         | 
| 20 | 
            -
                        obj = getattr(model, db.capitalize())
         | 
| 21 | 
            -
                    return obj
         | 
| 22 | 
            -
                    # if db.lower() in dir(StorerDB):
         | 
| 23 | 
            -
                    #     return getattr(StorerDB, db)
         | 
| 24 | 
            -
                    # else:
         | 
| 25 | 
            -
                    #     pass
         | 
| 26 | 
            -
                elif issubclass(db, StorerInterface):
         | 
| 27 | 
            -
                    return db
         | 
| 28 | 
            -
                raise TypeError()
         | 
| 29 | 
            -
             | 
| 30 | 
            -
             | 
| 31 | 
            -
            def get_storer_db(db):
         | 
| 32 | 
            -
                if isinstance(db, str):
         | 
| 33 | 
            -
                    if "." in db:
         | 
| 34 | 
            -
                        model_path = db.split(".")
         | 
| 35 | 
            -
                        model = import_module(db)
         | 
| 36 | 
            -
                        obj = getattr(model, db)
         | 
| 37 | 
            -
                    else:
         | 
| 38 | 
            -
                        model = import_module(f"cobweb.db.storer.{db.lower()}")
         | 
| 39 | 
            -
                        obj = getattr(model, db.capitalize())
         | 
| 40 | 
            -
                    return obj, db.lower()
         | 
| 41 | 
            -
                    # if db.lower() in dir(StorerDB):
         | 
| 42 | 
            -
                    #     return getattr(StorerDB, db)
         | 
| 43 | 
            -
                    # else:
         | 
| 44 | 
            -
                    #     pass
         | 
| 45 | 
            -
                elif issubclass(db, StorerInterface):
         | 
| 46 | 
            -
                    return db, db.__name__.lower()
         | 
| 47 | 
            -
                raise TypeError()
         | 
| 6 | 
            +
            from cobweb import log, Queue, DBItem, RedisDB
         | 
| 7 | 
            +
            from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
         | 
| 8 | 
            +
            from cobweb.utils import (
         | 
| 9 | 
            +
                struct_queue_name as sqn,
         | 
| 10 | 
            +
                restore_table_name as rtn,
         | 
| 11 | 
            +
                parse_import_model as pim,
         | 
| 12 | 
            +
            )
         | 
| 48 13 |  | 
| 49 14 |  | 
| 50 15 | 
             
            def check(stop, last, spider, scheduler, storer_list, ready_seed_length, spider_queue_length):
         | 
| @@ -164,9 +129,9 @@ def launcher(task): | |
| 164 129 | 
             
                    size = task.scheduler_info.get("size")
         | 
| 165 130 | 
             
                    scheduler_config = task.scheduler_info.get("config")
         | 
| 166 131 | 
             
                    scheduler_db = task.scheduler_info.get("db", "default")
         | 
| 167 | 
            -
                    DB =  | 
| 132 | 
            +
                    DB, class_name = pim(scheduler_db, "scheduler")
         | 
| 168 133 | 
             
                    # SchedulerDB, table, sql, length, size, config = task.scheduler_info
         | 
| 169 | 
            -
                    SchedulerTmp = type( | 
| 134 | 
            +
                    SchedulerTmp = type(class_name, (Scheduler, DB), {})
         | 
| 170 135 |  | 
| 171 136 | 
             
                    # 初始化调度器
         | 
| 172 137 | 
             
                    scheduler = SchedulerTmp(
         | 
| @@ -185,18 +150,16 @@ def launcher(task): | |
| 185 150 | 
             
                    # new item
         | 
| 186 151 | 
             
                    item = type("Item", (object,), {"redis_client": redis_db.client})()
         | 
| 187 152 |  | 
| 188 | 
            -
                    if task.oss_config:
         | 
| 189 | 
            -
                        item.oss = OssDB(**task.oss_config)
         | 
| 190 | 
            -
             | 
| 191 153 | 
             
                    for storer_info in storer_info_list:
         | 
| 192 154 | 
             
                        storer_db = storer_info["db"]
         | 
| 193 155 | 
             
                        fields = storer_info["fields"]
         | 
| 194 156 | 
             
                        storer_table = storer_info.get("table", "console")
         | 
| 195 157 | 
             
                        storer_config = storer_info.get("config")
         | 
| 196 158 |  | 
| 197 | 
            -
                        StorerDB,  | 
| 198 | 
            -
                        StorerTmp = type( | 
| 159 | 
            +
                        StorerDB, class_name = pim(storer_db, "storer")
         | 
| 160 | 
            +
                        StorerTmp = type(class_name, (Storer, StorerDB), {})
         | 
| 199 161 |  | 
| 162 | 
            +
                        db_name = class_name.lower()
         | 
| 200 163 | 
             
                        if not getattr(item, db_name, None):
         | 
| 201 164 | 
             
                            instance = type(db_name, (DBItem,), {})
         | 
| 202 165 | 
             
                            setattr(item, db_name, instance)
         | 
| @@ -204,10 +167,10 @@ def launcher(task): | |
| 204 167 | 
             
                        storer_item_instance = getattr(item, db_name)
         | 
| 205 168 | 
             
                        storer_item_instance.init_item(storer_table, fields)
         | 
| 206 169 |  | 
| 207 | 
            -
                        storer_queue =  | 
| 170 | 
            +
                        storer_queue = sqn(db_name, storer_table)
         | 
| 208 171 | 
             
                        queue = getattr(storer_item_instance, storer_queue)
         | 
| 209 172 | 
             
                        # 初始话存储器
         | 
| 210 | 
            -
                        table_name =  | 
| 173 | 
            +
                        table_name = rtn(table_name=storer_table)
         | 
| 211 174 | 
             
                        storer = StorerTmp(
         | 
| 212 175 | 
             
                            table=table_name, fields=fields,
         | 
| 213 176 | 
             
                            length=task.storer_queue_length,
         | 
| @@ -1,6 +1,8 @@ | |
| 1 1 | 
             
            import time
         | 
| 2 2 | 
             
            from hashlib import md5
         | 
| 3 | 
            -
            from cobweb import log, Queue, Seed | 
| 3 | 
            +
            from cobweb import log, Queue, Seed
         | 
| 4 | 
            +
            from utils import issubclass_cobweb_interface
         | 
| 5 | 
            +
             | 
| 4 6 | 
             
            # from pympler import asizeof
         | 
| 5 7 |  | 
| 6 8 |  | 
| @@ -8,7 +10,8 @@ class Scheduler: | |
| 8 10 |  | 
| 9 11 | 
             
                def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
         | 
| 10 12 |  | 
| 11 | 
            -
                     | 
| 13 | 
            +
                    inf_name = "SchedulerInterface"
         | 
| 14 | 
            +
                    if not issubclass_cobweb_interface(self.__class__, inf_name):
         | 
| 12 15 | 
             
                        raise Exception("not have schedule function!")
         | 
| 13 16 |  | 
| 14 17 | 
             
                    if self.__class__.__name__ == "Default":
         | 
| @@ -103,7 +106,8 @@ class Storer: | |
| 103 106 |  | 
| 104 107 | 
             
                def store_task(self, stop, last, reset_seed, set_storer):
         | 
| 105 108 |  | 
| 106 | 
            -
                     | 
| 109 | 
            +
                    inf_name = "StorerInterface"
         | 
| 110 | 
            +
                    if not issubclass_cobweb_interface(self.__class__, inf_name):
         | 
| 107 111 | 
             
                        return None
         | 
| 108 112 |  | 
| 109 113 | 
             
                    if not getattr(self, "store", None):
         | 
| @@ -0,0 +1,231 @@ | |
| 1 | 
            +
            import time
         | 
| 2 | 
            +
            import threading
         | 
| 3 | 
            +
            from threading import Thread
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            from .models import Scheduler, Spider, Storer
         | 
| 6 | 
            +
            from cobweb import log, Queue, DBItem, RedisDB
         | 
| 7 | 
            +
            from cobweb.setting import MODEL, RESET_SCORE, CHECK_LOCK_TIME
         | 
| 8 | 
            +
            from cobweb.utils import (
         | 
| 9 | 
            +
                struct_queue_name as sqn,
         | 
| 10 | 
            +
                restore_table_name as rtn,
         | 
| 11 | 
            +
                parse_import_model as pim,
         | 
| 12 | 
            +
            )
         | 
| 13 | 
            +
             | 
| 14 | 
            +
             | 
| 15 | 
            +
            def check(stop, last, spider, scheduler, storer, ready_seed_length, spider_queue_length):
         | 
| 16 | 
            +
                log.info("run check thread after 30 seconds...")
         | 
| 17 | 
            +
                time.sleep(30)
         | 
| 18 | 
            +
                spider_info = """
         | 
| 19 | 
            +
            ------------------- check: {0} ------------------
         | 
| 20 | 
            +
                        redis_spider_seed_length:  {1}
         | 
| 21 | 
            +
                        redis_ready_seed_length:   {2}
         | 
| 22 | 
            +
                        running_spider_thread_num: {3}
         | 
| 23 | 
            +
                        memory_seed_queue_length:  {4}
         | 
| 24 | 
            +
                        storer_queue_length_info:  {5}
         | 
| 25 | 
            +
            -----------------------  end  -----------------------"""
         | 
| 26 | 
            +
                while True:
         | 
| 27 | 
            +
                    status = "running"
         | 
| 28 | 
            +
                    running_spider_thread_num = spider.spider_in_progress.length
         | 
| 29 | 
            +
                    redis_ready_seed_length = ready_seed_length()
         | 
| 30 | 
            +
                    redis_spider_seed_length = spider_queue_length()
         | 
| 31 | 
            +
                    memory_seed_queue_length = scheduler.queue.length
         | 
| 32 | 
            +
                    storer_upload_queue_length = storer.queue.length
         | 
| 33 | 
            +
                    if (
         | 
| 34 | 
            +
                            scheduler.stop and
         | 
| 35 | 
            +
                            # not redis_ready_seed_length and
         | 
| 36 | 
            +
                            not memory_seed_queue_length and
         | 
| 37 | 
            +
                            not running_spider_thread_num
         | 
| 38 | 
            +
                    ):
         | 
| 39 | 
            +
                        if not MODEL:
         | 
| 40 | 
            +
                            log.info("spider is done?")
         | 
| 41 | 
            +
                        last.set()
         | 
| 42 | 
            +
                        time.sleep(3)
         | 
| 43 | 
            +
                        storer_queue_empty = True
         | 
| 44 | 
            +
                        if storer.queue.length:
         | 
| 45 | 
            +
                            storer_queue_empty = False
         | 
| 46 | 
            +
                        storer_upload_queue_length = storer.queue.length
         | 
| 47 | 
            +
                        if (
         | 
| 48 | 
            +
                                storer_queue_empty and
         | 
| 49 | 
            +
                                not redis_ready_seed_length and
         | 
| 50 | 
            +
                                not redis_spider_seed_length
         | 
| 51 | 
            +
                        ):
         | 
| 52 | 
            +
                            if MODEL:
         | 
| 53 | 
            +
                                log.info("waiting for push seeds...")
         | 
| 54 | 
            +
                                status = "waiting"
         | 
| 55 | 
            +
                                time.sleep(30)
         | 
| 56 | 
            +
                            else:
         | 
| 57 | 
            +
                                log.info("spider done!")
         | 
| 58 | 
            +
                                break
         | 
| 59 | 
            +
             | 
| 60 | 
            +
                        last.clear()
         | 
| 61 | 
            +
             | 
| 62 | 
            +
                    log.info(spider_info.format(
         | 
| 63 | 
            +
                        status,
         | 
| 64 | 
            +
                        redis_spider_seed_length,
         | 
| 65 | 
            +
                        redis_ready_seed_length,
         | 
| 66 | 
            +
                        running_spider_thread_num,
         | 
| 67 | 
            +
                        memory_seed_queue_length,
         | 
| 68 | 
            +
                        storer_upload_queue_length
         | 
| 69 | 
            +
                    ))
         | 
| 70 | 
            +
             | 
| 71 | 
            +
                    time.sleep(3)
         | 
| 72 | 
            +
                stop.set()
         | 
| 73 | 
            +
             | 
| 74 | 
            +
             | 
| 75 | 
            +
            def launcher(task):
         | 
| 76 | 
            +
                """
         | 
| 77 | 
            +
                任务启动装饰器
         | 
| 78 | 
            +
                :param task: 任务配置信息
         | 
| 79 | 
            +
                """
         | 
| 80 | 
            +
                def decorator(func):
         | 
| 81 | 
            +
                    """
         | 
| 82 | 
            +
                    Item:
         | 
| 83 | 
            +
                        Textfile()
         | 
| 84 | 
            +
                        Loghub()
         | 
| 85 | 
            +
                        Console()
         | 
| 86 | 
            +
                    e.g.
         | 
| 87 | 
            +
                    task.fields = "a,b"
         | 
| 88 | 
            +
                    func(item, seed)
         | 
| 89 | 
            +
                        a = "a"
         | 
| 90 | 
            +
                        b = "b"
         | 
| 91 | 
            +
                        data = {"a": "a", "b": "b"}
         | 
| 92 | 
            +
                        yield item.Loghub(**data)
         | 
| 93 | 
            +
                        yield item.Loghub(a=a, b=b)
         | 
| 94 | 
            +
                    """
         | 
| 95 | 
            +
                    storer_list = []
         | 
| 96 | 
            +
             | 
| 97 | 
            +
                    # 程序结束事件
         | 
| 98 | 
            +
                    last = threading.Event()
         | 
| 99 | 
            +
                    # 停止采集事件
         | 
| 100 | 
            +
                    stop = threading.Event()
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                    # 初始化redis信息
         | 
| 103 | 
            +
                    redis_db = RedisDB(
         | 
| 104 | 
            +
                        task.project, task.task_name, task.redis_info,
         | 
| 105 | 
            +
                        model=MODEL, cs_lct=CHECK_LOCK_TIME, rs_time=RESET_SCORE
         | 
| 106 | 
            +
                    )
         | 
| 107 | 
            +
             | 
| 108 | 
            +
                    # new item
         | 
| 109 | 
            +
                    item = type("Item", (object,), {"redis_client": redis_db.client})()
         | 
| 110 | 
            +
             | 
| 111 | 
            +
                    log.info("初始化cobweb!")
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                    seed_queue = Queue()
         | 
| 114 | 
            +
             | 
| 115 | 
            +
                    scheduler_info = task.scheduler_info or dict()
         | 
| 116 | 
            +
             | 
| 117 | 
            +
                    # 调度器动态继承
         | 
| 118 | 
            +
                    sql = scheduler_info.get("sql")
         | 
| 119 | 
            +
                    table = scheduler_info.get("table")
         | 
| 120 | 
            +
                    size = scheduler_info.get("size")
         | 
| 121 | 
            +
                    scheduler_config = scheduler_info.get("config")
         | 
| 122 | 
            +
                    scheduler_db = scheduler_info.get("db", "default")
         | 
| 123 | 
            +
                    DB, class_name = pim(scheduler_db, "scheduler")
         | 
| 124 | 
            +
                    # SchedulerDB, table, sql, length, size, config = task.scheduler_info
         | 
| 125 | 
            +
                    SchedulerTmp = type(class_name, (Scheduler, DB), {})
         | 
| 126 | 
            +
             | 
| 127 | 
            +
                    # 初始化调度器
         | 
| 128 | 
            +
                    scheduler = SchedulerTmp(
         | 
| 129 | 
            +
                        table=table, sql=sql, size=size, queue=seed_queue,
         | 
| 130 | 
            +
                        length=task.scheduler_queue_length, config=scheduler_config
         | 
| 131 | 
            +
                    )
         | 
| 132 | 
            +
             | 
| 133 | 
            +
                    # 初始化采集器
         | 
| 134 | 
            +
                    spider = Spider(seed_queue, task.max_retries)
         | 
| 135 | 
            +
             | 
| 136 | 
            +
                    storer = None
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                    # 解析存储器信息
         | 
| 139 | 
            +
                    storer_info = task.storer_info or dict()
         | 
| 140 | 
            +
             | 
| 141 | 
            +
                    # for storer_info in storer_info_list:
         | 
| 142 | 
            +
                    if storer_info:
         | 
| 143 | 
            +
                        storer_db = storer_info["db"]
         | 
| 144 | 
            +
                        fields = storer_info["fields"]
         | 
| 145 | 
            +
                        storer_table = storer_info.get("table", "console")
         | 
| 146 | 
            +
                        storer_config = storer_info.get("config")
         | 
| 147 | 
            +
             | 
| 148 | 
            +
                        StorerDB, class_name = pim(storer_db, "storer")
         | 
| 149 | 
            +
                        StorerTmp = type(class_name, (Storer, StorerDB), {})
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                        db_name = class_name.lower()
         | 
| 152 | 
            +
                        if not getattr(item, db_name, None):
         | 
| 153 | 
            +
                            instance = type(db_name, (DBItem,), {})
         | 
| 154 | 
            +
                            setattr(item, db_name, instance)
         | 
| 155 | 
            +
             | 
| 156 | 
            +
                        storer_item_instance = getattr(item, db_name)
         | 
| 157 | 
            +
                        storer_item_instance.init_item(storer_table, fields)
         | 
| 158 | 
            +
             | 
| 159 | 
            +
                        storer_queue = sqn(db_name, storer_table)
         | 
| 160 | 
            +
                        queue = getattr(storer_item_instance, storer_queue)
         | 
| 161 | 
            +
                        # 初始话存储器
         | 
| 162 | 
            +
                        table_name = rtn(table_name=storer_table)
         | 
| 163 | 
            +
                        storer = StorerTmp(
         | 
| 164 | 
            +
                            table=table_name, fields=fields,
         | 
| 165 | 
            +
                            length=task.storer_queue_length,
         | 
| 166 | 
            +
                            queue=queue, config=storer_config
         | 
| 167 | 
            +
                        )
         | 
| 168 | 
            +
             | 
| 169 | 
            +
                    Thread(target=redis_db.check_spider_queue, args=(stop, len(storer_list))).start()
         | 
| 170 | 
            +
                    Thread(target=redis_db.set_heartbeat, args=(stop,)).start()
         | 
| 171 | 
            +
             | 
| 172 | 
            +
                    # 推送初始种子
         | 
| 173 | 
            +
                    # seeds = start_seeds(task.start_seed)
         | 
| 174 | 
            +
                    redis_db.add_seed(task.seeds)
         | 
| 175 | 
            +
                    # 启动调度器, 调度至redis队列
         | 
| 176 | 
            +
                    Thread(
         | 
| 177 | 
            +
                        # name="xxxx_schedule_seeds",
         | 
| 178 | 
            +
                        target=scheduler.schedule_seed,
         | 
| 179 | 
            +
                        args=(
         | 
| 180 | 
            +
                            redis_db.ready_seed_length,
         | 
| 181 | 
            +
                            redis_db.get_scheduler_lock,
         | 
| 182 | 
            +
                            redis_db.add_seed
         | 
| 183 | 
            +
                        )
         | 
| 184 | 
            +
                    ).start()
         | 
| 185 | 
            +
             | 
| 186 | 
            +
                    # 启动调度器, 调度任务队列
         | 
| 187 | 
            +
                    Thread(
         | 
| 188 | 
            +
                        # name="xxxx_schedule_task",
         | 
| 189 | 
            +
                        target=scheduler.schedule_task,
         | 
| 190 | 
            +
                        args=(
         | 
| 191 | 
            +
                            stop, redis_db.get_seed,
         | 
| 192 | 
            +
                            redis_db.ready_seed_length
         | 
| 193 | 
            +
                        )
         | 
| 194 | 
            +
                    ).start()
         | 
| 195 | 
            +
             | 
| 196 | 
            +
                    # 启动采集器
         | 
| 197 | 
            +
                    for index in range(task.spider_num):
         | 
| 198 | 
            +
                        Thread(
         | 
| 199 | 
            +
                            # name=f"xxxx_spider_task:{index}",
         | 
| 200 | 
            +
                            target=spider.spider_task,
         | 
| 201 | 
            +
                            args=(
         | 
| 202 | 
            +
                                stop, func, item,
         | 
| 203 | 
            +
                                redis_db.del_seed
         | 
| 204 | 
            +
                            )
         | 
| 205 | 
            +
                        ).start()
         | 
| 206 | 
            +
             | 
| 207 | 
            +
                    # 启动存储器
         | 
| 208 | 
            +
                    if storer:
         | 
| 209 | 
            +
                        Thread(
         | 
| 210 | 
            +
                            # name=f"xxxx_store_task:{storer.table}",
         | 
| 211 | 
            +
                            target=storer.store_task,
         | 
| 212 | 
            +
                            args=(
         | 
| 213 | 
            +
                                stop, last,
         | 
| 214 | 
            +
                                redis_db.reset_seed,
         | 
| 215 | 
            +
                                redis_db.set_storer
         | 
| 216 | 
            +
                            )
         | 
| 217 | 
            +
                        ).start()
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                    Thread(
         | 
| 220 | 
            +
                        # name="check_spider",
         | 
| 221 | 
            +
                        target=check,
         | 
| 222 | 
            +
                        args=(
         | 
| 223 | 
            +
                            stop, last, spider,
         | 
| 224 | 
            +
                            scheduler, storer,
         | 
| 225 | 
            +
                            redis_db.ready_seed_length,
         | 
| 226 | 
            +
                            redis_db.spider_queue_length,
         | 
| 227 | 
            +
                        )
         | 
| 228 | 
            +
                    ).start()
         | 
| 229 | 
            +
             | 
| 230 | 
            +
                return decorator
         | 
| 231 | 
            +
             | 
| @@ -0,0 +1,136 @@ | |
| 1 | 
            +
            import time
         | 
| 2 | 
            +
            from cobweb import log, Queue, Seed
         | 
| 3 | 
            +
            from utils import issubclass_cobweb_interface
         | 
| 4 | 
            +
             | 
| 5 | 
            +
            # from pympler import asizeof
         | 
| 6 | 
            +
             | 
| 7 | 
            +
             | 
| 8 | 
            +
            class Scheduler:
                """Mixin holding the scheduler-side worker loops.

                The methods read ``stop``, ``size``, ``length``, ``queue`` and
                ``schedule`` from the instance. None of these are defined in this
                class, so they have to be supplied by the class it is combined with.
                """

                def schedule_seed(self, ready_seed_length, get_scheduler_lock, add_seed):
                    """Call ``self.schedule()`` repeatedly until ``self.stop`` is truthy.

                    :param ready_seed_length: callable returning the current number
                        of ready seeds.
                    :param get_scheduler_lock: callable; a truthy result allows one
                        scheduling round.
                    :param add_seed: callable that receives the result of
                        ``self.schedule()``.
                    :raises Exception: when no base class is named
                        ``SchedulerInterface``.
                    """
                    has_interface = issubclass_cobweb_interface(
                        self.__class__, "SchedulerInterface"
                    )
                    if not has_interface:
                        raise Exception("not have schedule function!")

                    # A class literally named "Default" does no scheduling: it only
                    # flags itself as stopped and returns without logging.
                    if self.__class__.__name__ == "Default":
                        self.stop = True
                        return None

                    while not self.stop:
                        if ready_seed_length() > self.size:
                            # More ready seeds than the configured size: wait.
                            time.sleep(15)
                            continue
                        # NOTE(review): a falsy lock result loops again immediately
                        # with no sleep - confirm get_scheduler_lock blocks or
                        # rate-limits on its own.
                        if get_scheduler_lock():
                            produced = self.schedule()
                            add_seed(produced)

                    log.info(f"close thread: schedule_seed")

                def schedule_task(self, stop, get_seed, ready_seed_length):
                    """Move ready seeds into ``self.queue`` until ``stop`` is set.

                    :param stop: event-like object exposing ``is_set()``.
                    :param get_seed: callable taking ``self.length`` and returning the
                        seeds to push onto ``self.queue``.
                    :param ready_seed_length: callable returning the current number
                        of ready seeds.
                    """
                    time.sleep(3)
                    while not stop.is_set():
                        if not ready_seed_length():
                            # Nothing is ready yet.
                            time.sleep(15)
                        elif self.queue.length >= self.length:
                            # The local queue already holds a full batch.
                            time.sleep(3)
                        else:
                            batch = get_seed(self.length)
                            self.queue.push(batch)
                    log.info(f"close thread: schedule_task")
         | 
| 46 | 
            +
             | 
| 47 | 
            +
             | 
| 48 | 
            +
            class Spider:
                """Worker that pops seeds from a queue and runs the crawl callback."""

                def __init__(self, queue, max_retries=5):
                    """Keep the seed queue and the retry limit.

                    :param queue: queue the seeds are popped from and pushed back to.
                    :param max_retries: a seed whose ``_retry`` reaches this value is
                        handed to ``del_seed`` instead of being crawled.
                    """
                    self.queue = queue
                    self.max_retries = max_retries
                    # Holds one entry per seed currently being processed.
                    self.spider_in_progress = Queue()

                def spider_task(self, stop, func, item, del_seed):
                    """Process seeds until ``stop`` is set.

                    Every value produced by ``func(item, seed)`` is dispatched by kind:
                    objects with a truthy ``table_name`` go to their own queue, seeds
                    (or lists/tuples of seeds) go back to ``self.queue``, and a bool or
                    ``None`` sets the final status of the seed.

                    :param stop: event-like object exposing ``is_set()``.
                    :param func: crawl callback, iterated as ``func(item, seed)``.
                    :param item: first argument forwarded to ``func``.
                    :param del_seed: callable invoked as
                        ``del_seed(seed, spider_status=<bool>)``.
                    """
                    while not stop.is_set():
                        seed = self.queue.pop()
                        if not seed:
                            time.sleep(3)
                            continue
                        if seed._retry >= self.max_retries:
                            # Retry budget used up: report the seed as failed.
                            del_seed(seed, spider_status=False)
                            continue

                        try:
                            self.spider_in_progress.push(1, direct_insertion=True)
                            status = None
                            for result in func(item, seed):
                                if getattr(result, "table_name", None):
                                    # Storable result: queue it together with its seed.
                                    result.queue().push(
                                        [seed, result.struct_data],
                                        direct_insertion=True
                                    )
                                elif isinstance(result, Seed):
                                    self.queue.push(result)
                                elif isinstance(result, (list, tuple)):
                                    self.queue.push([
                                        entry if isinstance(entry, Seed) else Seed(entry)
                                        for entry in result
                                    ])
                                elif isinstance(result, bool):
                                    status = result
                                elif result is None:
                                    status = False

                            # ``status`` stays None when the callback reported nothing;
                            # in that case the seed is neither deleted nor retried here.
                            if status:
                                del_seed(seed, spider_status=True)
                            elif status is not None:
                                seed._retry += 1
                                self.queue.push(seed)

                        except Exception as e:
                            # Any failure counts as one retry and re-queues the seed.
                            seed._retry += 1
                            self.queue.push(seed)
                            log.info(f"{str(seed)} -> {str(e)}")
                        finally:
                            self.spider_in_progress.pop()
                    log.info(f"close thread: spider")
         | 
| 98 | 
            +
             | 
| 99 | 
            +
             | 
| 100 | 
            +
            class Storer:
                """Mixin holding the storage worker loop.

                ``store``, ``queue``, ``length`` and ``table`` are read from the
                instance but not defined in this class.
                """

                def store_task(self, stop, last, reset_seed, del_seed):
                    """Flush queued ``(seed, data)`` pairs in batches until ``stop`` is set.

                    A batch is taken when ``last`` is set or the queue holds at least
                    ``self.length`` entries.

                    :param stop: event-like object exposing ``is_set()``; ends the loop.
                    :param last: event-like object exposing ``is_set()``; forces a
                        flush of a partial batch.
                    :param reset_seed: callable given the batch's seeds when
                        ``self.store`` returns a falsy value.
                    :param del_seed: callable given the batch's seeds when
                        ``self.store`` returns a truthy value.
                    :raises Exception: when the instance has no truthy ``store``
                        attribute.
                    """
                    if not issubclass_cobweb_interface(self.__class__, "StorerInterface"):
                        return None

                    if not getattr(self, "store", None):
                        raise Exception("not have store function!")

                    storer_name = self.__class__.__name__ + self.table

                    while not stop.is_set():
                        flush_due = last.is_set() or self.queue.length >= self.length
                        if not flush_due:
                            time.sleep(3)
                            continue

                        seeds, data_list = [], []
                        for _ in range(self.length):
                            entry = self.queue.pop()
                            if not entry:
                                break
                            seed, data = entry
                            seeds.append(seed)
                            data_list.append(data)

                        if not data_list:
                            # Flush was requested but the queue turned out empty.
                            time.sleep(3)
                            continue

                        if self.store(data_list):
                            del_seed(seeds)
                        else:
                            reset_seed(seeds)
                            log.info("reset seeds!")

                    log.info(f"close thread: {storer_name}")
         | 
| @@ -0,0 +1,90 @@ | |
| 1 | 
            +
            import json
         | 
| 2 | 
            +
            import re
         | 
| 3 | 
            +
            import sys
         | 
| 4 | 
            +
            from abc import ABC
         | 
| 5 | 
            +
            from typing import Iterable
         | 
| 6 | 
            +
            from importlib import import_module
         | 
| 7 | 
            +
             | 
| 8 | 
            +
             | 
| 9 | 
            +
            def struct_table_name(table_name):
                """Encode a table name by replacing ``.`` with ``__p__`` and ``:`` with ``__c__``."""
                substitutions = str.maketrans({".": "__p__", ":": "__c__"})
                return table_name.translate(substitutions)
         | 
| 11 | 
            +
             | 
| 12 | 
            +
             | 
| 13 | 
            +
            def restore_table_name(table_name):
                """Decode a table name: ``__p__`` becomes ``.`` and then ``__c__`` becomes ``:``."""
                # The two passes run one after the other, in this order.
                with_dots = table_name.replace("__p__", ".")
                restored = with_dots.replace("__c__", ":")
                return restored
         | 
| 15 | 
            +
             | 
| 16 | 
            +
             | 
| 17 | 
            +
            def struct_queue_name(db_name, table_name):
                """Return the interned queue name built from a database and a table name."""
                queue_name = f"__{db_name}_{table_name}_queue__"
                return sys.intern(queue_name)
         | 
| 19 | 
            +
             | 
| 20 | 
            +
             | 
| 21 | 
            +
            def parse_info(info):
                """Normalise configuration info into decoded JSON objects.

                :param info: a falsy value or a dict (returned unchanged), a JSON
                    document as ``str``/``bytes``/``bytearray`` (decoded with
                    ``json.loads``), or an iterable whose entries are dicts or JSON
                    documents.
                :return: the decoded object, a list of decoded entries for an iterable
                    input, or ``None`` for any other non-iterable value.
                :raises TypeError: when an iterable contains an entry that is neither a
                    dict nor a JSON document.
                """
                # bytes and bytearray are iterable, so without this tuple they would
                # be walked byte by byte below and rejected with a TypeError.
                json_types = (str, bytes, bytearray)

                if not info or isinstance(info, dict):
                    return info

                if isinstance(info, json_types):
                    return json.loads(info)

                if isinstance(info, Iterable):
                    result = []
                    for entry in info:
                        if isinstance(entry, dict):
                            result.append(entry)
                        elif isinstance(entry, json_types):
                            result.append(json.loads(entry))
                        else:
                            raise TypeError("must be in [str, bytes, dict]")
                    return result

                # Unsupported non-iterable values keep yielding None, as before.
                return None
         | 
| 42 | 
            +
             | 
| 43 | 
            +
             | 
| 44 | 
            +
            def struct_start_seeds(seeds):
                """Wrap the start seeds in ``Seed`` objects.

                :param seeds: a list/tuple of raw seeds, or a single ``str``/``dict``.
                :return: a list of ``Seed`` for a list/tuple, one ``Seed`` for a
                    ``str``/``dict``, and ``None`` for falsy or any other input.
                """
                from .bbb import Seed

                if not seeds:
                    return None
                if isinstance(seeds, (list, tuple)):
                    return list(map(Seed, seeds))
                if isinstance(seeds, (str, dict)):
                    return Seed(seeds)
                return None
         | 
| 52 | 
            +
             | 
| 53 | 
            +
             | 
| 54 | 
            +
            def issubclass_cobweb_interface(_class, inf_name):
                """Tell whether any ancestor of ``_class`` is named ``inf_name``.

                The class itself (first MRO entry) is excluded; only the names of its
                ancestors are compared.

                :param _class: class whose ``__mro__`` is inspected.
                :param inf_name: class name to look for.
                :return: ``True`` if an ancestor has that ``__name__``, else ``False``.
                """
                ancestors = _class.__mro__[1:]
                return any(base.__name__ == inf_name for base in ancestors)
         | 
| 59 | 
            +
             | 
| 60 | 
            +
             | 
| 61 | 
            +
            def parse_import_model(model_info, model_type=None):
                """Resolve a scheduler/storer description to ``(class_object, class_name)``.

                ``model_info`` may be:

                * an import statement, ``"from package.module import ClassName"``;
                * a dotted path, ``"package.module.ClassName"``;
                * a bare name, looked up as
                  ``cobweb.db.<model_type>.<name.lower()>.<name.capitalize()>``;
                * a class derived from ``ABC`` that has an ancestor named
                  ``SchedulerInterface`` / ``StorerInterface``.

                :param model_info: string description or class, see above.
                :param model_type: ``"scheduler"`` or ``"storer"``.
                :return: tuple of the resolved object and its name.
                :raises TypeError: for an unknown ``model_type`` or an unsupported
                    ``model_info``.
                :raises ImportError: when a class is given that lacks the matching
                    interface ancestor.
                """
                if model_type not in ["scheduler", "storer"]:
                    raise TypeError("model_type must be in scheduler, storer")

                if isinstance(model_info, str):
                    # Decide by the regex match, not by the substring "import": dotted
                    # paths such as "importlib.import_module" contain that substring
                    # and previously crashed on ``None.groups()``.
                    statement = re.search(r"from (.*?) import (.*?)$", model_info)
                    if statement:
                        model_path, class_name = (
                            part.strip() for part in statement.groups()
                        )
                    elif "." in model_info:
                        model_path, _, class_name = model_info.rpartition(".")
                    else:
                        model_path = f"cobweb.db.{model_type}.{model_info.lower()}"
                        class_name = model_info.capitalize()
                    class_object = getattr(import_module(model_path), class_name)
                    return class_object, class_name

                # issubclass() itself raises on non-classes, so check for a class first.
                if isinstance(model_info, type) and issubclass(model_info, ABC):
                    inf_name = model_type.capitalize() + "Interface"
                    if issubclass_cobweb_interface(model_info, inf_name):
                        return model_info, model_info.__name__
                    raise ImportError(
                        f"{model_info.__name__} does not inherit from {inf_name}"
                    )

                raise TypeError(
                    "model_info must be an import string or an ABC subclass"
                )
         | 
| 89 | 
            +
             | 
| 90 | 
            +
             | 
| @@ -24,8 +24,8 @@ cobweb/distributed/__init__.py | |
| 24 24 | 
             
            cobweb/distributed/launcher.py
         | 
| 25 25 | 
             
            cobweb/distributed/models.py
         | 
| 26 26 | 
             
            cobweb/single/__init__.py
         | 
| 27 | 
            +
            cobweb/single/launcher.py
         | 
| 27 28 | 
             
            cobweb/single/models.py
         | 
| 28 | 
            -
            cobweb/single/nest.py
         | 
| 29 29 | 
             
            cobweb_launcher.egg-info/PKG-INFO
         | 
| 30 30 | 
             
            cobweb_launcher.egg-info/SOURCES.txt
         | 
| 31 31 | 
             
            cobweb_launcher.egg-info/dependency_links.txt
         | 
| @@ -1,104 +0,0 @@ | |
| 1 | 
            -
            import time
         | 
| 2 | 
            -
            # from pympler import asizeof
         | 
| 3 | 
            -
            from single.nest import Queue
         | 
| 4 | 
            -
            from single.nest import struct_queue_name
         | 
| 5 | 
            -
            from single.nest import SchedulerInterface, StorerInterface
         | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
            # class Transceiver:
         | 
| 9 | 
            -
            class Distributor:
                """Holder of named queues that routes callback results into them."""

                def __init__(self):
                    """Create the distributor with its seed queue."""
                    self.seed_queue = Queue()

                @property
                def queue_names(self):
                    """Names of all instance attributes, as a tuple."""
                    return tuple(vars(self))

                @property
                def used_memory(self):
                    """Size of this object as reported by ``asizeof.asizeof``."""
                    # NOTE(review): ``asizeof`` is not imported in the visible part of
                    # this module (the pympler import is commented out) - confirm.
                    return asizeof.asizeof(self)

                def create_queue(self, queue_name: str):
                    """Attach a new ``Queue`` to the instance under ``queue_name``."""
                    setattr(self, queue_name, Queue())

                def get_queue(self, queue_name: str):
                    """Return the attribute stored under ``queue_name``."""
                    return self.__getattribute__(queue_name)

                def deal_item(self, item):
                    """Push one result onto the queue that matches its kind.

                    Objects whose class is named ``Seed`` go to ``seed_queue``. Objects
                    with a truthy ``table_name`` go to the queue named after their
                    class and table. Anything else is ignored.
                    """
                    kind = item.__class__.__name__
                    if kind == "Seed":
                        self.seed_queue.push(item)
                        return
                    if getattr(item, "table_name", None):
                        target = getattr(self, struct_queue_name(kind, item.table_name))
                        target.push(item.serialization)

                def distribute(self, callback, *args, **kwargs):
                    """Run ``callback`` and route each value it produces.

                    :return: ``None`` when the callback result is falsy, otherwise
                        ``True`` once every produced value has been handled.
                    """
                    produced = callback(*args, **kwargs)
                    if not produced:
                        return None
                    for entry in produced:
                        self.deal_item(entry)
                    return True
         | 
| 43 | 
            -
             | 
| 44 | 
            -
             | 
| 45 | 
            -
            class Scheduler:
                """Mixin holding the scheduling loop.

                ``stop``, ``queue``, ``length`` and ``schedule`` are read from the
                instance but not defined in this class.
                """

                def schedule_task(self, distribute):
                    """Feed ``self.schedule`` to ``distribute`` until ``self.stop`` is truthy.

                    :param distribute: callable invoked as ``distribute(self.schedule)``
                        while the queue holds fewer than ``self.length`` entries.
                    :raises Exception: when the instance has no truthy ``schedule``
                        attribute.
                    """
                    if not issubclass(self.__class__, SchedulerInterface):
                        return None

                    if not getattr(self, "schedule", None):
                        raise Exception("not have schedule function!")

                    while not self.stop:
                        if not (self.queue.length < self.length):
                            # Queue is full enough: print a marker and wait.
                            print("------------")
                            time.sleep(15)
                            continue
                        distribute(self.schedule)
         | 
| 63 | 
            -
             | 
| 64 | 
            -
             | 
| 65 | 
            -
            class Spider:
                """Worker that pops seeds and passes them to the crawl callback."""

                def __init__(self, queue):
                    """Keep the seed queue and create the in-progress tracker."""
                    self.queue = queue
                    self.spider_in_progress = Queue()

                def spider_task(self, stop_event, distribute, func, item):
                    """Process seeds until ``stop_event`` is set.

                    :param stop_event: event-like object exposing ``is_set()``.
                    :param distribute: callable invoked as
                        ``distribute(func, item, seed)`` for every popped seed.
                    :param func: crawl callback forwarded to ``distribute``.
                    :param item: item object forwarded to ``distribute``.
                    """
                    while not stop_event.is_set():
                        seed = self.queue.pop()
                        if seed:
                            try:
                                self.spider_in_progress.push(1)
                                distribute(func, item, seed)
                            except Exception as error:
                                # Failures are only printed; the seed is not re-queued.
                                print(error)
                            finally:
                                self.spider_in_progress.pop()
                        else:
                            time.sleep(3)
         | 
| 84 | 
            -
             | 
| 85 | 
            -
             | 
| 86 | 
            -
            class Storer:
                """Mixin holding the storage worker loop.

                ``store``, ``queue`` and ``length`` are read from the instance but not
                defined in this class.
                """

                def store_task(self, stop_event, last_event, distribute):
                    """Hand queued data to ``self.store`` in batches until stopped.

                    A batch of at most ``self.length`` entries is taken when
                    ``last_event`` is set or the queue holds more than ``self.length``
                    entries.

                    :param stop_event: event-like object exposing ``is_set()``; ends
                        the loop.
                    :param last_event: event-like object exposing ``is_set()``; forces
                        a flush of a partial batch.
                    :param distribute: callable invoked as
                        ``distribute(self.store, data_list)``.
                    :raises Exception: when the instance has no truthy ``store``
                        attribute.
                    """
                    if not issubclass(self.__class__, StorerInterface):
                        return None

                    if not getattr(self, "store", None):
                        raise Exception("not have store function!")

                    while not stop_event.is_set():
                        data_list = []
                        if last_event.is_set() or self.queue.length > self.length:
                            data_length = min(self.queue.length, self.length)
                            data_list = [self.queue.pop() for _ in range(data_length)]

                        if data_list:
                            distribute(self.store, data_list)
                        else:
                            # Nothing to flush: sleep so the loop does not spin at
                            # full CPU while waiting for data or for the stop event.
                            time.sleep(3)
         | 
| @@ -1,153 +0,0 @@ | |
| 1 | 
            -
            import time
         | 
| 2 | 
            -
            import threading
         | 
| 3 | 
            -
             | 
| 4 | 
            -
            from single.nest import Seed, DBItem
         | 
| 5 | 
            -
            from single.nest import struct_queue_name, restore_table_name
         | 
| 6 | 
            -
            from single.nest import Distributor, Scheduler, Spider, Storer
         | 
| 7 | 
            -
             | 
| 8 | 
            -
             | 
| 9 | 
            -
            def init_task_seed(seeds):
                """Yield ``Seed`` objects built from the task's start seeds.

                :param seeds: a list/tuple of raw seeds, or a single ``str``/``dict``.
                    Falsy values and any other type yield nothing.
                """
                if not seeds:
                    return
                if isinstance(seeds, (list, tuple)):
                    for raw_seed in seeds:
                        yield Seed(raw_seed)
                elif isinstance(seeds, (str, dict)):
                    yield Seed(seeds)
         | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
            def parse_storer_info(storer_info):
                """Group storer descriptions by the name of their DB class.

                :param storer_info: one object whose class is named ``StorerInfo``, or
                    a tuple/list of such objects; anything else gives an empty result.
                :return: dict mapping each DB class name to
                    ``{"StorerDB": <DB class>, "db_args_list": [<info[1:]>, ...]}``.
                """
                # The class-name check comes first, before the tuple/list check.
                if storer_info.__class__.__name__ == 'StorerInfo':
                    infos = [storer_info]
                elif isinstance(storer_info, (tuple, list)):
                    infos = storer_info
                else:
                    infos = []

                grouped = {}
                for info in infos:
                    db_name = info.DB.__name__
                    bucket = grouped.setdefault(
                        db_name, {"StorerDB": info.DB, "db_args_list": []}
                    )
                    bucket["db_args_list"].append(info[1:])
                return grouped
         | 
| 31 | 
            -
             | 
| 32 | 
            -
             | 
| 33 | 
            -
            def check(stop_event, last_event, distributor, scheduler, spider, storer_list):
                """Watch the pipeline and set ``stop_event`` once everything has drained.

                Every 3 seconds the function checks that the scheduler is stopped, the
                seed queue is empty and no seed is in progress. When that holds it sets
                ``last_event``, waits 10 seconds and finishes if every storer queue is
                empty; otherwise ``last_event`` is cleared and polling resumes.

                :param stop_event: event set when the loop ends.
                :param last_event: event set while a final flush is being awaited.
                :param distributor: object exposing ``seed_queue.length``.
                :param scheduler: object exposing ``stop``.
                :param spider: object exposing ``spider_in_progress.length``.
                :param storer_list: iterable of objects exposing ``queue.length``.
                """
                while True:
                    time.sleep(3)
                    pipeline_idle = (
                        scheduler.stop
                        and not distributor.seed_queue.length
                        and not spider.spider_in_progress.length
                    )
                    if pipeline_idle:
                        last_event.set()
                        time.sleep(10)
                        pending = any(storer.queue.length for storer in storer_list)
                        if not pending:
                            break
                    last_event.clear()
                stop_event.set()
         | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
            def cobweb(task):
                """
                Task-launch decorator.

                Applying the returned decorator to a function builds a distributor,
                a scheduler, a spider and one storer per configured table, then
                starts one thread for each of them plus a "check" thread.

                :param task: task configuration object; this function reads its
                    ``SchedulerInfo``, ``storer_info``, ``start_seed`` and
                    ``spider_num`` attributes.
                :return: a decorator that starts the pipeline and returns the
                    decorated function unchanged.
                """
                def decorator(func):
                    """
                    Start the crawl pipeline around ``func``.

                    Callback shape as documented by the original author:

                    func(Item, seed)
                        Item:
                            Item.Textfile()
                            Item.Console()

                    :param func: callback handed to every spider thread.
                    :return: ``func`` itself, so the decorated name stays usable.
                    """
                    # project task_name start_seed spider_num queue_length scheduler_info storer_info

                    storer_list = []

                    # Program-end event.
                    last_event = threading.Event()
                    # Pause-collection event.
                    stop_event = threading.Event()

                    # Create the distributor.
                    distributor = Distributor()

                    # Scheduler: combine the generic Scheduler with the configured
                    # backend class through dynamic inheritance.
                    # NOTE(review): ``SchedulerInfo`` is spelled differently from the
                    # lower-case attributes read below (``storer_info``, ``start_seed``,
                    # ``spider_num``) -- confirm it exists on the task object.
                    SchedulerDB, table, sql, length, size = task.SchedulerInfo
                    SchedulerTmp = type('Scheduler', (Scheduler, SchedulerDB), {})

                    # Initialise the scheduler.
                    scheduler = SchedulerTmp(
                        table=table, sql=sql, length=length, size=size,
                        queue=distributor.seed_queue,
                    )

                    # Initialise the spider.
                    spider = Spider(queue=distributor.seed_queue)

                    # Parse the storer configuration.
                    storer_data = parse_storer_info(task.storer_info)

                    # ``item`` is a dynamically created namespace class; it receives
                    # one DBItem subclass per storage backend name.
                    item = type("item", (object,), {})
                    for db_name in storer_data.keys():
                        storer_conf = storer_data[db_name]
                        # Storer: dynamic inheritance, same approach as the scheduler.
                        StorerTmp = type('Storer', (Storer, storer_conf["StorerDB"]), {})
                        for storer_db_args in storer_conf["db_args_list"]:
                            # Distinct names so the scheduler's ``table``/``length``
                            # above are not shadowed.
                            storer_table, fields, storer_length = storer_db_args
                            if not getattr(item, db_name, None):
                                setattr(item, db_name, type(db_name, (DBItem,), {}))
                            # Register the table and its fields on the item class.
                            getattr(item, db_name).init_item(storer_table, fields)
                            # Create the storage queue.
                            storer_queue = struct_queue_name(db_name, storer_table)
                            distributor.create_queue(queue_name=storer_queue)
                            queue = distributor.get_queue(queue_name=storer_queue)
                            # Initialise the storer with the restored table name.
                            table_name = restore_table_name(table_name=storer_table)
                            storer_list.append(
                                StorerTmp(
                                    table=table_name, fields=fields,
                                    length=storer_length, queue=queue,
                                )
                            )

                    # Push the initial seeds.
                    distributor.distribute(init_task_seed, seeds=task.start_seed)

                    # Start the scheduler.
                    threading.Thread(
                        target=scheduler.schedule_task,
                        args=(distributor.distribute,),
                        name="single_scheduler_task"
                    ).start()

                    # Start the spiders.
                    for index in range(task.spider_num):
                        threading.Thread(
                            target=spider.spider_task,
                            args=(stop_event, distributor.distribute, func, item),
                            name=f"single_spider_task:{index}"
                        ).start()

                    # Start the storers.
                    for storer in storer_list:
                        threading.Thread(
                            target=storer.store_task,
                            args=(stop_event, last_event, distributor.distribute),
                            name=f"single_store_task:{storer.table}",
                        ).start()

                    # Start the "check" thread, which receives both events and every
                    # pipeline component.
                    threading.Thread(
                        target=check, name="check",
                        args=(
                            stop_event, last_event, distributor,
                            scheduler, spider, storer_list
                        )
                    ).start()

                    # Hand the callback back: without a return value the decorated
                    # name would be rebound to None.
                    return func
                return decorator
         | 
| 150 | 
            -
             | 
| 151 | 
            -
             | 
| 152 | 
            -
             | 
| 153 | 
            -
             | 
| @@ -1,88 +0,0 @@ | |
| 1 | 
            -
            import json
         | 
| 2 | 
            -
            import sys
         | 
| 3 | 
            -
            from typing import Iterable
         | 
| 4 | 
            -
             | 
| 5 | 
            -
            import requests
         | 
| 6 | 
            -
             | 
| 7 | 
            -
             | 
| 8 | 
            -
            # from cobweb import Seed
         | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
            def struct_table_name(table_name):
                """Return ``table_name`` with "." and ":" replaced by placeholders.

                Every "." becomes "__p__" and every ":" becomes "__c__".
                """
                without_dots = table_name.replace(".", "__p__")
                return without_dots.replace(":", "__c__")
         | 
| 13 | 
            -
             | 
| 14 | 
            -
             | 
| 15 | 
            -
            def restore_table_name(table_name):
                """Return ``table_name`` with the placeholders turned back into "." and ":".

                Every "__p__" becomes "." first, then every "__c__" becomes ":".
                """
                with_dots = table_name.replace("__p__", ".")
                return with_dots.replace("__c__", ":")
         | 
| 17 | 
            -
             | 
| 18 | 
            -
             | 
| 19 | 
            -
            def struct_queue_name(db_name, table_name):
         | 
| 20 | 
            -
                return sys.intern(f"__{db_name}_{table_name}_queue__")
         | 
| 21 | 
            -
             | 
| 22 | 
            -
             | 
| 23 | 
            -
            # class StorerDB:
         | 
| 24 | 
            -
            #
         | 
| 25 | 
            -
            #     @staticmethod
         | 
| 26 | 
            -
            #     def console(self):
         | 
| 27 | 
            -
            #         from db.storer.console import Console
         | 
| 28 | 
            -
            #         table = struct_table_name(table)
         | 
| 29 | 
            -
            #         return StorerInfo(DB=Console, table=table, length=length, config=None)
         | 
| 30 | 
            -
            #
         | 
| 31 | 
            -
            #     @staticmethod
         | 
| 32 | 
            -
            #     def textfile(table, length=200):
         | 
| 33 | 
            -
            #         from db.storer.textfile import Textfile
         | 
| 34 | 
            -
            #         table = struct_table_name(table)
         | 
| 35 | 
            -
            #         return StorerInfo(DB=Textfile, table=table, length=length, config=None)
         | 
| 36 | 
            -
            #
         | 
| 37 | 
            -
            #     @staticmethod
         | 
| 38 | 
            -
            #     def loghub(table, length=200, config=None):
         | 
| 39 | 
            -
            #         from db.storer.loghub import Loghub
         | 
| 40 | 
            -
            #         table = struct_table_name(table)
         | 
| 41 | 
            -
            #         return StorerInfo(DB=Loghub, table=table, length=length, config=config)
         | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
            def parse_info(info):
                """Normalise configuration given as a dict, a JSON string or an iterable.

                :param info: a falsy value or dict (returned unchanged), a JSON string
                    (decoded with ``json.loads``), or an iterable whose elements are
                    JSON strings or dicts.
                :return: the unchanged or decoded value; a list for iterables;
                    ``None`` for any other input.
                :raises TypeError: if an iterable element is neither ``str`` nor ``dict``.
                """
                def _decode(entry):
                    # Elements are decoded one at a time so the first invalid
                    # element raises straight away.
                    if isinstance(entry, str):
                        return json.loads(entry)
                    if isinstance(entry, dict):
                        return entry
                    raise TypeError("must be in [str, dict]")

                if not info or isinstance(info, dict):
                    return info

                if isinstance(info, str):
                    return json.loads(info)

                if isinstance(info, Iterable):
                    return [_decode(entry) for entry in info]

                return None
         | 
| 65 | 
            -
             | 
| 66 | 
            -
             | 
| 67 | 
            -
            def struct_start_seeds(seeds):
                """Wrap raw start seeds in ``Seed`` objects.

                :param seeds: a list/tuple of raw seeds, or a single ``str``/``dict`` seed.
                :return: a list of ``Seed`` for a list/tuple, one ``Seed`` for a
                    ``str``/``dict``, and ``None`` for falsy or any other input.
                """
                # Function-scope import, kept as in the original.
                from .bbb import Seed

                if not seeds:
                    return None
                if isinstance(seeds, (list, tuple)):
                    return [Seed(raw_seed) for raw_seed in seeds]
                if isinstance(seeds, (str, dict)):
                    return Seed(seeds)
                return None
         | 
| 75 | 
            -
             | 
| 76 | 
            -
             | 
| 77 | 
            -
            # def get_storer_db(db):
         | 
| 78 | 
            -
            #
         | 
| 79 | 
            -
            #     if isinstance(db, str):
         | 
| 80 | 
            -
            #         model = import_module(f" db.storer.{db.lower()}")
         | 
| 81 | 
            -
            #
         | 
| 82 | 
            -
            #         # if db.lower() in dir(StorerDB):
         | 
| 83 | 
            -
            #         #     return getattr(StorerDB, db)
         | 
| 84 | 
            -
            #         # else:
         | 
| 85 | 
            -
            #         #     pass
         | 
| 86 | 
            -
             | 
| 87 | 
            -
             | 
| 88 | 
            -
             | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
    
        {cobweb-launcher-0.1.2 → cobweb-launcher-0.1.4}/cobweb_launcher.egg-info/dependency_links.txt
    RENAMED
    
    | 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         | 
| 
            File without changes
         |