cobweb-launcher 1.2.43__py3-none-any.whl → 1.2.45__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cobweb/crawlers/crawler.py +8 -6
 - cobweb/utils/dotting.py +38 -10
 - {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/METADATA +1 -1
 - {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/RECORD +7 -7
 - {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/LICENSE +0 -0
 - {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/WHEEL +0 -0
 - {cobweb_launcher-1.2.43.dist-info → cobweb_launcher-1.2.45.dist-info}/top_level.txt +0 -0
 
    
        cobweb/crawlers/crawler.py
    CHANGED
    
    | 
         @@ -1,4 +1,5 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            import json
         
     | 
| 
      
 2 
     | 
    
         
            +
            import os
         
     | 
| 
       2 
3 
     | 
    
         
             
            import threading
         
     | 
| 
       3 
4 
     | 
    
         
             
            import time
         
     | 
| 
       4 
5 
     | 
    
         
             
            import traceback
         
     | 
| 
         @@ -6,9 +7,7 @@ from inspect import isgenerator 
     | 
|
| 
       6 
7 
     | 
    
         
             
            from typing import Union, Callable, Mapping
         
     | 
| 
       7 
8 
     | 
    
         
             
            from urllib.parse import urlparse
         
     | 
| 
       8 
9 
     | 
    
         | 
| 
       9 
     | 
    
         
            -
            import  
     | 
| 
       10 
     | 
    
         
            -
            from requests import HTTPError, Response as Res
         
     | 
| 
       11 
     | 
    
         
            -
            from requests.exceptions import ChunkedEncodingError
         
     | 
| 
      
 10 
     | 
    
         
            +
            from requests import Response as Res
         
     | 
| 
       12 
11 
     | 
    
         | 
| 
       13 
12 
     | 
    
         
             
            from cobweb.constant import DealModel, LogTemplate
         
     | 
| 
       14 
13 
     | 
    
         
             
            from cobweb.base import (
         
     | 
| 
         @@ -20,6 +19,7 @@ from cobweb.base import ( 
     | 
|
| 
       20 
19 
     | 
    
         
             
                logger
         
     | 
| 
       21 
20 
     | 
    
         
             
            )
         
     | 
| 
       22 
21 
     | 
    
         
             
            from cobweb.utils import LoghubDot
         
     | 
| 
      
 22 
     | 
    
         
            +
            proxy_type = os.getenv("PROXY_TYPE", "")
         
     | 
| 
       23 
23 
     | 
    
         | 
| 
       24 
24 
     | 
    
         | 
| 
       25 
25 
     | 
    
         
             
            class Crawler(threading.Thread):
         
     | 
| 
         @@ -149,7 +149,7 @@ class Crawler(threading.Thread): 
     | 
|
| 
       149 
149 
     | 
    
         
             
                                                    topic=urlparse(download_item.response.request.url).netloc,
         
     | 
| 
       150 
150 
     | 
    
         
             
                                                    data_size=int(download_item.response.headers.get("content-length", 0)),
         
     | 
| 
       151 
151 
     | 
    
         
             
                                                    cost_time=end_time - start_time, status = 200,
         
     | 
| 
       152 
     | 
    
         
            -
                                                    url=download_item.response.url,
         
     | 
| 
      
 152 
     | 
    
         
            +
                                                    url=download_item.response.url, proxy_type=proxy_type,
         
     | 
| 
       153 
153 
     | 
    
         
             
                                                )
         
     | 
| 
       154 
154 
     | 
    
         
             
                                            parse_iterators = self.parse(download_item)
         
     | 
| 
       155 
155 
     | 
    
         
             
                                            if not isgenerator(parse_iterators):
         
     | 
| 
         @@ -169,7 +169,7 @@ class Crawler(threading.Thread): 
     | 
|
| 
       169 
169 
     | 
    
         
             
                        except Exception as e:
         
     | 
| 
       170 
170 
     | 
    
         
             
                            exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
         
     | 
| 
       171 
171 
     | 
    
         
             
                            url = seed.url
         
     | 
| 
       172 
     | 
    
         
            -
                            status =  
     | 
| 
      
 172 
     | 
    
         
            +
                            status = e.__class__.__name__
         
     | 
| 
       173 
173 
     | 
    
         
             
                            if getattr(e, "response", None) and isinstance(e.response, Res):
         
     | 
| 
       174 
174 
     | 
    
         
             
                                url = e.response.request.url
         
     | 
| 
       175 
175 
     | 
    
         
             
                                status = e.response.status_code
         
     | 
| 
         @@ -177,7 +177,8 @@ class Crawler(threading.Thread): 
     | 
|
| 
       177 
177 
     | 
    
         
             
                                topic=urlparse(url).netloc,
         
     | 
| 
       178 
178 
     | 
    
         
             
                                data_size=-1, cost_time=-1,
         
     | 
| 
       179 
179 
     | 
    
         
             
                                status=status, url=url,
         
     | 
| 
       180 
     | 
    
         
            -
                                 
     | 
| 
      
 180 
     | 
    
         
            +
                                proxy_type=proxy_type,
         
     | 
| 
      
 181 
     | 
    
         
            +
                                msg=exception_msg,
         
     | 
| 
       181 
182 
     | 
    
         
             
                            )
         
     | 
| 
       182 
183 
     | 
    
         
             
                            logger.info(LogTemplate.download_exception.format(
         
     | 
| 
       183 
184 
     | 
    
         
             
                                detail=seed_detail_log_info,
         
     | 
| 
         @@ -208,6 +209,7 @@ class Crawler(threading.Thread): 
     | 
|
| 
       208 
209 
     | 
    
         
             
                    logger.info("spider thread close")
         
     | 
| 
       209 
210 
     | 
    
         | 
| 
       210 
211 
     | 
    
         
             
                def run(self):
         
     | 
| 
      
 212 
     | 
    
         
            +
                    threading.Thread(name="loghub_dot", target=self.loghub_dot.build_run).start()
         
     | 
| 
       211 
213 
     | 
    
         
             
                    for index in range(self.thread_num):
         
     | 
| 
       212 
214 
     | 
    
         
             
                        threading.Thread(name=f"spider_{index}", target=self.spider).start()
         
     | 
| 
       213 
215 
     | 
    
         | 
    
        cobweb/utils/dotting.py
    CHANGED
    
    | 
         @@ -1,6 +1,9 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            import json
         
     | 
| 
      
 2 
     | 
    
         
            +
            import time
         
     | 
| 
       2 
3 
     | 
    
         | 
| 
       3 
4 
     | 
    
         
             
            from aliyun.log import LogClient, LogItem, PutLogsRequest
         
     | 
| 
      
 5 
     | 
    
         
            +
             
     | 
| 
      
 6 
     | 
    
         
            +
            from base import Queue, logger
         
     | 
| 
       4 
7 
     | 
    
         
             
            from cobweb import setting
         
     | 
| 
       5 
8 
     | 
    
         | 
| 
       6 
9 
     | 
    
         | 
| 
         @@ -8,11 +11,11 @@ class LoghubDot: 
     | 
|
| 
       8 
11 
     | 
    
         | 
| 
       9 
12 
     | 
    
         
             
                def __init__(self):
         
     | 
| 
       10 
13 
     | 
    
         
             
                    self.client = LogClient(**setting.LOGHUB_CONFIG)
         
     | 
| 
      
 14 
     | 
    
         
            +
                    self.queue = Queue()
         
     | 
| 
       11 
15 
     | 
    
         | 
| 
       12 
16 
     | 
    
         
             
                def build(self, topic, **kwargs):
         
     | 
| 
       13 
17 
     | 
    
         | 
| 
       14 
18 
     | 
    
         
             
                    temp = {}
         
     | 
| 
       15 
     | 
    
         
            -
                    log_items = []
         
     | 
| 
       16 
19 
     | 
    
         
             
                    log_item = LogItem()
         
     | 
| 
       17 
20 
     | 
    
         
             
                    for key, value in kwargs.items():
         
     | 
| 
       18 
21 
     | 
    
         
             
                        if not isinstance(value, str):
         
     | 
| 
         @@ -21,12 +24,37 @@ class LoghubDot: 
     | 
|
| 
       21 
24 
     | 
    
         
             
                            temp[key] = value
         
     | 
| 
       22 
25 
     | 
    
         
             
                    contents = sorted(temp.items())
         
     | 
| 
       23 
26 
     | 
    
         
             
                    log_item.set_contents(contents)
         
     | 
| 
       24 
     | 
    
         
            -
                    log_items.append(log_item)
         
     | 
| 
       25 
     | 
    
         
            -
                    request = PutLogsRequest(
         
     | 
| 
       26 
     | 
    
         
            -
                        project="databee-download-log",
     | 
| 
       27 
     | 
    
         
            -
                        logstore="log",
     | 
| 
       28 
     | 
    
         
            -
                        topic=topic,
     | 
| 
       29 
     | 
    
         
            -
                        logitems=log_items,
     | 
| 
       30 
     | 
    
         
            -
                        compress=True
     | 
| 
       31 
     | 
    
         
            -
                    )
         
     | 
| 
       32 
     | 
    
         
            -
                    self.client.put_logs(request=request)
     | 
| 
      
 27 
     | 
    
         
            +
                    # log_items.append(log_item)
         
     | 
| 
      
 28 
     | 
    
         
            +
                    # request = PutLogsRequest(
         
     | 
| 
      
 29 
     | 
    
         
            +
                    #     project="databee-download-log",
         
     | 
| 
      
 30 
     | 
    
         
            +
                    #     logstore="log",
         
     | 
| 
      
 31 
     | 
    
         
            +
                    #     topic=topic,
         
     | 
| 
      
 32 
     | 
    
         
            +
                    #     logitems=log_items,
         
     | 
| 
      
 33 
     | 
    
         
            +
                    #     compress=True
         
     | 
| 
      
 34 
     | 
    
         
            +
                    # )
         
     | 
| 
      
 35 
     | 
    
         
            +
                    self.queue.push((topic, log_item), direct_insertion=True)
         
     | 
| 
      
 36 
     | 
    
         
            +
                    # self.client.put_logs(request=request)
         
     | 
| 
      
 37 
     | 
    
         
            +
             
     | 
| 
      
 38 
     | 
    
         
            +
                def build_run(self):
         
     | 
| 
      
 39 
     | 
    
         
            +
                    while True:
         
     | 
| 
      
 40 
     | 
    
         
            +
                        if self.queue.length < 1000:
         
     | 
| 
      
 41 
     | 
    
         
            +
                            time.sleep(0.5)
         
     | 
| 
      
 42 
     | 
    
         
            +
                            continue
         
     | 
| 
      
 43 
     | 
    
         
            +
                        try:
         
     | 
| 
      
 44 
     | 
    
         
            +
                            log_item_info = {}
         
     | 
| 
      
 45 
     | 
    
         
            +
                            for _ in range(1000):
         
     | 
| 
      
 46 
     | 
    
         
            +
                                topic, item = self.queue.pop()
         
     | 
| 
      
 47 
     | 
    
         
            +
                                if not item:
         
     | 
| 
      
 48 
     | 
    
         
            +
                                    break
         
     | 
| 
      
 49 
     | 
    
         
            +
                                log_item_info.setdefault(topic, []).append(item)
         
     | 
| 
      
 50 
     | 
    
         
            +
                            for topic, log_items in log_item_info.items():
         
     | 
| 
      
 51 
     | 
    
         
            +
                                request = PutLogsRequest(
         
     | 
| 
      
 52 
     | 
    
         
            +
                                    project="databee-download-log",
         
     | 
| 
      
 53 
     | 
    
         
            +
                                    logstore="log",
         
     | 
| 
      
 54 
     | 
    
         
            +
                                    topic=topic,
         
     | 
| 
      
 55 
     | 
    
         
            +
                                    logitems=log_items,
         
     | 
| 
      
 56 
     | 
    
         
            +
                                    compress=True
         
     | 
| 
      
 57 
     | 
    
         
            +
                                )
         
     | 
| 
      
 58 
     | 
    
         
            +
                                self.client.put_logs(request=request)
         
     | 
| 
      
 59 
     | 
    
         
            +
                        except Exception as e:
         
     | 
| 
      
 60 
     | 
    
         
            +
                            logger.info(str(e))
         
     | 
| 
         @@ -13,7 +13,7 @@ cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406 
     | 
|
| 
       13 
13 
     | 
    
         
             
            cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
         
     | 
| 
       14 
14 
     | 
    
         
             
            cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
         
     | 
| 
       15 
15 
     | 
    
         
             
            cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
         
     | 
| 
       16 
     | 
    
         
            -
            cobweb/crawlers/crawler.py,sha256= 
     | 
| 
      
 16 
     | 
    
         
            +
            cobweb/crawlers/crawler.py,sha256=s5kImH3lzkyRm2AQoH1fwLaIO_CPZlPhqiBknPNaglM,8676
         
     | 
| 
       17 
17 
     | 
    
         
             
            cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
         
     | 
| 
       18 
18 
     | 
    
         
             
            cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
         
     | 
| 
       19 
19 
     | 
    
         
             
            cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
         
     | 
| 
         @@ -34,11 +34,11 @@ cobweb/schedulers/scheduler_api.py,sha256=pFEdS1H4zuzxwMhCV-G7CoLz-rEOPv4EVo3xZU 
     | 
|
| 
       34 
34 
     | 
    
         
             
            cobweb/schedulers/scheduler_redis.py,sha256=E5fjc3nNld8GbUhUGT7uY4smRejj2J2ZIzp2g6lhxFM,2205
         
     | 
| 
       35 
35 
     | 
    
         
             
            cobweb/utils/__init__.py,sha256=Ev2LZZ1-S56iQYDqFZrqadizEv4Gk8Of-DraH-_WnKY,109
         
     | 
| 
       36 
36 
     | 
    
         
             
            cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
         
     | 
| 
       37 
     | 
    
         
            -
            cobweb/utils/dotting.py,sha256= 
     | 
| 
      
 37 
     | 
    
         
            +
            cobweb/utils/dotting.py,sha256=vxK44tq_eD1uIYrkMzesUF43ZgFiu1zay3WZGTSBSx4,1898
         
     | 
| 
       38 
38 
     | 
    
         
             
            cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
         
     | 
| 
       39 
39 
     | 
    
         
             
            cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
         
     | 
| 
       40 
     | 
    
         
            -
            cobweb_launcher-1.2. 
     | 
| 
       41 
     | 
    
         
            -
            cobweb_launcher-1.2. 
     | 
| 
       42 
     | 
    
         
            -
            cobweb_launcher-1.2. 
     | 
| 
       43 
     | 
    
         
            -
            cobweb_launcher-1.2. 
     | 
| 
       44 
     | 
    
         
            -
            cobweb_launcher-1.2. 
     | 
| 
      
 40 
     | 
    
         
            +
            cobweb_launcher-1.2.45.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
         
     | 
| 
      
 41 
     | 
    
         
            +
            cobweb_launcher-1.2.45.dist-info/METADATA,sha256=T46vEQNn9zyk9trC1O10bT5YBi7rNOh7M11ErhYsJ_0,6510
         
     | 
| 
      
 42 
     | 
    
         
            +
            cobweb_launcher-1.2.45.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
         
     | 
| 
      
 43 
     | 
    
         
            +
            cobweb_launcher-1.2.45.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
         
     | 
| 
      
 44 
     | 
    
         
            +
            cobweb_launcher-1.2.45.dist-info/RECORD,,
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     | 
| 
         
            File without changes
         
     |