cobweb-launcher 1.2.43__py3-none-any.whl → 1.2.45__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
1
  import json
2
+ import os
2
3
  import threading
3
4
  import time
4
5
  import traceback
@@ -6,9 +7,7 @@ from inspect import isgenerator
6
7
  from typing import Union, Callable, Mapping
7
8
  from urllib.parse import urlparse
8
9
 
9
- import urllib3
10
- from requests import HTTPError, Response as Res
11
- from requests.exceptions import ChunkedEncodingError
10
+ from requests import Response as Res
12
11
 
13
12
  from cobweb.constant import DealModel, LogTemplate
14
13
  from cobweb.base import (
@@ -20,6 +19,7 @@ from cobweb.base import (
20
19
  logger
21
20
  )
22
21
  from cobweb.utils import LoghubDot
22
+ proxy_type = os.getenv("PROXY_TYPE", "")
23
23
 
24
24
 
25
25
  class Crawler(threading.Thread):
@@ -149,7 +149,7 @@ class Crawler(threading.Thread):
149
149
  topic=urlparse(download_item.response.request.url).netloc,
150
150
  data_size=int(download_item.response.headers.get("content-length", 0)),
151
151
  cost_time=end_time - start_time, status = 200,
152
- url=download_item.response.url,
152
+ url=download_item.response.url, proxy_type=proxy_type,
153
153
  )
154
154
  parse_iterators = self.parse(download_item)
155
155
  if not isgenerator(parse_iterators):
@@ -169,7 +169,7 @@ class Crawler(threading.Thread):
169
169
  except Exception as e:
170
170
  exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
171
171
  url = seed.url
172
- status = str(e)
172
+ status = e.__class__.__name__
173
173
  if getattr(e, "response", None) and isinstance(e.response, Res):
174
174
  url = e.response.request.url
175
175
  status = e.response.status_code
@@ -177,7 +177,8 @@ class Crawler(threading.Thread):
177
177
  topic=urlparse(url).netloc,
178
178
  data_size=-1, cost_time=-1,
179
179
  status=status, url=url,
180
- msg=exception_msg
180
+ proxy_type=proxy_type,
181
+ msg=exception_msg,
181
182
  )
182
183
  logger.info(LogTemplate.download_exception.format(
183
184
  detail=seed_detail_log_info,
@@ -208,6 +209,7 @@ class Crawler(threading.Thread):
208
209
  logger.info("spider thread close")
209
210
 
210
211
  def run(self):
212
+ threading.Thread(name="loghub_dot", target=self.loghub_dot.build_run).start()
211
213
  for index in range(self.thread_num):
212
214
  threading.Thread(name=f"spider_{index}", target=self.spider).start()
213
215
 
cobweb/utils/dotting.py CHANGED
@@ -1,6 +1,9 @@
1
1
  import json
2
+ import time
2
3
 
3
4
  from aliyun.log import LogClient, LogItem, PutLogsRequest
5
+
6
+ from base import Queue, logger
4
7
  from cobweb import setting
5
8
 
6
9
 
@@ -8,11 +11,11 @@ class LoghubDot:
8
11
 
9
12
  def __init__(self):
10
13
  self.client = LogClient(**setting.LOGHUB_CONFIG)
14
+ self.queue = Queue()
11
15
 
12
16
  def build(self, topic, **kwargs):
13
17
 
14
18
  temp = {}
15
- log_items = []
16
19
  log_item = LogItem()
17
20
  for key, value in kwargs.items():
18
21
  if not isinstance(value, str):
@@ -21,12 +24,37 @@ class LoghubDot:
21
24
  temp[key] = value
22
25
  contents = sorted(temp.items())
23
26
  log_item.set_contents(contents)
24
- log_items.append(log_item)
25
- request = PutLogsRequest(
26
- project="databee-download-log",
27
- logstore="log",
28
- topic=topic,
29
- logitems=log_items,
30
- compress=True
31
- )
32
- self.client.put_logs(request=request)
27
+ # log_items.append(log_item)
28
+ # request = PutLogsRequest(
29
+ # project="databee-download-log",
30
+ # logstore="log",
31
+ # topic=topic,
32
+ # logitems=log_items,
33
+ # compress=True
34
+ # )
35
+ self.queue.push((topic, log_item), direct_insertion=True)
36
+ # self.client.put_logs(request=request)
37
+
38
+ def build_run(self):
39
+ while True:
40
+ if self.queue.length < 1000:
41
+ time.sleep(0.5)
42
+ continue
43
+ try:
44
+ log_item_info = {}
45
+ for _ in range(1000):
46
+ topic, item = self.queue.pop()
47
+ if not item:
48
+ break
49
+ log_item_info.setdefault(topic, []).append(item)
50
+ for topic, log_items in log_item_info.items():
51
+ request = PutLogsRequest(
52
+ project="databee-download-log",
53
+ logstore="log",
54
+ topic=topic,
55
+ logitems=log_items,
56
+ compress=True
57
+ )
58
+ self.client.put_logs(request=request)
59
+ except Exception as e:
60
+ logger.info(str(e))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.43
3
+ Version: 1.2.45
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -13,7 +13,7 @@ cobweb/base/response.py,sha256=eB1DWMXFCpn3cJ3yzgCRU1WeZAdayGDohRgdjdMUFN4,406
13
13
  cobweb/base/seed.py,sha256=Uz_VBRlAxNYQcFHk3tsZFMlU96yPOedHaWGTvk-zKd8,2908
14
14
  cobweb/crawlers/__init__.py,sha256=msvkB9mTpsgyj8JfNMsmwAcpy5kWk_2NrO1Adw2Hkw0,29
15
15
  cobweb/crawlers/base_crawler.py,sha256=ee_WSDnPQpPTk6wlFuY2UEx5L3hcsAZFcr6i3GLSry8,5751
16
- cobweb/crawlers/crawler.py,sha256=pEukp5tC-axkzmcagPIpWPgmpxP0NHC1eu8iyJDFegA,8537
16
+ cobweb/crawlers/crawler.py,sha256=s5kImH3lzkyRm2AQoH1fwLaIO_CPZlPhqiBknPNaglM,8676
17
17
  cobweb/crawlers/file_crawler.py,sha256=2Sjbdgxzqd41WykKUQE3QQlGai3T8k-pmHNmPlTchjQ,4454
18
18
  cobweb/db/__init__.py,sha256=uZwSkd105EAwYo95oZQXAfofUKHVIAZZIPpNMy-hm2Q,56
19
19
  cobweb/db/api_db.py,sha256=bDc5dJQxq4z04h70KUTHd0OqUOEY7Cm3wcNJZtTvJIM,3015
@@ -34,11 +34,11 @@ cobweb/schedulers/scheduler_api.py,sha256=pFEdS1H4zuzxwMhCV-G7CoLz-rEOPv4EVo3xZU
34
34
  cobweb/schedulers/scheduler_redis.py,sha256=E5fjc3nNld8GbUhUGT7uY4smRejj2J2ZIzp2g6lhxFM,2205
35
35
  cobweb/utils/__init__.py,sha256=Ev2LZZ1-S56iQYDqFZrqadizEv4Gk8Of-DraH-_WnKY,109
36
36
  cobweb/utils/bloom.py,sha256=vng-YbKgh9HbtpAWYf_nkUSbfVTOj40aqUUejRYlsCU,1752
37
- cobweb/utils/dotting.py,sha256=PgsWdM-724Jy-MZWUsaygNWV-huqLMmdLgop7gaBxlo,872
37
+ cobweb/utils/dotting.py,sha256=vxK44tq_eD1uIYrkMzesUF43ZgFiu1zay3WZGTSBSx4,1898
38
38
  cobweb/utils/oss.py,sha256=gyt8-UB07tVphZLQXMOf-JTJwU-mWq8KZkOXKkAf3uk,3513
39
39
  cobweb/utils/tools.py,sha256=5JEaaAwYoV9Sdla2UBIJn6faUBuXmxUMagm9ck6FVqs,1253
40
- cobweb_launcher-1.2.43.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
41
- cobweb_launcher-1.2.43.dist-info/METADATA,sha256=GcKa3nUwsRKVxxoe2lKqHylsTYtXHxbveUMAizWtdJc,6510
42
- cobweb_launcher-1.2.43.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
43
- cobweb_launcher-1.2.43.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
44
- cobweb_launcher-1.2.43.dist-info/RECORD,,
40
+ cobweb_launcher-1.2.45.dist-info/LICENSE,sha256=z1rxSIGOyzcSb3orZxFPxzx-0C1vTocmswqBNxpKfEk,1063
41
+ cobweb_launcher-1.2.45.dist-info/METADATA,sha256=T46vEQNn9zyk9trC1O10bT5YBi7rNOh7M11ErhYsJ_0,6510
42
+ cobweb_launcher-1.2.45.dist-info/WHEEL,sha256=ewwEueio1C2XeHTvT17n8dZUJgOvyCWCt0WVNLClP9o,92
43
+ cobweb_launcher-1.2.45.dist-info/top_level.txt,sha256=4GETBGNsKqiCUezmT-mJn7tjhcDlu7nLIV5gGgHBW4I,7
44
+ cobweb_launcher-1.2.45.dist-info/RECORD,,