cobweb-launcher 1.2.43__tar.gz → 1.2.45__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (46)
  1. {cobweb-launcher-1.2.43/cobweb_launcher.egg-info → cobweb-launcher-1.2.45}/PKG-INFO +1 -1
  2. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/crawlers/crawler.py +8 -6
  3. cobweb-launcher-1.2.45/cobweb/utils/dotting.py +60 -0
  4. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45/cobweb_launcher.egg-info}/PKG-INFO +1 -1
  5. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/setup.py +1 -1
  6. cobweb-launcher-1.2.43/cobweb/utils/dotting.py +0 -32
  7. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/LICENSE +0 -0
  8. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/README.md +0 -0
  9. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/__init__.py +0 -0
  10. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/base/__init__.py +0 -0
  11. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/base/common_queue.py +0 -0
  12. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/base/decorators.py +0 -0
  13. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/base/item.py +0 -0
  14. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/base/log.py +0 -0
  15. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/base/request.py +0 -0
  16. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/base/response.py +0 -0
  17. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/base/seed.py +0 -0
  18. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/constant.py +0 -0
  19. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/crawlers/__init__.py +0 -0
  20. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/crawlers/base_crawler.py +0 -0
  21. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/crawlers/file_crawler.py +0 -0
  22. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/db/__init__.py +0 -0
  23. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/db/api_db.py +0 -0
  24. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/db/redis_db.py +0 -0
  25. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/exceptions/__init__.py +0 -0
  26. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/exceptions/oss_db_exception.py +0 -0
  27. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/launchers/__init__.py +0 -0
  28. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/launchers/launcher.py +0 -0
  29. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/launchers/launcher_air.py +0 -0
  30. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/launchers/launcher_api.py +0 -0
  31. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/launchers/launcher_pro.py +0 -0
  32. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/pipelines/__init__.py +0 -0
  33. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/pipelines/pipeline.py +0 -0
  34. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/pipelines/pipeline_console.py +0 -0
  35. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/pipelines/pipeline_loghub.py +0 -0
  36. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/setting.py +0 -0
  37. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/utils/__init__.py +0 -0
  38. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/utils/bloom.py +0 -0
  39. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/utils/oss.py +0 -0
  40. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb/utils/tools.py +0 -0
  41. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/SOURCES.txt +0 -0
  42. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/dependency_links.txt +0 -0
  43. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/requires.txt +0 -0
  44. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/cobweb_launcher.egg-info/top_level.txt +0 -0
  45. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/setup.cfg +0 -0
  46. {cobweb-launcher-1.2.43 → cobweb-launcher-1.2.45}/test/test.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.43
3
+ Version: 1.2.45
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -1,4 +1,5 @@
1
1
  import json
2
+ import os
2
3
  import threading
3
4
  import time
4
5
  import traceback
@@ -6,9 +7,7 @@ from inspect import isgenerator
6
7
  from typing import Union, Callable, Mapping
7
8
  from urllib.parse import urlparse
8
9
 
9
- import urllib3
10
- from requests import HTTPError, Response as Res
11
- from requests.exceptions import ChunkedEncodingError
10
+ from requests import Response as Res
12
11
 
13
12
  from cobweb.constant import DealModel, LogTemplate
14
13
  from cobweb.base import (
@@ -20,6 +19,7 @@ from cobweb.base import (
20
19
  logger
21
20
  )
22
21
  from cobweb.utils import LoghubDot
22
+ proxy_type = os.getenv("PROXY_TYPE", "")
23
23
 
24
24
 
25
25
  class Crawler(threading.Thread):
@@ -149,7 +149,7 @@ class Crawler(threading.Thread):
149
149
  topic=urlparse(download_item.response.request.url).netloc,
150
150
  data_size=int(download_item.response.headers.get("content-length", 0)),
151
151
  cost_time=end_time - start_time, status = 200,
152
- url=download_item.response.url,
152
+ url=download_item.response.url, proxy_type=proxy_type,
153
153
  )
154
154
  parse_iterators = self.parse(download_item)
155
155
  if not isgenerator(parse_iterators):
@@ -169,7 +169,7 @@ class Crawler(threading.Thread):
169
169
  except Exception as e:
170
170
  exception_msg = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
171
171
  url = seed.url
172
- status = str(e)
172
+ status = e.__class__.__name__
173
173
  if getattr(e, "response", None) and isinstance(e.response, Res):
174
174
  url = e.response.request.url
175
175
  status = e.response.status_code
@@ -177,7 +177,8 @@ class Crawler(threading.Thread):
177
177
  topic=urlparse(url).netloc,
178
178
  data_size=-1, cost_time=-1,
179
179
  status=status, url=url,
180
- msg=exception_msg
180
+ proxy_type=proxy_type,
181
+ msg=exception_msg,
181
182
  )
182
183
  logger.info(LogTemplate.download_exception.format(
183
184
  detail=seed_detail_log_info,
@@ -208,6 +209,7 @@ class Crawler(threading.Thread):
208
209
  logger.info("spider thread close")
209
210
 
210
211
  def run(self):
212
+ threading.Thread(name="loghub_dot", target=self.loghub_dot.build_run).start()
211
213
  for index in range(self.thread_num):
212
214
  threading.Thread(name=f"spider_{index}", target=self.spider).start()
213
215
 
@@ -0,0 +1,60 @@
1
+ import json
2
+ import time
3
+
4
+ from aliyun.log import LogClient, LogItem, PutLogsRequest
5
+
6
+ from base import Queue, logger
7
+ from cobweb import setting
8
+
9
+
10
+ class LoghubDot:
11
+
12
+ def __init__(self):
13
+ self.client = LogClient(**setting.LOGHUB_CONFIG)
14
+ self.queue = Queue()
15
+
16
+ def build(self, topic, **kwargs):
17
+
18
+ temp = {}
19
+ log_item = LogItem()
20
+ for key, value in kwargs.items():
21
+ if not isinstance(value, str):
22
+ temp[key] = json.dumps(value, ensure_ascii=False)
23
+ else:
24
+ temp[key] = value
25
+ contents = sorted(temp.items())
26
+ log_item.set_contents(contents)
27
+ # log_items.append(log_item)
28
+ # request = PutLogsRequest(
29
+ # project="databee-download-log",
30
+ # logstore="log",
31
+ # topic=topic,
32
+ # logitems=log_items,
33
+ # compress=True
34
+ # )
35
+ self.queue.push((topic, log_item), direct_insertion=True)
36
+ # self.client.put_logs(request=request)
37
+
38
+ def build_run(self):
39
+ while True:
40
+ if self.queue.length < 1000:
41
+ time.sleep(0.5)
42
+ continue
43
+ try:
44
+ log_item_info = {}
45
+ for _ in range(1000):
46
+ topic, item = self.queue.pop()
47
+ if not item:
48
+ break
49
+ log_item_info.setdefault(topic, []).append(item)
50
+ for topic, log_items in log_item_info.items():
51
+ request = PutLogsRequest(
52
+ project="databee-download-log",
53
+ logstore="log",
54
+ topic=topic,
55
+ logitems=log_items,
56
+ compress=True
57
+ )
58
+ self.client.put_logs(request=request)
59
+ except Exception as e:
60
+ logger.info(str(e))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: cobweb-launcher
3
- Version: 1.2.43
3
+ Version: 1.2.45
4
4
  Summary: spider_hole
5
5
  Home-page: https://github.com/Juannie-PP/cobweb
6
6
  Author: Juannie-PP
@@ -5,7 +5,7 @@ with open("README.md", "r", encoding="utf-8") as fh:
5
5
 
6
6
  setup(
7
7
  name="cobweb-launcher",
8
- version="1.2.43",
8
+ version="1.2.45",
9
9
  packages=find_packages(),
10
10
  url="https://github.com/Juannie-PP/cobweb",
11
11
  license="MIT",
@@ -1,32 +0,0 @@
1
- import json
2
-
3
- from aliyun.log import LogClient, LogItem, PutLogsRequest
4
- from cobweb import setting
5
-
6
-
7
- class LoghubDot:
8
-
9
- def __init__(self):
10
- self.client = LogClient(**setting.LOGHUB_CONFIG)
11
-
12
- def build(self, topic, **kwargs):
13
-
14
- temp = {}
15
- log_items = []
16
- log_item = LogItem()
17
- for key, value in kwargs.items():
18
- if not isinstance(value, str):
19
- temp[key] = json.dumps(value, ensure_ascii=False)
20
- else:
21
- temp[key] = value
22
- contents = sorted(temp.items())
23
- log_item.set_contents(contents)
24
- log_items.append(log_item)
25
- request = PutLogsRequest(
26
- project="databee-download-log",
27
- logstore="log",
28
- topic=topic,
29
- logitems=log_items,
30
- compress=True
31
- )
32
- self.client.put_logs(request=request)