crawlo-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo may be problematic.
Files changed (59)
  1. crawlo/__init__.py +5 -0
  2. crawlo/__version__.py +2 -0
  3. crawlo/core/__init__.py +2 -0
  4. crawlo/core/engine.py +157 -0
  5. crawlo/core/processor.py +40 -0
  6. crawlo/core/scheduler.py +35 -0
  7. crawlo/crawler.py +107 -0
  8. crawlo/downloader/__init__.py +78 -0
  9. crawlo/downloader/aiohttp_downloader.py +96 -0
  10. crawlo/downloader/httpx_downloader.py +48 -0
  11. crawlo/event.py +11 -0
  12. crawlo/exceptions.py +64 -0
  13. crawlo/extension/__init__.py +31 -0
  14. crawlo/extension/log_interval.py +49 -0
  15. crawlo/extension/log_stats.py +44 -0
  16. crawlo/items/__init__.py +24 -0
  17. crawlo/items/items.py +88 -0
  18. crawlo/middleware/__init__.py +21 -0
  19. crawlo/middleware/default_header.py +32 -0
  20. crawlo/middleware/download_delay.py +28 -0
  21. crawlo/middleware/middleware_manager.py +140 -0
  22. crawlo/middleware/request_ignore.py +30 -0
  23. crawlo/middleware/response_code.py +19 -0
  24. crawlo/middleware/response_filter.py +26 -0
  25. crawlo/middleware/retry.py +84 -0
  26. crawlo/network/__init__.py +7 -0
  27. crawlo/network/request.py +52 -0
  28. crawlo/network/response.py +93 -0
  29. crawlo/pipelines/__init__.py +13 -0
  30. crawlo/pipelines/console_pipeline.py +20 -0
  31. crawlo/pipelines/mongo_pipeline.py +5 -0
  32. crawlo/pipelines/mysql_pipeline.py +5 -0
  33. crawlo/pipelines/pipeline_manager.py +56 -0
  34. crawlo/settings/__init__.py +7 -0
  35. crawlo/settings/default_settings.py +39 -0
  36. crawlo/settings/setting_manager.py +100 -0
  37. crawlo/spider/__init__.py +36 -0
  38. crawlo/stats_collector.py +47 -0
  39. crawlo/subscriber.py +27 -0
  40. crawlo/task_manager.py +27 -0
  41. crawlo/templates/item_template.tmpl +22 -0
  42. crawlo/templates/project_template/items/__init__.py +0 -0
  43. crawlo/templates/project_template/main.py +33 -0
  44. crawlo/templates/project_template/setting.py +190 -0
  45. crawlo/templates/project_template/spiders/__init__.py +0 -0
  46. crawlo/templates/spider_template.tmpl +31 -0
  47. crawlo/utils/__init__.py +7 -0
  48. crawlo/utils/date_tools.py +20 -0
  49. crawlo/utils/func_tools.py +22 -0
  50. crawlo/utils/log.py +39 -0
  51. crawlo/utils/pqueue.py +16 -0
  52. crawlo/utils/project.py +58 -0
  53. crawlo/utils/system.py +11 -0
  54. crawlo-1.0.0.dist-info/METADATA +36 -0
  55. crawlo-1.0.0.dist-info/RECORD +59 -0
  56. crawlo-1.0.0.dist-info/WHEEL +5 -0
  57. crawlo-1.0.0.dist-info/entry_points.txt +2 -0
  58. crawlo-1.0.0.dist-info/licenses/LICENSE +23 -0
  59. crawlo-1.0.0.dist-info/top_level.txt +1 -0
crawlo/network/response.py ADDED
@@ -0,0 +1,93 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import re
+ import ujson
+ from typing import Dict
+ from parsel import Selector
+ from http.cookies import SimpleCookie
+ from urllib.parse import urljoin as _urljoin
+
+ from crawlo import Request
+ from crawlo.exceptions import DecodeError
+
+
+ class Response(object):
+
+     def __init__(
+         self,
+         url: str,
+         *,
+         headers: Dict,
+         body: bytes = b"",
+         method: str = 'GET',
+         request: Request = None,
+         status_code: int = 200,
+     ):
+         self.url = url
+         self.headers = headers
+         self.body = body
+         self.method = method
+         self.request = request
+         self.status_code = status_code
+         self.encoding = request.encoding
+         self._selector = None
+         self._text_cache = None
+
+     @property
+     def text(self):
+         # return the cached decoded text if it is already available
+         if self._text_cache:
+             return self._text_cache
+         try:
+             self._text_cache = self.body.decode(self.encoding)
+         except UnicodeDecodeError:
+             try:
+                 _encoding_re = re.compile(r"charset=([\w-]+)", flags=re.I)
+                 _encoding_string = self.headers.get('Content-Type', '') or self.headers.get('content-type', '')
+                 _encoding = _encoding_re.search(_encoding_string)
+                 if _encoding:
+                     _encoding = _encoding.group(1)
+                     self._text_cache = self.body.decode(_encoding)
+                 else:
+                     raise DecodeError(f"{self.request} {self.request.encoding} error.")
+             except UnicodeDecodeError as exp:
+                 raise UnicodeDecodeError(
+                     exp.encoding, exp.object, exp.start, exp.end, f"{self.request} error."
+                 )
+         return self._text_cache
+
+     def json(self):
+         return ujson.loads(self.text)
+
+     def urljoin(self, url):
+         return _urljoin(self.url, url)
+
+     def xpath(self, xpath_str):
+         if self._selector is None:
+             self._selector = Selector(self.text)
+         return self._selector.xpath(xpath_str)
+
+     def css(self, css_str):
+         if self._selector is None:
+             self._selector = Selector(self.text)
+         return self._selector.css(css_str)
+
+     def re_search(self, pattern, flags=re.DOTALL):
+         return re.search(pattern, self.text, flags=flags)
+
+     def re_findall(self, pattern, flags=re.DOTALL):
+         return re.findall(pattern, self.text, flags=flags)
+
+     def get_cookies(self):
+         cookie_headers = self.headers.getlist('Set-Cookie') or []
+         cookies = SimpleCookie()
+         for header in cookie_headers:
+             cookies.load(header)
+         return {k: v.value for k, v in cookies.items()}
+
+     @property
+     def meta(self):
+         return self.request.meta
+
+     def __str__(self):
+         return f"{self.url} {self.status_code} {self.request.encoding} "
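To illustrate how the charset fallback in Response.text behaves, here is a minimal, hypothetical sketch (not part of the release). It stands in a SimpleNamespace for crawlo.Request, assuming only that the constructor reads .encoding from it, which is all __init__ above does.

from types import SimpleNamespace
from crawlo.network.response import Response

fake_request = SimpleNamespace(encoding="utf-8", meta={})   # stand-in for crawlo.Request
resp = Response(
    url="https://example.com/",
    headers={"Content-Type": "text/html; charset=gbk"},
    body="你好".encode("gbk"),     # not valid UTF-8, so the first decode fails
    request=fake_request,
)
print(resp.text)            # decoded via the charset= value found in Content-Type
print(resp.urljoin("/a"))   # -> https://example.com/a
# note: get_cookies() expects a multidict-style headers object exposing
# getlist('Set-Cookie'), so it would fail with the plain dict used here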
crawlo/pipelines/__init__.py ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from crawlo.items.items import Item
+
+
+ class BasePipeline:
+
+     def process_item(self, item: Item, spider):
+         raise NotImplementedError
+
+     @classmethod
+     def create_instance(cls, crawler):
+         return cls()
crawlo/pipelines/console_pipeline.py ADDED
@@ -0,0 +1,20 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from crawlo import Item
+ from crawlo.spider import Spider
+ from crawlo.utils.log import get_logger
+
+
+ class DebugPipeline:
+
+     def __init__(self, logger):
+         self.logger = logger
+
+     @classmethod
+     def create_instance(cls, crawler):
+         logger = get_logger(cls.__name__, crawler.settings.get('LOG_LEVEL'))
+         return cls(logger)
+
+     async def process_item(self, item: Item, spider: Spider) -> Item:
+         self.logger.debug(item.to_dict())
+         return item
crawlo/pipelines/mongo_pipeline.py ADDED
@@ -0,0 +1,5 @@
+ # -*- coding: utf-8 -*-
+
+
+ class MongoPipeline(object):
+     pass
crawlo/pipelines/mysql_pipeline.py ADDED
@@ -0,0 +1,5 @@
+ # -*- coding: utf-8 -*-
+
+
+ class MySQLPipeline(object):
+     pass
crawlo/pipelines/pipeline_manager.py ADDED
@@ -0,0 +1,56 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from typing import List
+ from pprint import pformat
+ from asyncio import create_task
+
+ from crawlo.utils.log import get_logger
+ from crawlo.event import item_successful, item_discard
+ from crawlo.utils.project import load_class, common_call
+ from crawlo.exceptions import PipelineInitError, ItemDiscard, InvalidOutputError
+
+
+ class PipelineManager:
+
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self.pipelines: List = []
+         self.methods: List = []
+
+         self.logger = get_logger(self.__class__.__name__, self.crawler.settings.get('LOG_LEVEL'))
+         pipelines = self.crawler.settings.get_list('PIPELINES')
+         self._add_pipelines(pipelines)
+         self._add_methods()
+
+     @classmethod
+     def create_instance(cls, *args, **kwargs):
+         o = cls(*args, **kwargs)
+         return o
+
+     def _add_pipelines(self, pipelines):
+         for pipeline in pipelines:
+             pipeline_cls = load_class(pipeline)
+             if not hasattr(pipeline_cls, 'create_instance'):
+                 raise PipelineInitError(
+                     "Pipeline init failed: a pipeline must inherit from `BasePipeline` or define a `create_instance` method"
+                 )
+             self.pipelines.append(pipeline_cls.create_instance(self.crawler))
+         if pipelines:
+             self.logger.info(f"enabled pipelines: \n {pformat(pipelines)}")
+
+     def _add_methods(self):
+         for pipeline in self.pipelines:
+             if hasattr(pipeline, 'process_item'):
+                 self.methods.append(pipeline.process_item)
+
+     async def process_item(self, item):
+         try:
+             for method in self.methods:
+                 item = await common_call(method, item, self.crawler.spider)
+                 if item is None:
+                     raise InvalidOutputError(f"{method.__qualname__} returned None, which is not supported.")
+         except ItemDiscard as exc:
+             create_task(self.crawler.subscriber.notify(item_discard, item, exc, self.crawler.spider))
+         else:
+             create_task(self.crawler.subscriber.notify(item_successful, item, self.crawler.spider))
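As a usage note, a custom pipeline only needs the shape _add_pipelines checks for: a create_instance(crawler) classmethod (inherited from BasePipeline) and a process_item that returns the item or raises ItemDiscard. A hedged sketch follows; the "title" field, the ItemDiscard message, and the settings module path are hypothetical, and it assumes Item exposes to_dict() as DebugPipeline above does.

from crawlo.exceptions import ItemDiscard
from crawlo.pipelines import BasePipeline


class DropEmptyTitlePipeline(BasePipeline):

    async def process_item(self, item, spider):
        if not item.to_dict().get("title"):       # hypothetical field check
            raise ItemDiscard("empty title")      # routed to the item_discard event
        return item                               # returning None would raise InvalidOutputError

# in the project's settings (hypothetical module path):
# PIPELINES = ["my_project.pipelines.DropEmptyTitlePipeline"]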
crawlo/settings/__init__.py ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time   : 2025-05-11 11:08
+ # @Author : oscar
+ # @Desc   : None
+ """
crawlo/settings/default_settings.py ADDED
@@ -0,0 +1,39 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+
+ VERSION = 1.0
+
+ # number of concurrent requests
+ CONCURRENCY = 8
+
+ # download timeout (seconds)
+ DOWNLOAD_TIMEOUT = 60
+
+ INTERVAL = 5
+
+ # --------------------------------------------------- delay ---------------------------------------------------
+ # download delay, disabled by default
+ DOWNLOAD_DELAY = 0
+ # random delay multiplier range
+ RANDOM_RANGE = (0.75, 1.25)
+ # whether to randomize the delay
+ RANDOMNESS = True
+
+ # --------------------------------------------------- retry ---------------------------------------------------
+ MAX_RETRY_TIMES = 2
+ IGNORE_HTTP_CODES = [403, 404]
+ RETRY_HTTP_CODES = [408, 429, 500, 502, 503, 504, 522, 524]
+ # status codes that are always allowed through
+ ALLOWED_CODES = []
+
+ STATS_DUMP = True
+ # SSL verification
+ VERIFY_SSL = True
+ # whether to reuse a single session
+ USE_SESSION = True
+ # log level
+ LOG_LEVEL = 'DEBUG'
+ # downloader class to use
+ DOWNLOADER = "crawlo.downloader.aiohttp_downloader.AioHttpDownloader"  # alternatively: HttpXDownloader
+
+ EXTENSIONS = []
crawlo/settings/setting_manager.py ADDED
@@ -0,0 +1,100 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ import json
+ from copy import deepcopy
+ from importlib import import_module
+ from collections.abc import MutableMapping
+
+ from crawlo.settings import default_settings
+
+
+ class SettingManager(MutableMapping):
+
+     def __init__(self, values=None):
+         self.attributes = {}
+         self.set_settings(default_settings)
+         self.update_attributes(values)
+
+     def get(self, key, default=None):
+         """Get a value safely without triggering recursion."""
+         value = self.attributes.get(key, default)
+         return value if value is not None else default
+
+     def get_int(self, key, default=0):
+         return int(self.get(key, default=default))
+
+     def get_float(self, key, default=0.0):
+         return float(self.get(key, default=default))
+
+     def get_bool(self, key, default=False):
+         got = self.get(key, default=default)
+         if isinstance(got, bool):
+             return got
+         if isinstance(got, (int, float)):
+             return bool(got)
+         got_lower = str(got).strip().lower()
+         if got_lower in ('1', 'true'):
+             return True
+         if got_lower in ('0', 'false'):
+             return False
+         raise ValueError(
+             f"Unsupported value for boolean setting: {got}. "
+             "Supported values are: 0/1, True/False, '0'/'1', 'True'/'False' (case-insensitive)."
+         )
+
+     def get_list(self, key, default=None):
+         values = self.get(key, default or [])
+         if isinstance(values, str):
+             return [v.strip() for v in values.split(',') if v.strip()]
+         try:
+             return list(values)
+         except TypeError:
+             return [values]
+
+     def get_dict(self, key, default=None):
+         value = self.get(key, default or {})
+         if isinstance(value, str):
+             value = json.loads(value)
+         try:
+             return dict(value)
+         except TypeError:
+             return value
+
+     def set(self, key, value):
+         self.attributes[key] = value
+
+     def set_settings(self, module):
+         if isinstance(module, str):
+             module = import_module(module)
+         for key in dir(module):
+             if key.isupper():
+                 self.set(key, getattr(module, key))
+
+     # methods required by MutableMapping
+     def __getitem__(self, item):
+         return self.attributes[item]
+
+     def __setitem__(self, key, value):
+         self.set(key, value)
+
+     def __delitem__(self, key):
+         del self.attributes[key]
+
+     def __iter__(self):
+         return iter(self.attributes)
+
+     def __len__(self):
+         return len(self.attributes)
+
+     def __str__(self):
+         return f'<Settings: {self.attributes}>'
+
+     __repr__ = __str__
+
+     def update_attributes(self, attributes):
+         if attributes is not None:
+             for key, value in attributes.items():
+                 self.set(key, value)
+
+     def copy(self):
+         return deepcopy(self)
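A short, hypothetical usage sketch of the class above: defaults from default_settings are loaded first, values passed to the constructor override them, and the typed getters coerce strings.

from crawlo.settings.setting_manager import SettingManager

settings = SettingManager({"CONCURRENCY": "16", "VERIFY_SSL": "false"})
print(settings.get_int("CONCURRENCY"))        # 16, coerced from the string
print(settings.get_bool("VERIFY_SSL"))        # False, 'false' is accepted
print(settings.get_list("RETRY_HTTP_CODES"))  # list taken from default_settings
settings["LOG_LEVEL"] = "INFO"                # MutableMapping-style assignment
print(settings.get("LOG_LEVEL"))              # 'INFO'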
crawlo/spider/__init__.py ADDED
@@ -0,0 +1,36 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ from crawlo import Request
+
+
+ class Spider(object):
+     def __init__(self):
+         if not hasattr(self, 'start_urls'):
+             self.start_urls = []
+         self.crawler = None
+
+     @classmethod
+     def create_instance(cls, crawler):
+         o = cls()
+         o.crawler = crawler
+         return o
+
+     def start_requests(self):
+         if self.start_urls:
+             for url in self.start_urls:
+                 yield Request(url=url)
+         else:
+             if hasattr(self, 'start_url') and isinstance(getattr(self, 'start_url'), str):
+                 yield Request(getattr(self, 'start_url'))
+
+     def parse(self, response):
+         raise NotImplementedError
+
+     async def spider_opened(self):
+         pass
+
+     async def spider_closed(self):
+         pass
+
+     def __str__(self):
+         return self.__class__.__name__
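For context, a minimal subclass sketch (hypothetical, not part of the release): start_urls feeds start_requests(), and parse() must be overridden. That parse() may yield further Request objects follows the usual pattern for this kind of engine and is an assumption, not something this file alone confirms.

from crawlo import Request
from crawlo.spider import Spider


class QuotesSpider(Spider):
    start_urls = ["https://quotes.toscrape.com/"]   # hypothetical target site

    def parse(self, response):
        # response.css()/urljoin() come from the Response class shown earlier
        for href in response.css("a::attr(href)").getall():
            yield Request(url=response.urljoin(href))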
crawlo/stats_collector.py ADDED
@@ -0,0 +1,47 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ """
+ # @Time   : 2025-05-17 09:57
+ # @Author : crawl-coder
+ # @Desc   : stats collector
+ """
+ from pprint import pformat
+ from crawlo.utils.log import get_logger
+
+
+ class StatsCollector(object):
+
+     def __init__(self, crawler):
+         self.crawler = crawler
+         self._dump = self.crawler.settings.get_bool('STATS_DUMP')
+         self._stats = {}
+         self.logger = get_logger(self.__class__.__name__, "INFO")
+
+     def inc_value(self, key, count=1, start=0):
+         self._stats[key] = self._stats.setdefault(key, start) + count
+
+     def get_value(self, key, default=None):
+         return self._stats.get(key, default)
+
+     def get_stats(self):
+         return self._stats
+
+     def set_stats(self, stats):
+         self._stats = stats
+
+     def clear_stats(self):
+         self._stats.clear()
+
+     def close_spider(self, spider_name, reason):
+         self._stats['reason'] = reason
+         if self._dump:
+             self.logger.info(f'{spider_name} stats: \n{pformat(self._stats)}')
+
+     def __getitem__(self, item):
+         return self._stats[item]
+
+     def __setitem__(self, key, value):
+         self._stats[key] = value
+
+     def __delitem__(self, key):
+         del self._stats[key]
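A hedged sketch of the collector above; the crawler stand-in only needs a settings object with get_bool(), which is all __init__ reads, and the stat key names are hypothetical.

from types import SimpleNamespace
from crawlo.settings.setting_manager import SettingManager
from crawlo.stats_collector import StatsCollector

crawler = SimpleNamespace(settings=SettingManager({"STATS_DUMP": True}))
stats = StatsCollector(crawler)
stats.inc_value("downloader/request_count")   # hypothetical stat key
stats.inc_value("downloader/request_count")   # -> 2
stats["custom/flag"] = 1                      # __setitem__ writes straight into _stats
stats.close_spider("demo_spider", reason="finished")   # dumps the stats dict at INFO level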
crawlo/subscriber.py ADDED
@@ -0,0 +1,27 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import asyncio
+ from collections import defaultdict
+ from inspect import iscoroutinefunction
+ from typing import Dict, Set, Callable, Coroutine
+
+ from crawlo.exceptions import ReceiverTypeError
+
+
+ class Subscriber:
+
+     def __init__(self):
+         self._subscribers: Dict[str, Set[Callable[..., Coroutine]]] = defaultdict(set)
+
+     def subscribe(self, receiver: Callable[..., Coroutine], *, event: str) -> None:
+         if not iscoroutinefunction(receiver):
+             raise ReceiverTypeError(f"{receiver.__qualname__} must be a coroutine function")
+         self._subscribers[event].add(receiver)
+
+     def unsubscribe(self, receiver: Callable[..., Coroutine], *, event: str) -> None:
+         self._subscribers[event].discard(receiver)
+
+     async def notify(self, event: str, *args, **kwargs) -> None:
+         for receiver in self._subscribers[event]:
+             # do not await here; schedule each receiver as its own task
+             asyncio.create_task(receiver(*args, **kwargs))
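A small, hypothetical sketch of the event bus above: receivers must be coroutine functions, and notify() schedules them as tasks rather than awaiting them, so the caller has to keep the loop alive long enough for them to run. The item dict passed along is illustrative only.

import asyncio

from crawlo.event import item_successful
from crawlo.subscriber import Subscriber


async def on_item(item, spider):
    print("stored:", item)


async def main():
    bus = Subscriber()
    bus.subscribe(on_item, event=item_successful)
    await bus.notify(item_successful, {"title": "demo"}, None)
    await asyncio.sleep(0)   # yield once so the scheduled receiver task gets to run


asyncio.run(main())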
crawlo/task_manager.py ADDED
@@ -0,0 +1,27 @@
+ #!/usr/bin/python
+ # -*- coding:UTF-8 -*-
+ import asyncio
+ from asyncio import Task, Future, Semaphore
+ from typing import Set, Final
+
+
+ class TaskManager:
+
+     def __init__(self, total_concurrency: int = 8):
+         self.current_task: Final[Set] = set()
+         self.semaphore: Semaphore = Semaphore(total_concurrency)
+
+     def create_task(self, coroutine) -> Task:
+         task = asyncio.create_task(coroutine)
+         self.current_task.add(task)
+
+         def done_callback(_future: Future) -> None:
+             self.current_task.remove(task)
+             self.semaphore.release()
+
+         task.add_done_callback(done_callback)
+
+         return task
+
+     def all_done(self) -> bool:
+         return len(self.current_task) == 0
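The semaphore above is only ever released in the done callback and never acquired inside create_task, so the caller is presumably expected to acquire it before scheduling work; that is how the engine would throttle concurrency. A hypothetical sketch under that assumption, with a dummy coroutine standing in for a download:

import asyncio

from crawlo.task_manager import TaskManager


async def fetch(i):               # dummy stand-in for a download coroutine
    await asyncio.sleep(0.1)
    return i


async def main():
    tm = TaskManager(total_concurrency=2)
    for i in range(5):
        await tm.semaphore.acquire()     # assumed pairing with the release() in done_callback
        tm.create_task(fetch(i))
    while not tm.all_done():
        await asyncio.sleep(0.05)


asyncio.run(main())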
crawlo/templates/item_template.tmpl ADDED
@@ -0,0 +1,22 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on {DATE}
+ ---------
+ @summary:
+ ---------
+ @author: {USER}
+ """
+
+ from crawlo import Item
+
+
+ class ${item_name}Item(Item):
+     """
+     This class was generated by feapder
+     command: feapder create -i ${command}
+     """
+
+     __table_name__ = "${table_name}"
+
+     def __init__(self, *args, **kwargs):
+         ${propertys}
crawlo/templates/project_template/items/__init__.py ADDED
File without changes (empty file)
crawlo/templates/project_template/main.py ADDED
@@ -0,0 +1,33 @@
+ # -*- coding: utf-8 -*-
+ """
+ Created on {DATE}
+ ---------
+ @summary: spider entry point
+ ---------
+ @author: {USER}
+ """
+
+ from crawlo import ArgumentParser
+
+ from spiders import *
+
+
+ def crawl_xxx():
+     """
+     Spider crawler
+     """
+     spider = xxx.XXXSpider(redis_key="xxx:xxx")
+     spider.start()
+
+
+ if __name__ == "__main__":
+     parser = ArgumentParser(description="xxx spiders")
+
+     parser.add_argument(
+         "--crawl_xxx", action="store_true", help="xxx spider", function=crawl_xxx
+     )
+     parser.start()
+
+     # main.py is the unified entry point for starting spiders; it provides a
+     # command-line way to launch multiple spiders