crawlo 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crawlo has been flagged as potentially problematic.

Files changed (95)
  1. crawlo/__init__.py +25 -9
  2. crawlo/__version__.py +1 -1
  3. crawlo/core/__init__.py +2 -2
  4. crawlo/core/engine.py +158 -158
  5. crawlo/core/processor.py +40 -40
  6. crawlo/core/scheduler.py +57 -57
  7. crawlo/crawler.py +424 -242
  8. crawlo/downloader/__init__.py +78 -78
  9. crawlo/downloader/aiohttp_downloader.py +200 -259
  10. crawlo/downloader/cffi_downloader.py +277 -0
  11. crawlo/downloader/httpx_downloader.py +246 -187
  12. crawlo/event.py +11 -11
  13. crawlo/exceptions.py +73 -64
  14. crawlo/extension/__init__.py +31 -31
  15. crawlo/extension/log_interval.py +49 -49
  16. crawlo/extension/log_stats.py +44 -44
  17. crawlo/extension/logging_extension.py +35 -0
  18. crawlo/filters/__init__.py +37 -37
  19. crawlo/filters/aioredis_filter.py +150 -150
  20. crawlo/filters/memory_filter.py +202 -202
  21. crawlo/items/__init__.py +62 -62
  22. crawlo/items/items.py +115 -119
  23. crawlo/middleware/__init__.py +21 -21
  24. crawlo/middleware/default_header.py +32 -32
  25. crawlo/middleware/download_delay.py +28 -28
  26. crawlo/middleware/middleware_manager.py +135 -140
  27. crawlo/middleware/proxy.py +246 -0
  28. crawlo/middleware/request_ignore.py +30 -30
  29. crawlo/middleware/response_code.py +18 -18
  30. crawlo/middleware/response_filter.py +26 -26
  31. crawlo/middleware/retry.py +90 -90
  32. crawlo/network/__init__.py +7 -7
  33. crawlo/network/request.py +203 -204
  34. crawlo/network/response.py +166 -166
  35. crawlo/pipelines/__init__.py +13 -13
  36. crawlo/pipelines/console_pipeline.py +39 -39
  37. crawlo/pipelines/mongo_pipeline.py +116 -116
  38. crawlo/pipelines/mysql_batch_pipline.py +273 -134
  39. crawlo/pipelines/mysql_pipeline.py +195 -195
  40. crawlo/pipelines/pipeline_manager.py +56 -56
  41. crawlo/settings/__init__.py +7 -7
  42. crawlo/settings/default_settings.py +169 -94
  43. crawlo/settings/setting_manager.py +99 -99
  44. crawlo/spider/__init__.py +41 -36
  45. crawlo/stats_collector.py +59 -59
  46. crawlo/subscriber.py +106 -106
  47. crawlo/task_manager.py +27 -27
  48. crawlo/templates/item_template.tmpl +21 -21
  49. crawlo/templates/project_template/main.py +32 -32
  50. crawlo/templates/project_template/setting.py +189 -189
  51. crawlo/templates/spider_template.tmpl +30 -30
  52. crawlo/utils/__init__.py +7 -7
  53. crawlo/utils/concurrency_manager.py +124 -124
  54. crawlo/utils/date_tools.py +233 -177
  55. crawlo/utils/db_helper.py +344 -0
  56. crawlo/utils/func_tools.py +82 -82
  57. crawlo/utils/log.py +129 -39
  58. crawlo/utils/pqueue.py +173 -173
  59. crawlo/utils/project.py +59 -59
  60. crawlo/utils/request.py +267 -122
  61. crawlo/utils/system.py +11 -11
  62. crawlo/utils/tools.py +5 -303
  63. crawlo/utils/url.py +39 -39
  64. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/METADATA +49 -48
  65. crawlo-1.0.5.dist-info/RECORD +84 -0
  66. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/top_level.txt +1 -0
  67. examples/__init__.py +0 -0
  68. examples/gxb/__init__.py +0 -0
  69. examples/gxb/items.py +36 -0
  70. examples/gxb/run.py +15 -0
  71. examples/gxb/settings.py +71 -0
  72. examples/gxb/spider/__init__.py +0 -0
  73. examples/gxb/spider/miit_spider.py +180 -0
  74. examples/gxb/spider/telecom_device_licenses.py +129 -0
  75. tests/__init__.py +7 -7
  76. tests/test_proxy_health_check.py +33 -0
  77. tests/test_proxy_middleware_integration.py +137 -0
  78. tests/test_proxy_providers.py +57 -0
  79. tests/test_proxy_stats.py +20 -0
  80. tests/test_proxy_strategies.py +60 -0
  81. crawlo/downloader/playwright_downloader.py +0 -161
  82. crawlo-1.0.4.dist-info/RECORD +0 -79
  83. tests/baidu_spider/__init__.py +0 -7
  84. tests/baidu_spider/demo.py +0 -94
  85. tests/baidu_spider/items.py +0 -25
  86. tests/baidu_spider/middleware.py +0 -49
  87. tests/baidu_spider/pipeline.py +0 -55
  88. tests/baidu_spider/request_fingerprints.txt +0 -9
  89. tests/baidu_spider/run.py +0 -27
  90. tests/baidu_spider/settings.py +0 -80
  91. tests/baidu_spider/spiders/__init__.py +0 -7
  92. tests/baidu_spider/spiders/bai_du.py +0 -61
  93. tests/baidu_spider/spiders/sina.py +0 -79
  94. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/WHEEL +0 -0
  95. {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt +0 -0
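
Functionally, the most significant additions in 1.0.5 are a curl-cffi based downloader (crawlo/downloader/cffi_downloader.py), a proxy middleware (crawlo/middleware/proxy.py, covered by the new tests for providers, health checks, stats, and strategies), and a database helper (crawlo/utils/db_helper.py), while the Playwright downloader and the baidu_spider test project were removed. Following the settings conventions visible in the deleted tests/baidu_spider/settings.py further down, enabling the new components might look like the minimal sketch below; the class names CurlCffiDownloader and ProxyMiddleware are assumptions and are not confirmed by this diff.

# Hypothetical settings sketch for the components added in 1.0.5.
# Module paths are taken from the file list above; the class names
# (CurlCffiDownloader, ProxyMiddleware) are assumed, not confirmed by this diff.

# Swap in the new curl-cffi downloader instead of the aiohttp/httpx ones.
DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"  # assumed class name

MIDDLEWARES = [
    'crawlo.middleware.download_delay.DownloadDelayMiddleware',
    'crawlo.middleware.default_header.DefaultHeaderMiddleware',
    'crawlo.middleware.proxy.ProxyMiddleware',  # new in 1.0.5; assumed class name
    'crawlo.middleware.retry.RetryMiddleware',
    'crawlo.middleware.response_code.ResponseCodeMiddleware',
]

PIPELINES = [
    'crawlo.pipelines.console_pipeline.ConsolePipeline',
]

All middleware and pipeline paths except the proxy entry are copied from the removed test settings shown below, so only the two assumed class names need to be verified against the new modules.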
tests/baidu_spider/settings.py (deleted)
@@ -1,80 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
-
- PROJECT_NAME = 'baidu_spider'
-
- CONCURRENCY = 4
-
- USE_SESSION = True
-
- # Download delay
- DOWNLOAD_DELAY = 0.5
- RANDOMNESS = False
-
- # --------------------------------------------------- Shared MySQL configuration -----------------------------------------------------
- MYSQL_HOST = '43.139.14.225'
- MYSQL_PORT = 3306
- MYSQL_USER = 'picker'
- MYSQL_PASSWORD = 'kmcNbbz6TbSihttZ'
- MYSQL_DB = 'stock_pro'
- MYSQL_TABLE = 'articles'  # optional; defaults to the spider name
- MYSQL_BATCH_SIZE = 500
-
- # asyncmy-specific settings
- MYSQL_POOL_MIN = 5   # minimum connections in the pool
- MYSQL_POOL_MAX = 20  # maximum connections in the pool
-
- # Choose the downloader
- # DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
-
- MIDDLEWARES = [
-     'crawlo.middleware.download_delay.DownloadDelayMiddleware',
-     'crawlo.middleware.default_header.DefaultHeaderMiddleware',
-     'crawlo.middleware.response_filter.ResponseFilterMiddleware',
-     'crawlo.middleware.retry.RetryMiddleware',
-     'crawlo.middleware.response_code.ResponseCodeMiddleware',
-     'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
-     # 'baidu_spider.middleware.TestMiddleWare',
-     # 'baidu_spider.middleware.TestMiddleWare2'
- ]
-
- EXTENSIONS = [
-     'crawlo.extension.log_interval.LogIntervalExtension',
-     'crawlo.extension.log_stats.LogStats',
- ]
-
- PIPELINES = [
-     'crawlo.pipelines.console_pipeline.ConsolePipeline',
-     'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
-     # 'crawlo.pipelines.mysql_batch_pipline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
-     # 'baidu_spider.pipeline.TestPipeline',
-     # 'baidu_spider.pipeline.MongoPipeline',
- ]
-
- USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36'
- DEFAULT_HEADERS = {
-     "accept": "application/json, text/javascript, */*; q=0.01",
-     "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
-     "cache-control": "no-cache",
-     "pragma": "no-cache",
-     "priority": "u=1, i",
-     "sec-ch-ua": "\"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
-     "sec-ch-ua-mobile": "?0",
-     "sec-ch-ua-platform": "\"macOS\"",
-     "sec-fetch-dest": "empty",
-     "sec-fetch-mode": "cors",
-     "sec-fetch-site": "same-origin",
-     # "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
-     "x-requested-with": "XMLHttpRequest"
- }
-
- # -------------------------------------- DB ---------------------------------------------
- Mongo_Params = ''
- MONGODB_DB = 'news'
-
- REDIS_TTL = 0
- CLEANUP_FP = False
-
- FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
- # FILTER_CLASS = 'crawlo.filters.redis_filter.RedisFilter'
- # FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFileFilter'
tests/baidu_spider/spiders/__init__.py (deleted)
@@ -1,7 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-05-11 12:20
- # @Author : oscar
- # @Desc : None
- """
tests/baidu_spider/spiders/bai_du.py (deleted)
@@ -1,61 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-02-05 13:05
- # @Author : oscar
- # @Desc : None
- """
- import asyncio
- from crawlo import Request
- from crawlo.spider import Spider
-
- from items import BauDuItem
-
-
- class BaiDuSpider(Spider):
-     start_urls = ["https://www.baidu.com/", "https://www.baidu.com/"]
-
-     custom_settings = {
-         'CONCURRENCY': 1
-     }
-
-     name = "bai_du"
-
-     # headers = {
-     #     "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
-     # }
-     #
-     user_gent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
-
-     async def parse(self, response):
-         for i in range(5):
-             url = f"https://www.baidu.com"
-             # url = f"https://www.httpbin.org/404"
-             r = Request(url=url, callback=self.parse_page, dont_filter=True)
-             yield r
-
-     async def parse_page(self, response):
-         for i in range(5):
-             url = f"https://www.baidu.com"
-             meta = {'test': 'hhhh'}
-             r = Request(url=url, callback=self.parse_detail, meta=meta, dont_filter=False)
-             yield r
-
-     def parse_detail(self, response):
-         item = BauDuItem()
-         item['title'] = response.xpath('//title/text()').get()
-
-         item['url'] = response.url
-
-         yield item
-
-     async def spider_opened(self):
-         pass
-
-     async def spider_closed(self):
-         pass
-
-
- if __name__ == '__main__':
-     b = BaiDuSpider()
-     b.start_requests()
tests/baidu_spider/spiders/sina.py (deleted)
@@ -1,79 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-02-05 13:05
- # @Author : oscar
- # @Desc : None
- """
- import time
-
- from crawlo import Request
- from crawlo.spider import Spider
- from crawlo.utils.date_tools import timestamp_to_datetime, format_datetime
-
- from tests.baidu_spider.items import ArticleItem
-
-
- class SinaSpider(Spider):
-     # Take the current timestamp and subtract 10 minutes (600 seconds)
-     current_time_minus_10min = int(time.time()) - 6000
-     # Build the URL
-     url = f'https://news.10jqka.com.cn/tapp/news/push/stock/?page=1&tag=&track=website&ctime={current_time_minus_10min}'
-
-     start_urls = [url]
-     name = 'sina'
-     # mysql_table = 'news_10jqka'
-
-     allowed_domains = ['*']
-
-     def start_requests(self):
-         for url in self.start_urls:
-             yield Request(url=url, callback=self.parse, dont_filter=True)
-
-     async def parse(self, response):
-         jsonp_str = response.json()
-         rows = jsonp_str.get('data', {}).get('list', [])
-         for row in rows:
-             article_id = row.get('id')
-             title = row.get('title')
-             digest = row.get('digest')
-             short = row.get('short')
-             detail_url = row.get('url')
-             tag = row.get('tag')
-             ctime = row.get('ctime')
-             source = row.get('source')
-             meta = {
-                 'article_id': article_id,
-                 'title': title,
-                 'digest': digest,
-                 'short': short,
-                 'detail_url': detail_url,
-                 'source': source,
-                 'tag': tag,
-                 'ctime': timestamp_to_datetime(int(ctime))
-             }
-
-             yield Request(url=detail_url, callback=self.parse_detail, encoding='gbk', meta=meta)
-
-     @staticmethod
-     async def parse_detail(response):
-         item = ArticleItem()
-         meta = response.meta
-         content = ''.join(response.xpath('//*[@id="contentApp"]/p/text()').extract()).strip()
-         ctime = meta.get('ctime')
-         item['article_id'] = meta.get('article_id')
-         item['title'] = meta.get('title')
-         item['digest'] = content
-         item['short'] = meta.get('short')
-         item['url'] = meta.get('detail_url')
-         item['tag'] = meta.get('tag').strip()
-         item['ctime'] = format_datetime(ctime)
-         item['source'] = meta.get('source')
-
-         yield item
-
-     async def spider_opened(self):
-         pass
-
-     async def spider_closed(self):
-         pass
The remaining files (WHEEL and entry_points.txt) were renamed with the dist-info directory but have no content changes.