crawlo-1.1.1-py3-none-any.whl → crawlo-1.1.3-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in that registry.

Potentially problematic release: this version of crawlo has been flagged as possibly problematic.

Files changed (128)
  1. crawlo/__init__.py +34 -33
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +152 -126
  7. crawlo/commands/list.py +156 -147
  8. crawlo/commands/run.py +285 -285
  9. crawlo/commands/startproject.py +196 -111
  10. crawlo/commands/stats.py +188 -188
  11. crawlo/commands/utils.py +187 -0
  12. crawlo/config.py +280 -0
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -158
  15. crawlo/core/enhanced_engine.py +190 -0
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +166 -57
  18. crawlo/crawler.py +1028 -495
  19. crawlo/downloader/__init__.py +242 -78
  20. crawlo/downloader/aiohttp_downloader.py +212 -199
  21. crawlo/downloader/cffi_downloader.py +251 -241
  22. crawlo/downloader/httpx_downloader.py +259 -246
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +82 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -37
  30. crawlo/filters/aioredis_filter.py +242 -150
  31. crawlo/filters/memory_filter.py +269 -202
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -245
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -90
  45. crawlo/mode_manager.py +201 -0
  46. crawlo/network/__init__.py +21 -7
  47. crawlo/network/request.py +311 -203
  48. crawlo/network/response.py +271 -166
  49. crawlo/pipelines/__init__.py +22 -13
  50. crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
  51. crawlo/pipelines/console_pipeline.py +39 -39
  52. crawlo/pipelines/csv_pipeline.py +317 -0
  53. crawlo/pipelines/database_dedup_pipeline.py +225 -0
  54. crawlo/pipelines/json_pipeline.py +219 -0
  55. crawlo/pipelines/memory_dedup_pipeline.py +116 -0
  56. crawlo/pipelines/mongo_pipeline.py +116 -116
  57. crawlo/pipelines/mysql_pipeline.py +195 -195
  58. crawlo/pipelines/pipeline_manager.py +56 -56
  59. crawlo/pipelines/redis_dedup_pipeline.py +163 -0
  60. crawlo/project.py +153 -153
  61. crawlo/queue/__init__.py +0 -0
  62. crawlo/queue/pqueue.py +37 -0
  63. crawlo/queue/queue_manager.py +308 -0
  64. crawlo/queue/redis_priority_queue.py +209 -0
  65. crawlo/settings/__init__.py +7 -7
  66. crawlo/settings/default_settings.py +245 -167
  67. crawlo/settings/setting_manager.py +99 -99
  68. crawlo/spider/__init__.py +639 -129
  69. crawlo/stats_collector.py +59 -59
  70. crawlo/subscriber.py +106 -106
  71. crawlo/task_manager.py +30 -27
  72. crawlo/templates/crawlo.cfg.tmpl +10 -10
  73. crawlo/templates/project/__init__.py.tmpl +3 -3
  74. crawlo/templates/project/items.py.tmpl +17 -17
  75. crawlo/templates/project/middlewares.py.tmpl +87 -76
  76. crawlo/templates/project/pipelines.py.tmpl +342 -64
  77. crawlo/templates/project/run.py.tmpl +252 -0
  78. crawlo/templates/project/settings.py.tmpl +251 -54
  79. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  80. crawlo/templates/spider/spider.py.tmpl +178 -32
  81. crawlo/utils/__init__.py +7 -7
  82. crawlo/utils/controlled_spider_mixin.py +440 -0
  83. crawlo/utils/date_tools.py +233 -233
  84. crawlo/utils/db_helper.py +343 -343
  85. crawlo/utils/func_tools.py +82 -82
  86. crawlo/utils/large_scale_config.py +287 -0
  87. crawlo/utils/large_scale_helper.py +344 -0
  88. crawlo/utils/log.py +128 -128
  89. crawlo/utils/queue_helper.py +176 -0
  90. crawlo/utils/request.py +267 -267
  91. crawlo/utils/request_serializer.py +220 -0
  92. crawlo/utils/spider_loader.py +62 -62
  93. crawlo/utils/system.py +11 -11
  94. crawlo/utils/tools.py +4 -4
  95. crawlo/utils/url.py +39 -39
  96. crawlo-1.1.3.dist-info/METADATA +635 -0
  97. crawlo-1.1.3.dist-info/RECORD +113 -0
  98. examples/__init__.py +7 -7
  99. examples/controlled_spider_example.py +205 -0
  100. tests/__init__.py +7 -7
  101. tests/test_final_validation.py +154 -0
  102. tests/test_proxy_health_check.py +32 -32
  103. tests/test_proxy_middleware_integration.py +136 -136
  104. tests/test_proxy_providers.py +56 -56
  105. tests/test_proxy_stats.py +19 -19
  106. tests/test_proxy_strategies.py +59 -59
  107. tests/test_redis_config.py +29 -0
  108. tests/test_redis_queue.py +225 -0
  109. tests/test_request_serialization.py +71 -0
  110. tests/test_scheduler.py +242 -0
  111. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  112. crawlo/utils/pqueue.py +0 -174
  113. crawlo-1.1.1.dist-info/METADATA +0 -220
  114. crawlo-1.1.1.dist-info/RECORD +0 -100
  115. examples/baidu_spider/__init__.py +0 -7
  116. examples/baidu_spider/demo.py +0 -94
  117. examples/baidu_spider/items.py +0 -46
  118. examples/baidu_spider/middleware.py +0 -49
  119. examples/baidu_spider/pipeline.py +0 -55
  120. examples/baidu_spider/run.py +0 -27
  121. examples/baidu_spider/settings.py +0 -121
  122. examples/baidu_spider/spiders/__init__.py +0 -7
  123. examples/baidu_spider/spiders/bai_du.py +0 -61
  124. examples/baidu_spider/spiders/miit.py +0 -159
  125. examples/baidu_spider/spiders/sina.py +0 -79
  126. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
  127. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
  128. {crawlo-1.1.1.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
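
For reference, a quick way to check which side of this diff is installed in a local environment is to read the distribution metadata with the standard library. This is a minimal sketch; it assumes only that one of the two wheels listed above has been installed (for example via pip install crawlo==1.1.3):

from importlib.metadata import version

# Reads the installed distribution's version string from its dist-info metadata.
print(version("crawlo"))  # prints "1.1.1" or "1.1.3" depending on the installed wheel
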
examples/baidu_spider/spiders/miit.py
@@ -1,159 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-08-22 14:00
- # @Author : oscar
- # @Desc : Crawl MIIT radio equipment approval records (supports the full 34,652 pages)
- """
-
- import json
- import asyncio
- import random
-
- from crawlo import Request
- from crawlo.spider import Spider
- from crawlo.utils.log import get_logger
- from crawlo.utils.date_tools import to_datetime
-
- # Import the predefined Item
- from examples.baidu_spider.items import MiitDeviceItem
-
-
- logger = get_logger(__name__)
-
-
- class MiitDeviceSpider(Spider):
-     name = 'miit_device'
-     allowed_domains = ['ythzxfw.miit.gov.cn']
-
-     # Field mapping table
-     FIELD_MAPPING = {
-         "articleField01": ("核准证编号", "approval_certificate_no"),
-         "articleField02": ("设备名称", "device_name"),
-         "articleField03": ("设备型号", "model_number"),
-         "articleField04": ("申请单位", "applicant"),
-         "articleField05": ("备注", "remarks"),
-         "articleField06": ("有效期", "validity_period"),
-         "articleField07": ("频率容限", "frequency_tolerance"),
-         "articleField08": ("频率范围", "frequency_range"),
-         "articleField09": ("发射功率", "transmission_power"),
-         "articleField10": ("占用带宽", "occupied_bandwidth"),
-         "articleField11": ("杂散发射限制", "spurious_emission_limit"),
-         "articleField12": ("发证日期", "issue_date"),
-         "articleField13": ("核准代码", "approval_code"),
-         "articleField14": ("CMIIT ID", "cmiit_id"),
-         "articleField15": ("调制方式", "modulation_scheme"),
-         "articleField16": ("技术体制/功能模块", "technology_module"),
-         "createTime": ("createTime", "create_time"),
-         "articleId": ("articleId", "article_id")
-     }
-
-     headers = {
-         "Accept": "application/json, text/plain, */*",
-         "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
-         "Authorization": "null",
-         "Cache-Control": "no-cache",
-         "Connection": "keep-alive",
-         "Content-Type": "application/json;charset=UTF-8",
-         "Origin": "https://ythzxfw.miit.gov.cn",
-         "Pragma": "no-cache",
-         "Referer": "https://ythzxfw.miit.gov.cn/oldyth/resultQuery",
-         "Sec-Fetch-Dest": "empty",
-         "Sec-Fetch-Mode": "cors",
-         "Sec-Fetch-Site": "same-origin",
-         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36",
-         "sec-ch-ua": '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
-         "sec-ch-ua-mobile": "?0",
-         "sec-ch-ua-platform": '"macOS"'
-     }
-
-     cookies = {
-         "wzws_sessionid": "gjdjYmMyNYFkZjRiZjCgaKkOx4AyNDBlOjQ3ZTozMmUwOmQ5MmI6ZjFjZTphNWJiOjk5ZmU6OTU4OQ==",
-         "ariauseGraymode": "false",
-         "Hm_lvt_a73626d298a849004aacc34159f68abd": "1755909833",
-         "Hm_lpvt_a73626d298a849004aacc34159f68abd": "1755909833",
-         "HMACCOUNT": "6C5E4C6C47DC62FF"
-     }
-
-     # Pagination settings
-     start_page = 1  # first page
-     end_page = 34652  # total number of pages
-     current_page = 1
-     page_size = 5  # records per page
-
-     # Delay between requests (seconds) to avoid getting blocked
-     min_delay = 1.5
-     max_delay = 3.0
-
-     def start_requests(self):
-         # Begin at the first page
-         yield self.make_request(self.start_page)
-
-     def make_request(self, page):
-         """Build the request for a given page."""
-         data = {
-             "categoryId": "352",
-             "currentPage": page,
-             "pageSize": self.page_size,
-             "searchContent": ""
-         }
-         return Request(
-             method='POST',
-             url='https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult',
-             headers=self.headers,
-             cookies=self.cookies,
-             body=json.dumps(data, separators=(',', ':'), ensure_ascii=False),
-             callback=self.parse,
-             dont_filter=True,
-             meta={'page': page}  # record the current page number for logging and debugging
-         )
-
-     async def parse(self, response):
-         page = response.meta.get('page', 'unknown')
-         try:
-             json_data = response.json()
-             success = json_data.get("success")
-             code = json_data.get("code")
-
-             if not success or code != 200:
-                 logger.error(f"Request for page {page} failed: code={code}, msg={json_data.get('msg')}")
-                 return
-
-             tb_app_article = json_data.get('params', {}).get('tbAppArticle', {})
-             records = tb_app_article.get('list', [])
-             total_count = tb_app_article.get('total', 0)  # total record count, e.g. 173256
-
-             logger.info(f"✅ Page {page} parsed successfully, {len(records)} records; grand total: {total_count}")
-
-             for raw_item in records:
-                 item = MiitDeviceItem()
-                 for field_key, (chinese_name, english_field) in self.FIELD_MAPPING.items():
-                     value = raw_item.get(field_key)
-                     if english_field == 'issue_date' and value:
-                         value = to_datetime(value.split()[0])
-                     item[english_field] = value
-                 yield item
-
-             # ✅ Core fix: derive the real total page count from total_count and page_size
-             # Note: round up, e.g. 173256 / 5 = 34651.2, which means 34652 pages
-             import math
-             calculated_total_pages = math.ceil(total_count / self.page_size)
-
-             # Use calculated_total_pages to decide whether to keep paginating
-             next_page = page + 1
-             if next_page <= calculated_total_pages:
-                 delay = random.uniform(self.min_delay, self.max_delay)
-                 logger.debug(f"Waiting {delay:.2f}s before requesting page {next_page}...")
-                 await asyncio.sleep(delay)
-                 yield self.make_request(next_page)
-             else:
-                 logger.info(f"🎉 Crawl complete! Reached the last page ({calculated_total_pages})")
-
-         except Exception as e:
-             logger.error(f"❌ Failed to parse page {page}: {e}, response: {response.text[:500]}...")
-
-     async def spider_opened(self):
-         logger.info(f"MiitDeviceSpider started; crawling pages {self.start_page} to {self.end_page}...")
-
-     async def spider_closed(self):
-         logger.info("MiitDeviceSpider finished.")
examples/baidu_spider/spiders/sina.py
@@ -1,79 +0,0 @@
- #!/usr/bin/python
- # -*- coding:UTF-8 -*-
- """
- # @Time : 2025-02-05 13:05
- # @Author : oscar
- # @Desc : None
- """
- import time
-
- from crawlo import Request
- from crawlo.spider import Spider
- from crawlo.utils.date_tools import to_datetime
-
- from examples.baidu_spider.items import ArticleItem
-
-
- class SinaSpider(Spider):
-     # Take the current timestamp and subtract 10 minutes (600 seconds)
-     current_time_minus_10min = int(time.time()) - 6000
-     # Build the URL
-     url = f'https://news.10jqka.com.cn/tapp/news/push/stock/?page=1&tag=&track=website&ctime={current_time_minus_10min}'
-
-     start_urls = [url]
-     name = 'sina'
-     # mysql_table = 'news_10jqka'
-
-     allowed_domains = ['*']
-
-     def start_requests(self):
-         for url in self.start_urls:
-             yield Request(url=url, callback=self.parse, dont_filter=True)
-
-     async def parse(self, response):
-         jsonp_str = response.json()
-         rows = jsonp_str.get('data', {}).get('list', [])
-         for row in rows:
-             article_id = row.get('id')
-             title = row.get('title')
-             digest = row.get('digest')
-             short = row.get('short')
-             detail_url = row.get('url')
-             tag = row.get('tag')
-             ctime = row.get('ctime')
-             source = row.get('source')
-             meta = {
-                 'article_id': article_id,
-                 'title': title,
-                 'digest': digest,
-                 'short': short,
-                 'detail_url': detail_url,
-                 'source': source,
-                 'tag': tag,
-                 'ctime': to_datetime(int(ctime))
-             }
-
-             yield Request(url=detail_url, callback=self.parse_detail, encoding='gbk', meta=meta)
-
-     @staticmethod
-     async def parse_detail(response):
-         item = ArticleItem()
-         meta = response.meta
-         content = ''.join(response.xpath('//*[@id="contentApp"]/p/text()').extract()).strip()
-         ctime = meta.get('ctime')
-         item['article_id'] = meta.get('article_id')
-         item['title'] = meta.get('title')
-         item['digest'] = content
-         item['short'] = meta.get('short')
-         item['url'] = meta.get('detail_url')
-         item['tag'] = meta.get('tag').strip()
-         item['ctime'] = to_datetime(ctime)
-         item['source'] = meta.get('source')
-
-         yield item
-
-     async def spider_opened(self):
-         pass
-
-     async def spider_closed(self):
-         pass