crawlo-1.1.0-py3-none-any.whl → crawlo-1.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic.

Files changed (120)
  1. crawlo/__init__.py +34 -24
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -155
  6. crawlo/commands/genspider.py +152 -111
  7. crawlo/commands/list.py +156 -119
  8. crawlo/commands/run.py +285 -170
  9. crawlo/commands/startproject.py +196 -101
  10. crawlo/commands/stats.py +188 -167
  11. crawlo/commands/utils.py +187 -0
  12. crawlo/config.py +280 -0
  13. crawlo/core/__init__.py +2 -2
  14. crawlo/core/engine.py +171 -158
  15. crawlo/core/enhanced_engine.py +190 -0
  16. crawlo/core/processor.py +40 -40
  17. crawlo/core/scheduler.py +162 -57
  18. crawlo/crawler.py +1028 -493
  19. crawlo/downloader/__init__.py +242 -78
  20. crawlo/downloader/aiohttp_downloader.py +212 -199
  21. crawlo/downloader/cffi_downloader.py +252 -277
  22. crawlo/downloader/httpx_downloader.py +257 -246
  23. crawlo/event.py +11 -11
  24. crawlo/exceptions.py +78 -78
  25. crawlo/extension/__init__.py +31 -31
  26. crawlo/extension/log_interval.py +49 -49
  27. crawlo/extension/log_stats.py +44 -44
  28. crawlo/extension/logging_extension.py +34 -34
  29. crawlo/filters/__init__.py +154 -37
  30. crawlo/filters/aioredis_filter.py +242 -150
  31. crawlo/filters/memory_filter.py +269 -202
  32. crawlo/items/__init__.py +23 -23
  33. crawlo/items/base.py +21 -21
  34. crawlo/items/fields.py +53 -53
  35. crawlo/items/items.py +104 -104
  36. crawlo/middleware/__init__.py +21 -21
  37. crawlo/middleware/default_header.py +32 -32
  38. crawlo/middleware/download_delay.py +28 -28
  39. crawlo/middleware/middleware_manager.py +135 -135
  40. crawlo/middleware/proxy.py +248 -245
  41. crawlo/middleware/request_ignore.py +30 -30
  42. crawlo/middleware/response_code.py +18 -18
  43. crawlo/middleware/response_filter.py +26 -26
  44. crawlo/middleware/retry.py +125 -90
  45. crawlo/mode_manager.py +201 -0
  46. crawlo/network/__init__.py +21 -7
  47. crawlo/network/request.py +311 -203
  48. crawlo/network/response.py +269 -166
  49. crawlo/pipelines/__init__.py +13 -13
  50. crawlo/pipelines/console_pipeline.py +39 -39
  51. crawlo/pipelines/csv_pipeline.py +317 -0
  52. crawlo/pipelines/json_pipeline.py +219 -0
  53. crawlo/pipelines/mongo_pipeline.py +116 -116
  54. crawlo/pipelines/mysql_pipeline.py +195 -195
  55. crawlo/pipelines/pipeline_manager.py +56 -56
  56. crawlo/project.py +153 -0
  57. crawlo/queue/pqueue.py +37 -0
  58. crawlo/queue/queue_manager.py +304 -0
  59. crawlo/queue/redis_priority_queue.py +192 -0
  60. crawlo/settings/__init__.py +7 -7
  61. crawlo/settings/default_settings.py +226 -169
  62. crawlo/settings/setting_manager.py +99 -99
  63. crawlo/spider/__init__.py +639 -129
  64. crawlo/stats_collector.py +59 -59
  65. crawlo/subscriber.py +106 -106
  66. crawlo/task_manager.py +30 -27
  67. crawlo/templates/crawlo.cfg.tmpl +10 -10
  68. crawlo/templates/project/__init__.py.tmpl +3 -3
  69. crawlo/templates/project/items.py.tmpl +17 -17
  70. crawlo/templates/project/middlewares.py.tmpl +87 -76
  71. crawlo/templates/project/pipelines.py.tmpl +336 -64
  72. crawlo/templates/project/run.py.tmpl +239 -0
  73. crawlo/templates/project/settings.py.tmpl +248 -54
  74. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  75. crawlo/templates/spider/spider.py.tmpl +178 -32
  76. crawlo/utils/__init__.py +7 -7
  77. crawlo/utils/controlled_spider_mixin.py +336 -0
  78. crawlo/utils/date_tools.py +233 -233
  79. crawlo/utils/db_helper.py +343 -343
  80. crawlo/utils/func_tools.py +82 -82
  81. crawlo/utils/large_scale_config.py +287 -0
  82. crawlo/utils/large_scale_helper.py +344 -0
  83. crawlo/utils/log.py +128 -128
  84. crawlo/utils/queue_helper.py +176 -0
  85. crawlo/utils/request.py +267 -267
  86. crawlo/utils/request_serializer.py +220 -0
  87. crawlo/utils/spider_loader.py +62 -62
  88. crawlo/utils/system.py +11 -11
  89. crawlo/utils/tools.py +4 -4
  90. crawlo/utils/url.py +39 -39
  91. crawlo-1.1.2.dist-info/METADATA +567 -0
  92. crawlo-1.1.2.dist-info/RECORD +108 -0
  93. examples/__init__.py +7 -0
  94. tests/__init__.py +7 -7
  95. tests/test_final_validation.py +154 -0
  96. tests/test_proxy_health_check.py +32 -32
  97. tests/test_proxy_middleware_integration.py +136 -136
  98. tests/test_proxy_providers.py +56 -56
  99. tests/test_proxy_stats.py +19 -19
  100. tests/test_proxy_strategies.py +59 -59
  101. tests/test_redis_config.py +29 -0
  102. tests/test_redis_queue.py +225 -0
  103. tests/test_request_serialization.py +71 -0
  104. tests/test_scheduler.py +242 -0
  105. crawlo/pipelines/mysql_batch_pipline.py +0 -273
  106. crawlo/utils/concurrency_manager.py +0 -125
  107. crawlo/utils/pqueue.py +0 -174
  108. crawlo/utils/project.py +0 -197
  109. crawlo-1.1.0.dist-info/METADATA +0 -49
  110. crawlo-1.1.0.dist-info/RECORD +0 -97
  111. examples/gxb/items.py +0 -36
  112. examples/gxb/run.py +0 -16
  113. examples/gxb/settings.py +0 -72
  114. examples/gxb/spider/__init__.py +0 -2
  115. examples/gxb/spider/miit_spider.py +0 -180
  116. examples/gxb/spider/telecom_device.py +0 -129
  117. {examples/gxb → crawlo/queue}/__init__.py +0 -0
  118. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/WHEEL +0 -0
  119. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/entry_points.txt +0 -0
  120. {crawlo-1.1.0.dist-info → crawlo-1.1.2.dist-info}/top_level.txt +0 -0
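
Because a wheel is just a zip archive, the file-level part of this comparison can be reproduced locally with the standard library alone. The sketch below is illustrative only and not part of crawlo; the wheel filenames are assumed download paths in the current directory.

from zipfile import ZipFile

def wheel_files(path: str) -> set:
    """Return the set of archive member names contained in a wheel (zip) file."""
    with ZipFile(path) as wheel:
        return set(wheel.namelist())

old = wheel_files("crawlo-1.1.0-py3-none-any.whl")   # assumed local path
new = wheel_files("crawlo-1.1.2-py3-none-any.whl")   # assumed local path

print("added:  ", sorted(new - old))    # e.g. crawlo/queue/queue_manager.py
print("removed:", sorted(old - new))    # e.g. examples/gxb/spider/telecom_device.py
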
examples/gxb/spider/telecom_device.py
@@ -1,129 +0,0 @@
-# -*- coding: utf-8 -*-
-import json
-from crawlo import Spider, Request
-from crawlo.utils.log import get_logger
-
-from examples.gxb.items import TelecomLicenseItem
-from examples.gxb.settings import HEADERS, COOKIES
-
-
-logger = get_logger(__name__)
-
-class TelecomDeviceLicensesSpider(Spider):
-    name = 'telecom_device'
-    allowed_domains = ['ythzxfw.miit.gov.cn']
-    # Base URL of the API
-    base_api_url = 'https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult'
-
-    # Configuration: start and end page numbers
-    start_page = 1
-    end_page = 26405
-    data = {
-        "categoryId": "144",
-        "currentPage": 1,
-        "pageSize": 5,
-        "searchContent": ""
-    }
-
-
-    def start_requests(self):
-        """Start from the first page and request each page in turn."""
-
-        yield Request(
-            url=self.base_api_url,
-            method='POST',
-            headers=HEADERS,
-            cookies=COOKIES,
-            body=json.dumps(self.data),
-            callback=self.parse,
-            meta={'page': 1},
-            dont_filter=True
-        )
-
-
-    def parse(self, response):
-        """
-        Parse the API response
-        :param response: Response object
-        """
-        page = response.meta['page']
-        self.logger.info(f"Parsing page {page}, status code: {response.status_code}")
-
-        try:
-            json_data = response.json()
-
-            if not json_data.get('success'):
-                self.logger.error(f"Request for page {page} failed: {json_data.get('msg', 'Unknown error')}")
-                return
-
-            # Extract the total page/record count (optional, used for validation)
-            total_records = json_data.get("params", {}).get("tbAppArticle", {}).get("total", 0)
-            self.logger.info(f"Page {page}, total records: {total_records}")
-
-            article_list = json_data.get("params", {}).get("tbAppArticle", {}).get("list", [])
-
-            if not article_list:
-                self.logger.warning(f"No data found on page {page}")
-                return
-
-            self.logger.info(f"Successfully parsed {len(article_list)} records from page {page}")
-
-            # Yield each record as a separate item
-            for item in article_list:
-                # Clean the data: strip HTML tags
-                cleaned_item = self.clean_item(item)
-                item = TelecomLicenseItem()
-                item['license_number'] = cleaned_item.get('articleField01')
-                item['device_name'] = cleaned_item.get('articleField02')
-                item['device_model'] = cleaned_item.get('articleField03')
-                item['applicant'] = cleaned_item.get('articleField04')
-                item['manufacturer'] = cleaned_item.get('articleField05')
-                item['issue_date'] = cleaned_item.get('articleField06')
-                item['expiry_date'] = cleaned_item.get('articleField07')
-                item['certificate_type'] = cleaned_item.get('articleField08')
-                item['remarks'] = cleaned_item.get('articleField09')
-                item['certificate_status'] = cleaned_item.get('articleField10')
-                item['origin'] = cleaned_item.get('articleField11')
-                item['article_id'] = cleaned_item.get('articleId')
-                item['article_edit_date'] = cleaned_item.get('articleEdate')
-                item['create_time'] = cleaned_item.get('createTime')
-                yield item
-
-            # --- Automatic pagination ---
-            # Check whether there is a next page
-            # Method 1: compare the current page number with the preset total page count
-            if page < self.end_page:
-                next_page = page + 1
-                self.data['currentPage'] = next_page
-                self.logger.debug(f"Preparing to crawl the next page: {next_page}")
-                yield Request(
-                    url=self.base_api_url,
-                    method='POST',
-                    headers=HEADERS,
-                    cookies=COOKIES,
-                    body=json.dumps(self.data),
-                    callback=self.parse,
-                    meta={'page': next_page},
-                    dont_filter=True
-                )
-
-        except Exception as e:
-            self.logger.error(f"Failed to parse the response for page {page}: {e}", exc_info=True)
-
-    @staticmethod
-    def clean_item(item: dict) -> dict:
-        """
-        Clean a single record, e.g. strip HTML tags
-        :param item: raw dict
-        :return: cleaned dict
-        """
-        import re
-        html_tag_re = re.compile(r'<[^>]+>')
-        cleaned = {}
-        for k, v in item.items():
-            if isinstance(v, str):
-                # Strip HTML tags and trim leading/trailing whitespace
-                cleaned[k] = html_tag_re.sub('', v).strip()
-            else:
-                cleaned[k] = v
-        return cleaned
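
For reference, the tag-stripping regex in clean_item above can be exercised on its own. The record below is invented purely for illustration; the field name mirrors the API payload the deleted spider handled.

import re

html_tag_re = re.compile(r'<[^>]+>')   # same pattern as clean_item

raw = {"articleField02": " <span class='hl'>5G base station</span> ", "articleId": 12345}
cleaned = {k: html_tag_re.sub('', v).strip() if isinstance(v, str) else v
           for k, v in raw.items()}
print(cleaned)  # {'articleField02': '5G base station', 'articleId': 12345}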