crawlo 1.0.9__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (111)
  1. crawlo/__init__.py +33 -24
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +40 -40
  4. crawlo/commands/__init__.py +13 -13
  5. crawlo/commands/check.py +594 -106
  6. crawlo/commands/genspider.py +125 -110
  7. crawlo/commands/list.py +147 -92
  8. crawlo/commands/run.py +286 -181
  9. crawlo/commands/startproject.py +111 -101
  10. crawlo/commands/stats.py +188 -59
  11. crawlo/core/__init__.py +2 -2
  12. crawlo/core/engine.py +158 -158
  13. crawlo/core/processor.py +40 -40
  14. crawlo/core/scheduler.py +57 -57
  15. crawlo/crawler.py +494 -492
  16. crawlo/downloader/__init__.py +78 -78
  17. crawlo/downloader/aiohttp_downloader.py +199 -199
  18. crawlo/downloader/cffi_downloader.py +242 -277
  19. crawlo/downloader/httpx_downloader.py +246 -246
  20. crawlo/event.py +11 -11
  21. crawlo/exceptions.py +78 -78
  22. crawlo/extension/__init__.py +31 -31
  23. crawlo/extension/log_interval.py +49 -49
  24. crawlo/extension/log_stats.py +44 -44
  25. crawlo/extension/logging_extension.py +34 -34
  26. crawlo/filters/__init__.py +37 -37
  27. crawlo/filters/aioredis_filter.py +150 -150
  28. crawlo/filters/memory_filter.py +202 -202
  29. crawlo/items/__init__.py +23 -23
  30. crawlo/items/base.py +21 -21
  31. crawlo/items/fields.py +53 -53
  32. crawlo/items/items.py +104 -104
  33. crawlo/middleware/__init__.py +21 -21
  34. crawlo/middleware/default_header.py +32 -32
  35. crawlo/middleware/download_delay.py +28 -28
  36. crawlo/middleware/middleware_manager.py +135 -135
  37. crawlo/middleware/proxy.py +245 -245
  38. crawlo/middleware/request_ignore.py +30 -30
  39. crawlo/middleware/response_code.py +18 -18
  40. crawlo/middleware/response_filter.py +26 -26
  41. crawlo/middleware/retry.py +90 -90
  42. crawlo/network/__init__.py +7 -7
  43. crawlo/network/request.py +203 -203
  44. crawlo/network/response.py +166 -166
  45. crawlo/pipelines/__init__.py +13 -13
  46. crawlo/pipelines/console_pipeline.py +39 -39
  47. crawlo/pipelines/mongo_pipeline.py +116 -116
  48. crawlo/pipelines/mysql_batch_pipline.py +272 -272
  49. crawlo/pipelines/mysql_pipeline.py +195 -195
  50. crawlo/pipelines/pipeline_manager.py +56 -56
  51. crawlo/project.py +153 -0
  52. crawlo/settings/__init__.py +7 -7
  53. crawlo/settings/default_settings.py +166 -168
  54. crawlo/settings/setting_manager.py +99 -99
  55. crawlo/spider/__init__.py +129 -129
  56. crawlo/stats_collector.py +59 -59
  57. crawlo/subscriber.py +106 -106
  58. crawlo/task_manager.py +27 -27
  59. crawlo/templates/crawlo.cfg.tmpl +10 -10
  60. crawlo/templates/project/__init__.py.tmpl +3 -3
  61. crawlo/templates/project/items.py.tmpl +17 -17
  62. crawlo/templates/project/middlewares.py.tmpl +75 -75
  63. crawlo/templates/project/pipelines.py.tmpl +63 -63
  64. crawlo/templates/project/settings.py.tmpl +54 -54
  65. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  66. crawlo/templates/spider/spider.py.tmpl +31 -31
  67. crawlo/utils/__init__.py +7 -7
  68. crawlo/utils/date_tools.py +233 -233
  69. crawlo/utils/db_helper.py +343 -343
  70. crawlo/utils/func_tools.py +82 -82
  71. crawlo/utils/log.py +128 -128
  72. crawlo/utils/pqueue.py +173 -173
  73. crawlo/utils/request.py +267 -267
  74. crawlo/utils/spider_loader.py +62 -62
  75. crawlo/utils/system.py +11 -11
  76. crawlo/utils/tools.py +4 -4
  77. crawlo/utils/url.py +39 -39
  78. crawlo-1.1.1.dist-info/METADATA +220 -0
  79. crawlo-1.1.1.dist-info/RECORD +100 -0
  80. examples/__init__.py +7 -0
  81. examples/baidu_spider/__init__.py +7 -0
  82. examples/baidu_spider/demo.py +94 -0
  83. examples/baidu_spider/items.py +46 -0
  84. examples/baidu_spider/middleware.py +49 -0
  85. examples/baidu_spider/pipeline.py +55 -0
  86. examples/baidu_spider/run.py +27 -0
  87. examples/baidu_spider/settings.py +121 -0
  88. examples/baidu_spider/spiders/__init__.py +7 -0
  89. examples/baidu_spider/spiders/bai_du.py +61 -0
  90. examples/baidu_spider/spiders/miit.py +159 -0
  91. examples/baidu_spider/spiders/sina.py +79 -0
  92. tests/__init__.py +7 -7
  93. tests/test_proxy_health_check.py +32 -32
  94. tests/test_proxy_middleware_integration.py +136 -136
  95. tests/test_proxy_providers.py +56 -56
  96. tests/test_proxy_stats.py +19 -19
  97. tests/test_proxy_strategies.py +59 -59
  98. crawlo/utils/concurrency_manager.py +0 -125
  99. crawlo/utils/project.py +0 -197
  100. crawlo-1.0.9.dist-info/METADATA +0 -49
  101. crawlo-1.0.9.dist-info/RECORD +0 -97
  102. examples/gxb/__init__.py +0 -0
  103. examples/gxb/items.py +0 -36
  104. examples/gxb/run.py +0 -16
  105. examples/gxb/settings.py +0 -72
  106. examples/gxb/spider/__init__.py +0 -0
  107. examples/gxb/spider/miit_spider.py +0 -180
  108. examples/gxb/spider/telecom_device.py +0 -129
  109. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/WHEEL +0 -0
  110. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/entry_points.txt +0 -0
  111. {crawlo-1.0.9.dist-info → crawlo-1.1.1.dist-info}/top_level.txt +0 -0
examples/gxb/spider/telecom_device.py (deleted)
@@ -1,129 +0,0 @@
- # -*- coding: utf-8 -*-
- import json
- from crawlo import Spider, Request
- from crawlo.utils.log import get_logger
-
- from examples.gxb.items import TelecomLicenseItem
- from examples.gxb.settings import HEADERS, COOKIES
-
-
- logger = get_logger(__name__)
-
- class TelecomDeviceLicensesSpider(Spider):
-     name = 'telecom_device'
-     allowed_domains = ['ythzxfw.miit.gov.cn']
-     # Base URL of the API
-     base_api_url = 'https://ythzxfw.miit.gov.cn/oldyth/user-center/tbAppSearch/selectResult'
-
-     # Configuration: start and end page numbers
-     start_page = 1
-     end_page = 26405
-     data = {
-         "categoryId": "144",
-         "currentPage": 1,
-         "pageSize": 5,
-         "searchContent": ""
-     }
-
-
-     def start_requests(self):
-         """Start from the first page and issue requests page by page."""
-
-         yield Request(
-             url=self.base_api_url,
-             method='POST',
-             headers=HEADERS,
-             cookies=COOKIES,
-             body=json.dumps(self.data),
-             callback=self.parse,
-             meta={'page': 1},
-             dont_filter=True
-         )
-
-
-     def parse(self, response):
-         """
-         Parse the API response.
-         :param response: Response object
-         """
-         page = response.meta['page']
-         self.logger.info(f"Parsing page {page}, status code: {response.status_code}")
-
-         try:
-             json_data = response.json()
-
-             if not json_data.get('success'):
-                 self.logger.error(f"Request for page {page} failed: {json_data.get('msg', 'Unknown error')}")
-                 return
-
-             # Extract the total page and record counts (optional, for validation)
-             total_records = json_data.get("params", {}).get("tbAppArticle", {}).get("total", 0)
-             self.logger.info(f"Page {page}, total records: {total_records}")
-
-             article_list = json_data.get("params", {}).get("tbAppArticle", {}).get("list", [])
-
-             if not article_list:
-                 self.logger.warning(f"No data found on page {page}")
-                 return
-
-             self.logger.info(f"Successfully parsed {len(article_list)} records from page {page}")
-
-             # Yield each record as a separate item
-             for item in article_list:
-                 # Clean the data: strip HTML tags
-                 cleaned_item = self.clean_item(item)
-                 item = TelecomLicenseItem()
-                 item['license_number'] = cleaned_item.get('articleField01')
-                 item['device_name'] = cleaned_item.get('articleField02')
-                 item['device_model'] = cleaned_item.get('articleField03')
-                 item['applicant'] = cleaned_item.get('articleField04')
-                 item['manufacturer'] = cleaned_item.get('articleField05')
-                 item['issue_date'] = cleaned_item.get('articleField06')
-                 item['expiry_date'] = cleaned_item.get('articleField07')
-                 item['certificate_type'] = cleaned_item.get('articleField08')
-                 item['remarks'] = cleaned_item.get('articleField09')
-                 item['certificate_status'] = cleaned_item.get('articleField10')
-                 item['origin'] = cleaned_item.get('articleField11')
-                 item['article_id'] = cleaned_item.get('articleId')
-                 item['article_edit_date'] = cleaned_item.get('articleEdate')
-                 item['create_time'] = cleaned_item.get('createTime')
-                 yield item
-
-             # --- Automatic pagination logic ---
-             # Check whether there is a next page
-             # Method 1: compare the current page number with the preset total page count
-             if page < self.end_page:
-                 next_page = page + 1
-                 self.data['currentPage'] = next_page
-                 self.logger.debug(f"Preparing to crawl the next page: {next_page}")
-                 yield Request(
-                     url=self.base_api_url,
-                     method='POST',
-                     headers=HEADERS,
-                     cookies=COOKIES,
-                     body=json.dumps(self.data),
-                     callback=self.parse,
-                     meta={'page': next_page},
-                     dont_filter=True
-                 )
-
-         except Exception as e:
-             self.logger.error(f"Failed to parse the response for page {page}: {e}", exc_info=True)
-
-     @staticmethod
-     def clean_item(item: dict) -> dict:
-         """
-         Clean a single record, e.g. remove HTML tags.
-         :param item: raw dict
-         :return: cleaned dict
-         """
-         import re
-         html_tag_re = re.compile(r'<[^>]+>')
-         cleaned = {}
-         for k, v in item.items():
-             if isinstance(v, str):
-                 # Strip HTML tags and trim surrounding whitespace
-                 cleaned[k] = html_tag_re.sub('', v).strip()
-             else:
-                 cleaned[k] = v
-         return cleaned
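The removed spider pages through the API by mutating the shared class-level data dict (self.data['currentPage'] = next_page) before serializing it into each follow-up request, so concurrently scheduled callbacks can race on the page number. Below is a minimal sketch of the same POST-based paging pattern that builds the payload per request instead; it assumes only the Spider/Request API visible in the diff above, and the class name, endpoint, and payload fields are illustrative rather than part of crawlo's shipped examples.

# -*- coding: utf-8 -*-
# Sketch: POST-based paging with a per-request payload (illustrative names).
import json

from crawlo import Spider, Request


class ApiPagingSpider(Spider):
    name = 'api_paging_demo'
    api_url = 'https://example.com/api/search'  # placeholder endpoint
    end_page = 10

    def start_requests(self):
        yield self._page_request(page=1)

    def _page_request(self, page: int) -> Request:
        # Each request carries its own payload, so in-flight callbacks
        # cannot overwrite each other's currentPage value.
        payload = {"currentPage": page, "pageSize": 5}
        return Request(
            url=self.api_url,
            method='POST',
            body=json.dumps(payload),
            callback=self.parse,
            meta={'page': page},
            dont_filter=True,
        )

    def parse(self, response):
        page = response.meta['page']
        data = response.json()
        for record in data.get("list", []):
            # Item construction omitted; see TelecomLicenseItem in the removed file.
            pass

        if page < self.end_page:
            # Schedule the next page from this callback; the payload travels
            # with the request rather than through shared spider state.
            yield self._page_request(page + 1)

Each page is scheduled from its own callback with its own payload, which keeps pagination state out of the spider instance entirely.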