crawlo 1.0.4__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +25 -9
- crawlo/__version__.py +1 -1
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -57
- crawlo/crawler.py +424 -242
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +200 -259
- crawlo/downloader/cffi_downloader.py +277 -0
- crawlo/downloader/httpx_downloader.py +246 -187
- crawlo/event.py +11 -11
- crawlo/exceptions.py +73 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +35 -0
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +150 -150
- crawlo/filters/memory_filter.py +202 -202
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +115 -119
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -140
- crawlo/middleware/proxy.py +246 -0
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +203 -204
- crawlo/network/response.py +166 -166
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +273 -134
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +169 -94
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +41 -36
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +124 -124
- crawlo/utils/date_tools.py +233 -177
- crawlo/utils/db_helper.py +344 -0
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +129 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +267 -122
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +5 -303
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/METADATA +49 -48
- crawlo-1.0.5.dist-info/RECORD +84 -0
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/top_level.txt +1 -0
- examples/__init__.py +0 -0
- examples/gxb/__init__.py +0 -0
- examples/gxb/items.py +36 -0
- examples/gxb/run.py +15 -0
- examples/gxb/settings.py +71 -0
- examples/gxb/spider/__init__.py +0 -0
- examples/gxb/spider/miit_spider.py +180 -0
- examples/gxb/spider/telecom_device_licenses.py +129 -0
- tests/__init__.py +7 -7
- tests/test_proxy_health_check.py +33 -0
- tests/test_proxy_middleware_integration.py +137 -0
- tests/test_proxy_providers.py +57 -0
- tests/test_proxy_stats.py +20 -0
- tests/test_proxy_strategies.py +60 -0
- crawlo/downloader/playwright_downloader.py +0 -161
- crawlo-1.0.4.dist-info/RECORD +0 -79
- tests/baidu_spider/__init__.py +0 -7
- tests/baidu_spider/demo.py +0 -94
- tests/baidu_spider/items.py +0 -25
- tests/baidu_spider/middleware.py +0 -49
- tests/baidu_spider/pipeline.py +0 -55
- tests/baidu_spider/request_fingerprints.txt +0 -9
- tests/baidu_spider/run.py +0 -27
- tests/baidu_spider/settings.py +0 -80
- tests/baidu_spider/spiders/__init__.py +0 -7
- tests/baidu_spider/spiders/bai_du.py +0 -61
- tests/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/WHEEL +0 -0
- {crawlo-1.0.4.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,60 @@
+# tests/test_proxy_strategies.py
+import pytest
+from crawlo import Request
+from crawlo.proxy.strategies import STRATEGIES
+
+
+@pytest.fixture
+def mock_proxies():
+    """Provide a proxy list for the tests."""
+    return [
+        {'url': 'http://p1:8080'},
+        {'url': 'http://p2:8080'},
+        {'url': 'http://p3:8080'},
+    ]
+
+
+@pytest.fixture
+def mock_stats():
+    """Provide usage statistics for the tests."""
+    return {
+        'http://p1:8080': {'total': 10},
+        'http://p2:8080': {'total': 5},
+        'http://p3:8080': {'total': 1},
+    }
+
+
+@pytest.fixture
+def mock_request():
+    """Provide a request object for the tests."""
+    return Request("https://example.com")
+
+
+def test_random_strategy(mock_proxies, mock_request, mock_stats):
+    """Test the random strategy."""
+    strategy = STRATEGIES['random']
+    chosen = strategy(mock_proxies, mock_request, mock_stats)
+    assert chosen in [p['url'] for p in mock_proxies]
+
+
+def test_least_used_strategy(mock_proxies, mock_request, mock_stats):
+    """Test the least-used strategy."""
+    strategy = STRATEGIES['least_used']
+    chosen = strategy(mock_proxies, mock_request, mock_stats)
+    assert chosen == 'http://p3:8080'  # total=1
+
+
+def test_domain_rule_strategy(mock_proxies, mock_request, mock_stats):
+    """Test the domain-rule strategy."""
+    from crawlo.proxy.strategies.domain_rule import domain_rule_strategy
+    request = Request("https://taobao.com/item/123")
+    rules = {'taobao.com': 'http://special:8080'}
+
+    # Monkey-patch so a fallback strategy is always available
+    old_strategy = STRATEGIES['least_used']
+    try:
+        STRATEGIES['least_used'] = lambda p, r, s: 'http://fallback:8080'
+        chosen = domain_rule_strategy(mock_proxies, request, mock_stats, rules)
+        assert chosen == 'http://special:8080'
+    finally:
+        STRATEGIES['least_used'] = old_strategy
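The tests above pin down only the strategy interface: each entry in STRATEGIES is a callable that takes (proxies, request, stats) and returns a proxy URL, and 'least_used' must pick the proxy with the smallest 'total' counter. A minimal sketch consistent with that contract is shown below; it is an illustration of what the tests imply, not necessarily how crawlo.proxy.strategies implements it in this release.

# Sketch only: illustrates the (proxies, request, stats) -> url contract implied
# by the tests above; the real crawlo.proxy.strategies code may differ.
import random


def random_strategy(proxies, request, stats):
    """Pick any configured proxy at random."""
    return random.choice(proxies)['url']


def least_used_strategy(proxies, request, stats):
    """Pick the proxy whose usage counter ('total') is lowest."""
    return min(
        (p['url'] for p in proxies),
        key=lambda url: stats.get(url, {}).get('total', 0),
    )


STRATEGIES = {
    'random': random_strategy,
    'least_used': least_used_strategy,
}

With the fixtures above, least_used_strategy returns 'http://p3:8080' (total=1), matching the assertion in test_least_used_strategy.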
crawlo/downloader/playwright_downloader.py
DELETED
@@ -1,161 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-from typing import Optional, Dict, Any
-from playwright.async_api import Browser, Page, Response as PlaywrightResponse
-from crawlo import Response, Request
-from crawlo.downloader import DownloaderBase
-
-
-class PlaywrightDownloader(DownloaderBase):
-    def __init__(self, crawler):
-        super().__init__(crawler)
-        # Core Playwright objects
-        self.browser: Optional[Browser] = None  # browser instance
-        self.context: Optional[Any] = None  # browser context (isolates cookies, etc.)
-
-        # Configurable parameters (defaults overridden via crawler.settings)
-        self._browser_type: str = "chromium"  # browser type (chromium/firefox/webkit)
-        self._headless: bool = True  # headless mode
-        self._timeout: int = 30000  # operation timeout (ms)
-        self._viewport: Dict[str, int] = {"width": 1280, "height": 720}  # viewport size
-        self._extra_launch_args: Dict[str, Any] = {}  # extra browser launch arguments
-
-    async def _init_browser(self):
-        """Initialize the Playwright browser instance."""
-        from playwright.async_api import async_playwright
-
-        # Start the Playwright engine
-        playwright = await async_playwright().start()
-
-        # Pick the browser type from the configuration
-        browser_launcher = {
-            "chromium": playwright.chromium,
-            "firefox": playwright.firefox,
-            "webkit": playwright.webkit
-        }.get(self._browser_type, playwright.chromium)  # default: chromium
-
-        # Launch the browser (with launch arguments)
-        self.browser = await browser_launcher.launch(
-            headless=self._headless,  # headless switch
-            timeout=self._timeout,  # launch timeout
-            **self._extra_launch_args  # pass-through extras (e.g. proxy settings)
-        )
-
-        # Create a browser context (isolated environment)
-        self.context = await self.browser.new_context(
-            viewport=self._viewport,  # window size
-            user_agent=self.crawler.settings.get("USER_AGENT")  # custom UA
-        )
-
-    def open(self):
-        """Load parameters from the crawler settings."""
-        super().open()  # parent-class initialization
-
-        # Read the configuration (can be overridden in settings.py)
-        self._browser_type = self.crawler.settings.get("PLAYWRIGHT_BROWSER", "chromium")
-        self._headless = self.crawler.settings.get_bool("HEADLESS", True)
-        self._timeout = self.crawler.settings.get_int("PLAYWRIGHT_TIMEOUT", 30000)
-        self._viewport = self.crawler.settings.get_dict("VIEWPORT", {"width": 1280, "height": 720})
-        self._extra_launch_args = self.crawler.settings.get_dict("PLAYWRIGHT_LAUNCH_ARGS", {})
-
-    async def download(self, request: Request) -> Response:
-        """
-        Core download method:
-        1. Open a new page tab
-        2. Load the target URL
-        3. Return the rendered content
-        """
-        if not self.browser:
-            await self._init_browser()  # lazily start the browser
-
-        page = await self.context.new_page()  # one page per request (automatic isolation)
-
-        try:
-            # Set request headers (to mimic a browser)
-            if request.headers:
-                await page.set_extra_http_headers(request.headers)
-
-            # Navigate to the target URL (wait strategy is configurable)
-            response = await page.goto(
-                request.url,
-                timeout=self._timeout,
-                wait_until="domcontentloaded"  # wait strategy: domcontentloaded/networkidle/load
-            )
-
-            # POST requests need special handling (Playwright limitation: use the fetch API)
-            if request.method.lower() == "post":
-                return await self._handle_post_request(request, page)
-
-            # Run custom JavaScript (for extracting dynamic data)
-            if request.meta.get("execute_js"):
-                result = await page.evaluate(request.meta["execute_js"])
-                request.meta["js_result"] = result  # store the JS result
-
-            # Grab the fully rendered HTML (including dynamically generated content)
-            body = await page.content()
-
-            # Take a screenshot in debug mode (for troubleshooting pages)
-            if self.crawler.settings.get_bool("DEBUG"):
-                screenshot = await page.screenshot(type="png")
-                request.meta["screenshot"] = screenshot  # store the screenshot in request.meta
-
-            # Build the unified response object
-            return self._structure_response(request, response, body)
-
-        except Exception as e:
-            self.logger.error(f"Page download failed: {str(e)}")
-            raise
-        finally:
-            await page.close()  # always close the page to avoid leaking resources
-
-    async def _handle_post_request(self, request: Request, page: Page) -> Response:
-        """
-        Special handling for POST requests:
-        send the POST via the in-page fetch API and wait for the response.
-        """
-        async with page.expect_response(request.url) as response_info:
-            # Run fetch inside the page context
-            await page.evaluate(
-                """async ({url, headers, body}) => {
-                    await fetch(url, {
-                        method: 'POST',
-                        headers: headers,
-                        body: body
-                    });
-                }""",
-                {
-                    "url": request.url,
-                    "headers": request.headers or {},
-                    "body": request.body or ""
-                }
-            )
-
-        response = await response_info.value  # the captured response
-        body = await response.text()  # read the response body
-        return self._structure_response(request, response, body)
-
-    @staticmethod
-    def _structure_response(
-            request: Request,
-            response: PlaywrightResponse,
-            body: str
-    ) -> Response:
-        """
-        Normalize the response:
-        convert Playwright's response into crawlo's unified Response object.
-        """
-        return Response(
-            url=str(response.url),  # final URL (after redirects)
-            headers=response.headers,  # response headers
-            status_code=response.status,  # HTTP status code
-            body=body.encode('utf-8'),  # response body (as bytes)
-            request=request  # associated request object
-        )
-
-    async def close(self) -> None:
-        """Resource cleanup: close the browser instance and context."""
-        if self.context:
-            await self.context.close()
-        if self.browser:
-            await self.browser.close()
-        await super().close()  # parent-class cleanup
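For projects that relied on the removed downloader under 1.0.4, its configuration surface was the set of settings keys read in open() above. Roughly (defaults taken from the deleted code; presumably these keys are no longer read once the downloader is gone in 1.0.5):

# settings.py keys read by the removed PlaywrightDownloader (crawlo 1.0.4);
# values shown are the defaults from the deleted code above.
PLAYWRIGHT_BROWSER = "chromium"            # chromium / firefox / webkit
HEADLESS = True                            # headless mode
PLAYWRIGHT_TIMEOUT = 30000                 # launch/navigation timeout in ms
VIEWPORT = {"width": 1280, "height": 720}  # browser-context viewport
PLAYWRIGHT_LAUNCH_ARGS = {}                # passed through to browser launch()
USER_AGENT = None                          # forwarded to new_context(user_agent=...)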
crawlo-1.0.4.dist-info/RECORD
DELETED
@@ -1,79 +0,0 @@
-crawlo/__init__.py,sha256=XOWXajnhT2HVql5cycwGkQ0MS85bpQnFdM7tl0Fusik,327
-crawlo/__version__.py,sha256=acuR_XSJzp4OrQ5T8-Ac5gYe48mUwObuwjRmisFmZ7k,22
-crawlo/crawler.py,sha256=rqKjMLDU6qlm2D2gIhkezF5jFOCz0TgYyq-nS7MEFMU,9237
-crawlo/event.py,sha256=7-y6HNv_EIJSYQNzsj0mVK-Gg4ON3wdQeMdQjfFJPlw,313
-crawlo/exceptions.py,sha256=7dtEJBxb9yvmMJe6MQyDB0LuV9que1J_jQN4QYeyO4g,916
-crawlo/stats_collector.py,sha256=NkO09CB-220qz5rxFcD_dedGfr2VPFrDo4hya0Zh8Qc,1577
-crawlo/subscriber.py,sha256=3d4eYtkSgPj-18-mTZM6RQLSil-ux13FUcmfFxr3sMk,3730
-crawlo/task_manager.py,sha256=AS7Xu_8Q_eb3jg9QSkK_wv6W1rRXaI6WjDp8p6h9ltU,721
-crawlo/core/__init__.py,sha256=PnFyJdVNHBoPmV1sW0AHQXijeoSTQ8cMYrbNM1JK8kA,41
-crawlo/core/engine.py,sha256=OcGsY2ikDNXK4j9VqB0bUHs2v1TyWvSJu2mD1W9CbGc,5872
-crawlo/core/processor.py,sha256=qmCqAeqhwYu-UE86evYesaGt9qpuSIfH-ZIZKcXFCZc,1140
-crawlo/core/scheduler.py,sha256=3pnm5L241aEHnJJArnbCnirooo4wWFgAmnP1tMB049k,1891
-crawlo/downloader/__init__.py,sha256=ukrDBULCaoDWoMLCO3XcQhDoasF0oUzj0PHnJ_ACJaE,2306
-crawlo/downloader/aiohttp_downloader.py,sha256=EVCfbaCUJlTa1ZG32NhxKyi8FVFh-NoK0u57ct1MWos,9844
-crawlo/downloader/httpx_downloader.py,sha256=UQ7u3U_Iy8u1i2q0fDBakfu3C6EBN-T5Q0US6u-Um84,7002
-crawlo/downloader/playwright_downloader.py,sha256=q-Yy-hAaS7attXeAr7HAWAuFO2u1EOp_NdHnUPoDRRA,6566
-crawlo/extension/__init__.py,sha256=LPy9XyCu089k6L6oVENIi_imr75AEuY8QTtSJjRioiw,1139
-crawlo/extension/log_interval.py,sha256=S-hSoiz9GdmgHrac4vDQ52fleoBcH-kzdPUD8YRAons,1922
-crawlo/extension/log_stats.py,sha256=WeSnOoSKB8pI_xmcGdh906XnF1xwo6fgJnf_prElwwI,1742
-crawlo/filters/__init__.py,sha256=BCZl86BHiTfDGRe_b1TlNSr6pfNbMKTu0Uq0j4gX_1Q,977
-crawlo/filters/aioredis_filter.py,sha256=MJT74BeVZTjdExKEzdrWKc7WPXFss1k-txc7E54H77E,5522
-crawlo/filters/memory_filter.py,sha256=bs2WUe7CdHiXgr344vzDqMfBv1b3RwXJMnwxpDb64Pw,6639
-crawlo/items/__init__.py,sha256=JUw4wZX50DidJuCMLkP41ik_wTKum2b8iDxm7EbRRds,2063
-crawlo/items/items.py,sha256=00TdAYChF5Rbbgm6a6d-GCxkx4gXP-rA-_Q7u33BuFI,3990
-crawlo/middleware/__init__.py,sha256=ldaGFNbiJnK9Fx12Vdf9fDNfzXxoETtShp5r-vodtw0,549
-crawlo/middleware/default_header.py,sha256=i_Uj07JObyeZFxL7ZAZmvZsHvA1HGtkNab1sA0d-nWI,1067
-crawlo/middleware/download_delay.py,sha256=2M-TchDA7MwyTfYy0Hzh_bW9wlHlpiP-oQlys7crTj0,966
-crawlo/middleware/middleware_manager.py,sha256=kMqnSf4kltDkRf8PV0Xs9Ple9z-oKKQrMS0Q0_-4vNQ,6489
-crawlo/middleware/request_ignore.py,sha256=QI2z4fUnJ-4xvPTZAmsL-GqR4RFHS1xq9iDr5KFrMco,997
-crawlo/middleware/response_code.py,sha256=tmef2QVl3JCiTMii6VQkASlOY2OyqmOPoOfNxIK1eF8,659
-crawlo/middleware/response_filter.py,sha256=ep8ZxDlfIefi9YqK8dPASEp5TTDRo9QEY_jMceC411s,837
-crawlo/middleware/retry.py,sha256=BV-rYm3WVp8Hcrxc0JUGYfYAwvOWfXTWtatp3S5K9oU,3375
-crawlo/network/__init__.py,sha256=VaD0GmsgDYJ8UMgrtjeOc1Wc7lDGee1uAF3neRpyug0,123
-crawlo/network/request.py,sha256=eyju3BddPU8eNoueY48eqju8I96r9RasmNXXSaUU8dg,7086
-crawlo/network/response.py,sha256=6TO8hvkcgtVVPjQhXI3ywx5E_lV0eGrTBceEi60t55c,6034
-crawlo/pipelines/__init__.py,sha256=Hk-M6X0VCGLp6OEdgnhXGhGhKS5TjKf6dkg8bU9pvUE,260
-crawlo/pipelines/console_pipeline.py,sha256=KABkR3J-rqO0Awox7lizxKR2XuHfVhWPiVRgIybwwu4,1248
-crawlo/pipelines/mongo_pipeline.py,sha256=lv-Zn_mWdE_jVy7Nh30Lzqm3YhtLRV5rMy-m4rBWYe0,4442
-crawlo/pipelines/mysql_batch_pipline.py,sha256=g111iuPTRyKr0q4PHTJYIfsYAFf8CCuyYY6DDLSpMO0,4889
-crawlo/pipelines/mysql_pipeline.py,sha256=ZlRWwZLewG9SBLBZ1wWNZ8yAj5xWWitb7BKRSrqEWtI,7857
-crawlo/pipelines/pipeline_manager.py,sha256=JIoX5D-oDfUT7VJrb5m355wi43SChb4nNb09z_0F4_g,2118
-crawlo/settings/__init__.py,sha256=xsukVKn_h2Hopm1Nj-bXkhbfyS62QTTvJi7fhZUwR9M,123
-crawlo/settings/default_settings.py,sha256=zNMVMo_9s1DGr1TiPzwZjSmxuD4qj_JT_oCCmkoMfjs,2579
-crawlo/settings/setting_manager.py,sha256=SxKB1aCWh4OySM_bH9cYng9I3PAmrSP-Q8XOZEWEwbI,2899
-crawlo/spider/__init__.py,sha256=pP_TChnozpHeuS87Bs-Sj31hb0R7glYN3K6BsRw4FOA,905
-crawlo/templates/item_template.tmpl,sha256=bo0cjaFOT1jMrtLjXs6z7Mhwev-s3037suD4BL2_ji4,351
-crawlo/templates/spider_template.tmpl,sha256=dDMOa_17uWKB3FopYrDYLMYhHGaYevm1hI9AVeY6QAg,805
-crawlo/templates/project_template/main.py,sha256=RbGWsdtpOTPMD-jL80sDqu8C-TgI9qrrwJZ8NeK0PZM,594
-crawlo/templates/project_template/setting.py,sha256=NjP9KuhL3pBtRQfC4djBFq4CvBR4H1_OSVDfWMZITh0,9206
-crawlo/templates/project_template/items/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlo/templates/project_template/spiders/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crawlo/utils/__init__.py,sha256=BDORpyjMN7VGPKImnCDKSkprS-petgD7ezc9rMlBvb0,123
-crawlo/utils/concurrency_manager.py,sha256=nBrHlrKqGENINDA6zrbpK0jCbcjWqobI10vYy9Sg3wU,5106
-crawlo/utils/date_tools.py,sha256=9OVJB66_0BvRq-lwUE4JYcd6J5RGADW3lcKEWS6lCi0,5319
-crawlo/utils/func_tools.py,sha256=WUZEGpWMuDDX7g-QySM7iaiC74erW2SSkZoUvDw1NjM,2369
-crawlo/utils/log.py,sha256=vNHCIw9owCZ4voNM_hni7fOEyt9kKoOhIrjvl76lKQg,989
-crawlo/utils/pqueue.py,sha256=4Ymkm38fRFqEcSJeD_ULkuBaCk6QdYvJdnYvtJjh-Tk,5386
-crawlo/utils/project.py,sha256=qRErB6Ru81-PpSnT9g2ZPyfbWCwZ8hygpMAWhwIhC_M,1485
-crawlo/utils/request.py,sha256=ftnyr6f--StdcYju5FaU_khRARaxMMktJS8wROA1Fe0,4119
-crawlo/utils/system.py,sha256=HvWV1acxou0Rn0L7pNq4CnV_GWFeU0Tmjy3_nLD8M64,237
-crawlo/utils/tools.py,sha256=isRvzTMU3n1FWMhzTAt-7TVrHmH8JPUwgWyVel71Wj0,9462
-crawlo/utils/url.py,sha256=rlgX2VlJv6JvLmCDTsbxzMSXE6R5ZL_0dLEqprsA-JU,1482
-tests/__init__.py,sha256=6i631BPnS_TR_BWUjtjB5CBO-zv9kRkwQTQvSya2wHE,123
-tests/baidu_spider/__init__.py,sha256=xlj0-TBQBhcKglllla_bQbufNiv10UFE0KsWMLvzFz4,123
-tests/baidu_spider/demo.py,sha256=MTEHkm7U4Kyx5QULCgR6to391xn4XPay6fmuV1c1uRc,24278
-tests/baidu_spider/items.py,sha256=vkbdnCw4tjYLmCL4oDIUxDNCgpYNqZHEG-6lVN3qfvI,443
-tests/baidu_spider/middleware.py,sha256=I71ZMmWTiDBFq4t2zfTE7IIXCqwaaeQ1DvKGW70q2Yg,1397
-tests/baidu_spider/pipeline.py,sha256=TUK_LnrU818UYmCn2_gKeNaTZjaj9qjrlndRLsR4wf0,1437
-tests/baidu_spider/request_fingerprints.txt,sha256=TJAuFJZZ_uvYExfruA9bEsIiArz86vxe95QoF2lbnfE,585
-tests/baidu_spider/run.py,sha256=YVe9qwn-2XBRRoZdUnwPRrWlBO5YAmKnyLRI3RpfogE,646
-tests/baidu_spider/settings.py,sha256=EenFOFgupwnn7HIySKSHBgP9--qxxkiWgIi2NDltXRw,2811
-tests/baidu_spider/spiders/__init__.py,sha256=eJ_ih4GiGfwQzPILeouy1Hnc4BrPz0KNPYlLHYvrvoc,123
-tests/baidu_spider/spiders/bai_du.py,sha256=pw4WccbmBR07CuSqCgm_7x9SH63FDJS_sXSaN5Ew5Tw,1589
-tests/baidu_spider/spiders/sina.py,sha256=BKQGJiCS8aiZ2f27C99WcK90QQJwgUY-vS4fUaQSdIQ,2456
-crawlo-1.0.4.dist-info/METADATA,sha256=dzEuRJVuBVSeKTQeEvOXRhfRcyjhcZqJFlPWivAZ9UE,1743
-crawlo-1.0.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-crawlo-1.0.4.dist-info/entry_points.txt,sha256=GD9PBhKQN83EaxPYtz7NhcGeZeh3bdr2jWbTixOs-lw,59
-crawlo-1.0.4.dist-info/top_level.txt,sha256=bKtfejkszFTNHm7Z6aqtt0AUG8DdeNeL4AoZsg4XdZY,13
-crawlo-1.0.4.dist-info/RECORD,,
tests/baidu_spider/__init__.py
DELETED
tests/baidu_spider/demo.py
DELETED
@@ -1,94 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-07-14 13:04
-# @Author : crawl-coder
-# @Desc : None
-"""
-import asyncio
-import asyncmy
-from datetime import datetime
-import logging
-
-from settings import MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_TABLE, MYSQL_DB
-
-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
-logger = logging.getLogger(__name__)
-
-# Test data
value_tuples = [('3250219', '煤炭开采板块午后直线拉升 郑州煤电涨停', '煤炭开采板块午后直线拉升,()涨停,()、()、()、()、()等纷纷走高。消息面上,中国煤炭运销协会表示,加强行业自律,整治内卷式竞争,促进煤炭市场供需平衡。', '煤炭开采板块午后直线拉升,郑州煤电涨停,山煤国际、辽宁能源、华电能源、恒源煤电、盘江股份等纷纷走高。消息面上,中国煤炭运销协会表示,加强行业自律,整治内卷式竞争,促进煤炭市场供需平衡。', 'https://news.10jqka.com.cn/20250714/c669607368.shtml', '异动', '2025-07-14 13:04:07', ''), ('3250025', '现货白银日内涨幅扩大至1.00%,现报38.77美元/盎司', '现货白银日内涨幅扩大至1.00%,现报38.77美元/盎司。', '现货白银日内涨幅扩大至1.00%,现报38.77美元/盎司。', 'https://news.10jqka.com.cn/20250714/c669606752.shtml', 'A股', '2025-07-14 11:59:10', ''), ('3250061', '7月14日午间公告一览:晨化股份使用3000万元闲置资金购买理财产品', '()公告,公司及子公司近日使用自有闲置资金购买了3000万元的理财产品。此次投资包括两个金额各为1500万元的银河盛汇稳健1号集合资产管理计划,预期年化收益率为3.2%。该理财产品的起息日为2025年7月10日,到期日为2026年7月6日。公司表示,此举是在确保日常运营和资金安全的前提下进行的,不会影响公司主营业务的正常开展,旨在提高资金使用效率,获取较好的投资回报。', '晨化股份公告,公司及子公司近日使用自有闲置资金购买了3000万元的理财产品。此次投资包括两个金额各为1500万元的银河盛汇稳健1号集合资产管理计划,预期年化收益率为3.2%。该理财产品的起息日为2025年7月10日,到期日为2026年7月6日。公司表示,此举是在确保日常运营和资金安全的前提下进行的,不会影响公司主营业务的正常开展,旨在提高资金使用效率,获取较好的投资回报。', 'https://news.10jqka.com.cn/20250714/c669606926.shtml', '公告,A股', '2025-07-14 12:15:33', ''), ('3250123', '鸿蒙智行问界产品总监彭磊调任享界系列产品总监', '7月14日,鸿蒙智行App显示,彭磊认证信息已变更为享界系列产品总监。此前担任鸿蒙智行问界产品总监的彭磊今天开始在微博发文预热享界产品,宣传鸿蒙智行享界第二款车享界S9T。不过,他的微博认证未同步更新,仍为鸿蒙智行问界产品总监。', '7月14日,鸿蒙智行App显示,彭磊认证信息已变更为享界系列产品总监。此前担任鸿蒙智行问界产品总监的彭磊今天开始在微博发文预热享界产品,宣传鸿蒙智行享界第二款车享界S9T。不过,他的微博认证未同步更新,仍为鸿蒙智行问界产品总监。', 'https://news.10jqka.com.cn/20250714/c669607077.shtml', '港股,A股', '2025-07-14 12:34:46', ''), ('3250153', '韩国前总统尹锡悦再次未出席内乱特检组传唤调查', '记者获悉,韩国前总统尹锡悦14日再次未出席内乱特检组的传唤调查。(央视新闻)', '记者获悉,韩国前总统尹锡悦14日再次未出席内乱特检组的传唤调查。(央视新闻)', 'https://news.10jqka.com.cn/20250714/c669607226.shtml', '美股,港股,A股', '2025-07-14 12:48:40', ''), ('3250135', '立讯精密旗下立铠精密在黄石成立新公司', '企查查APP显示,近日,立铠精密科技(黄石)有限公司成立,法定代表人为杨立强,注册资本5亿元,经营范围包括移动终端设备销售、互联网设备销售、汽车零部件及配件制造等。企查查股权穿透显示,该公司由立铠精密科技(盐城)有限公司全资持股,后者大股东为()。', '企查查APP显示,近日,立铠精密科技(黄石)有限公司成立,法定代表人为杨立强,注册资本5亿元,经营范围包括移动终端设备销售、互联网设备销售、汽车零部件及配件制造等。企查查股权穿透显示,该公司由立铠精密科技(盐城)有限公司全资持股,后者大股东为立讯精密。', 'https://news.10jqka.com.cn/20250714/c669607154.shtml', 'A股', '2025-07-14 12:40:40', ''), ('3250197', '沪深京三市成交额超1万亿元 较上日此时缩量686亿元', '据()iFinD数据,沪深京三市成交额超1万亿元,较上日此时686亿元,预计全天成交金额约1.6万亿元。截至目前,沪市成交额4158亿元,深市成交额5688亿元,北证50成交额154亿元。', '据同花顺iFinD数据,沪深京三市成交额超1万亿元,较上日此时缩量686亿元,预计全天成交金额约1.6万亿元。截至目前,沪市成交额4158亿元,深市成交额5688亿元,北证50成交额154亿元。', 'https://news.10jqka.com.cn/20250714/c669607345.shtml', 'A股', '2025-07-14 13:01:38', ''), ('3250129', '伟仕佳杰:公司已开始探索稳定币支付及结算解决方案', '伟仕佳杰在港交所发布公告称,本公司日前已开始进行合作洽谈,以探索稳定币支付及结算的解决方案。该合作讨论主要聚焦本公司在东南亚地区的ICT产品分销、供应链服务及数位生态业务,本次合作讨论不涉及任何加密货币投机行为。该讨论处于初步阶段,且尚未达成任何具法律约束力的协定。', '伟仕佳杰在港交所发布公告称,本公司日前已开始进行合作洽谈,以探索稳定币支付及结算的解决方案。该合作讨论主要聚焦本公司在东南亚地区的ICT产品分销、供应链服务及数位生态业务,本次合作讨论不涉及任何加密货币投机行为。该讨论处于初步阶段,且尚未达成任何具法律约束力的协定。', 'https://news.10jqka.com.cn/20250714/c669607132.shtml', '港股', '2025-07-14 12:37:13', ''), ('3250065', '海南航空发声明打假', '海南航空今日在网络社交平台发布声明。声明称,近期发现部分网络平台及线下场所存在商户和个人未经授权,擅自盗用海南航空商标或公司名称,公然冒充公司员工或关联方,非法开展销售活动。所售商品涵盖空乘制服、飞机模型、机上用品、周边文创产品等多个品类。更有甚者,疑似穿着海南航空享有专利保护的乘务员制服,在直播平台上大肆赚取流量、非法牟利,严重扰乱市场正常秩序。声明表示,“海南航空官方商品及服务仅通过海南航空官网、官方App、经我司正式授权的直营渠道以及指定的合作方进行发布与销售。任何单位或个人均无权擅自使用我司依法享有的商标、名称、专利、著作权等知识产权,更不得假冒我司员工身份开展任何商业活动。”声明还指出,如在日常生活或网络浏览过程中遇到可疑情况,可随时拨打海南航空服务热线95339进行咨询核实,或通过官方网站在线客服反馈相关线索。如未经核实在非正规渠道购买海南航空服务或相关产品,海南航空将不会履行相关义务或承担任何赔偿责任。', '海南航空今日在网络社交平台发布声明。声明称,近期发现部分网络平台及线下场所存在商户和个人未经授权,擅自盗用海南航空商标或公司名称,公然冒充公司员工或关联方,非法开展销售活动。所售商品涵盖空乘制服、飞机模型、机上用品、周边文创产品等多个品类。更有甚者,疑似穿着海南航空享有专利保护的乘务员制服,在直播平台上大肆赚取流量、非法牟利,严重扰乱市场正常秩序。 声明表示,“海南航空官方商品及服务仅通过海南航空官网、官方 App、经我司正式授权的直营渠道以及指定的合作方进行发布与销售。任何单位或个人均无权擅自使用我司依法享有的商标、名称、专利、著作权等知识产权,更不得假冒我司员工身份开展任何商业活动。” 声明还指出,如在日常生活或网络浏览过程中遇到可疑情况,可随时拨打海南航空服务热线 95339 
进行咨询核实,或通过官方网站在线客服反馈相关线索。如未经核实在非正规渠道购买海南航空服务或相关产品,海南航空将不会履行相关义务或承担任何赔偿责任。', 'https://news.10jqka.com.cn/20250714/c669606930.shtml', '美股,港股,A股', '2025-07-14 12:15:49', ''), ('3250067', '陕西科研团队光镊切片显微术破解悬浮细胞三维观测难题 成果有望用于医学成像等领域', '据中国科学院西安光机所消息,该所超快光科学与技术全国重点实验室研究员姚保利、徐孝浩团队在生物光学显微成像及微操纵方面取得重要进展,提出了“光镊切片显微术”,实现了悬浮生物细胞的全光式三维成像,为光镊技术开拓了新应用方向,在类器官构建、三维中性原子阵列组装、细胞自动化识别及分选方面具有巨大应用潜力。上述成果在国际顶级学术期刊《科学进展》发表。(科创板日报)', '据中国科学院西安光机所消息,该所超快光科学与技术全国重点实验室研究员姚保利、徐孝浩团队在生物光学显微成像及微操纵方面取得重要进展,提出了“光镊切片显微术”,实现了悬浮生物细胞的全光式三维成像,为光镊技术开拓了新应用方向,在类器官构建、三维中性原子阵列组装、细胞自动化识别及分选方面具有巨大应用潜力。上述成果在国际顶级学术期刊《科学进展》发表。(科创板日报)', 'https://news.10jqka.com.cn/20250714/c669606940.shtml', '港股,A股', '2025-07-14 12:16:39', ''), ('3250121', '公安部:2025年上半年全国机动车达4.6亿辆 驾驶人达5.5亿人', '公安部统计,截至2025年6月底,全国机动车保有量达4.6亿辆,其中汽车3.59亿辆;机动车驾驶人5.5亿人,其中汽车驾驶人5.15亿人。2025年上半年全国新注册登记机动车1688万辆,新领证驾驶人1258万人。上半年新注册登记机动车1688万辆,新注册登记汽车1250万辆。2025年上半年,全国新注册登记机动车1688万辆。其中,汽车新注册登记1250万辆,同比增长0.68%。新能源汽车保有量达3689万辆,上半年新注册登记562.2万辆。截至6月底,全国新能源汽车保有量达3689万辆,占汽车总量的10.27%。其中,纯电动汽车保有量2553.9万辆,占新能源汽车总量的69.23%。上半年新注册登记新能源汽车562.2万辆,同比增长27.86%,创历史新高。新能源汽车新注册登记量占汽车新注册登记量的44.97%。(央视新闻)', '公安部统计,截至2025年6月底,全国机动车保有量达4.6亿辆,其中汽车3.59亿辆;机动车驾驶人5.5亿人,其中汽车驾驶人5.15亿人。2025年上半年全国新注册登记机动车1688万辆,新领证驾驶人1258万人。上半年新注册登记机动车1688万辆,新注册登记汽车1250万辆。2025年上半年,全国新注册登记机动车1688万辆。其中,汽车新注册登记1250万辆,同比增长0.68%。新能源汽车保有量达3689万辆,上半年新注册登记562.2万辆。截至6月底,全国新能源汽车保有量达3689万辆,占汽车总量的10.27%。其中,纯电动汽车保有量2553.9万辆,占新能源汽车总量的69.23%。上半年新注册登记新能源汽车562.2万辆,同比增长27.86%,创历史新高。新能源汽车新注册登记量占汽车新注册登记量的44.97%。(央视新闻)', 'https://news.10jqka.com.cn/20250714/c669607063.shtml', '港股,A股', '2025-07-14 12:33:07', ''), ('3250099', '全国首单低空运营管理责任险落地苏州', '全国首个面向低空经济运营管理方的保险“安翼计划”近日在苏州落地。该保险方案由苏州东吴财产保险股份有限公司联合中国财产再保险有限责任公司研发,为苏州市盛泽湖全空间无人体系示范岛的试飞运营场地管理方提供风险保障。“安翼计划”针对低空经济运营中特有的信号干扰、操作失误、外部撞击等风险场景,提供精细化、专业化风险解决方案,将保险保障聚焦于低空运营管理方,系统构建了覆盖场地方责任、低空运营管理风险的完整保障体系。(新华日报)', '全国首个面向低空经济运营管理方的保险“安翼计划”近日在苏州落地。该保险方案由苏州东吴财产保险股份有限公司联合中国财产再保险有限责任公司研发,为苏州市盛泽湖全空间无人体系示范岛的试飞运营场地管理方提供风险保障。“安翼计划”针对低空经济运营中特有的信号干扰、操作失误、外部撞击等风险场景,提供精细化、专业化风险解决方案,将保险保障聚焦于低空运营管理方,系统构建了覆盖场地方责任、低空运营管理风险的完整保障体系。(新华日报)', 'https://news.10jqka.com.cn/20250714/c669607049.shtml', '美股,港股,A股', '2025-07-14 12:30:52', ''), ('3250159', '韩正会见印度外长苏杰生', '据新华社报道,7月14日,国家副主席韩正在北京会见印度外长苏杰生。韩正表示,去年10月,习近平主席同莫迪总理在喀山成功会晤,引领中印关系重启再出发。中印都是发展中大国、全球南方重要成员,做相互成就的伙伴,实现“龙象共舞”,是双方正确选择。双方要进一步落实两国领导人达成的重要共识,坚持高层引领、稳步推进务实合作、尊重彼此关切,推动中印关系持续健康稳定发展。苏杰生表示,莫迪总理同习近平主席喀山会晤后,印中关系稳步改善。印方愿以领导人共识为指引,保持印中关系发展势头,推进互利合作,加强在多边机制中的沟通协调。印方支持中方作为轮值主席国办好今年的上海合作组织峰会。', '据新华社报道,7月14日,国家副主席韩正在北京会见印度外长苏杰生。韩正表示,去年10月,习近平主席同莫迪总理在喀山成功会晤,引领中印关系重启再出发。中印都是发展中大国、全球南方重要成员,做相互成就的伙伴,实现“龙象共舞”,是双方正确选择。双方要进一步落实两国领导人达成的重要共识,坚持高层引领、稳步推进务实合作、尊重彼此关切,推动中印关系持续健康稳定发展。苏杰生表示,莫迪总理同习近平主席喀山会晤后,印中关系稳步改善。印方愿以领导人共识为指引,保持印中关系发展势头,推进互利合作,加强在多边机制中的沟通协调。印方支持中方作为轮值主席国办好今年的上海合作组织峰会。', 'https://news.10jqka.com.cn/20250714/c669607238.shtml', '美股,港股,A股', '2025-07-14 12:50:33', ''), ('3250059', '泰国机构将2025年到访游客人数预估从4000万人次下调至3500万人次', '泰国机构将2025年到访游客人数预估从4000万人次下调至3500万人次。', '泰国机构将2025年到访游客人数预估从4000万人次下调至3500万人次。', 'https://news.10jqka.com.cn/20250714/c669606902.shtml', 'A股', '2025-07-14 12:13:04', ''), ('3250131', '宇树科技获得出版物零售许可', '天眼查App显示,7月4日,杭州宇树科技股份有限公司新增一则行政许可信息,许可内容为出版物零售,许可机关为区党委宣传部(区新闻出版局)。值得注意的是,今年5月,该公司经营范围新增出版物零售。', '天眼查App显示,7月4日,杭州宇树科技股份有限公司新增一则行政许可信息,许可内容为出版物零售,许可机关为区党委宣传部(区新闻出版局)。值得注意的是,今年5月,该公司经营范围新增出版物零售。', 'https://news.10jqka.com.cn/20250714/c669607142.shtml', '港股,A股', '2025-07-14 12:38:20', ''), ('3250079', '韩特检组对军方24处场所展开扣押搜查,正式调查尹锡悦外患罪嫌疑', '韩国内乱特检组于7月14日对军方无人机作战司令部、国防部、国防部防间谍司令部等24处军事相关场所进行了全方位的扣押搜查,对前总统尹锡悦涉嫌外患罪的情况展开正式调查。搜查地点还包括国家安保室、驻白翎岛部队、无人机作战司令部司令金龙大的住宅等。(央视新闻)', 
'韩国内乱特检组于7月14日对军方无人机作战司令部、国防部、国防部防间谍司令部等24处军事相关场所进行了全方位的扣押搜查,对前总统尹锡悦涉嫌外患罪的情况展开正式调查。搜查地点还包括国家安保室、驻白翎岛部队、无人机作战司令部司令金龙大的住宅等。(央视新闻)', 'https://news.10jqka.com.cn/20250714/c669607015.shtml', '美股,港股,A股', '2025-07-14 12:26:27', ''), ('3250125', '中国中铁在秦皇岛成立新公司 含海洋服务业务', '企查查APP显示,近日,中铁大桥局集团(秦皇岛)供应链管理有限公司成立,法定代表人为何士博,注册资本为100万元,经营范围包含:供应链管理服务;工程管理服务;海洋服务;海洋工程装备销售;金属结构销售等。企查查股权穿透显示,该公司由()间接全资持股。', '企查查APP显示,近日,中铁大桥局集团(秦皇岛)供应链管理有限公司成立,法定代表人为何士博,注册资本为100万元,经营范围包含:供应链管理服务;工程管理服务;海洋服务;海洋工程装备销售;金属结构销售等。企查查股权穿透显示,该公司由中国中铁间接全资持股。', 'https://news.10jqka.com.cn/20250714/c669607089.shtml', '港股,A股', '2025-07-14 12:35:49', ''), ('3250163', '河南发布高温红色预警 部分地区将升至40℃以上', '河南省气象台7月14日12时升级发布高温红色预警:预计7月14日下午,新乡中西部、焦作、洛阳北部、郑州、开封、商丘西部、许昌东部、周口、漯河东部、驻马店部分县(市、区)最高气温将升至40℃以上。7月15日白天,黄河以北和洛阳北部、郑州、开封、商丘西部、许昌、周口北部、漯河北部部分县(市、区)最高气温将升至40℃以上。7月16日白天,黄河以南部分县(市、区)最高气温将升至40℃以上。(央视新闻)', '河南省气象台7月14日12时升级发布高温红色预警:预计7月14日下午,新乡中西部、焦作、洛阳北部、郑州、开封、商丘西部、许昌东部、周口、漯河东部、驻马店部分县(市、区)最高气温将升至40℃以上。7月15日白天,黄河以北和洛阳北部、郑州、开封、商丘西部、许昌、周口北部、漯河北部部分县(市、区)最高气温将升至40℃以上。7月16日白天,黄河以南部分县(市、区)最高气温将升至40℃以上。(央视新闻)', 'https://news.10jqka.com.cn/20250714/c669607268.shtml', '港股,A股', '2025-07-14 12:53:40', ''), ('3250169', '平均每天10班船驶向欧美 从深圳盐田港看中国外贸“加速度”', '海关统计,今年上半年,我国货物贸易进出口总值21.79万亿元,同比增长2.9%。广东深圳()是世界最大的单体集装箱码头之一,每天都有超过4万个标准集装箱从这里发往全球,是名副其实的“世界级航运枢纽”。深圳海关数据显示,今年上半年,盐田港集装箱吞吐量超758万标箱,同比增长12.7%,实现2位数增长。不仅如此,盐田港今年还新增了11条航线,目前每周近百条航线连通全球,平均每天就有4班船驶向欧洲、6班船前往美国。可以说,深圳用稳定高效的港口体系,为外贸企业跑出“加速度”提供了坚实支撑。(央视)', '海关统计,今年上半年,我国货物贸易进出口总值21.79万亿元,同比增长2.9%。广东深圳盐田港是世界最大的单体集装箱码头之一,每天都有超过4万个标准集装箱从这里发往全球,是名副其实的“世界级航运枢纽”。深圳海关数据显示,今年上半年,盐田港集装箱吞吐量超758万标箱,同比增长12.7%,实现2位数增长。不仅如此,盐田港今年还新增了11条航线,目前每周近百条航线连通全球,平均每天就有4班船驶向欧洲、6班船前往美国。可以说,深圳用稳定高效的港口体系,为外贸企业跑出“加速度”提供了坚实支撑。(央视)', 'https://news.10jqka.com.cn/20250714/c669607289.shtml', '港股,A股', '2025-07-14 12:56:18', ''), ('3250035', '港股午评:恒生指数涨0.11%,恒生科技指数涨0.2%', '港股午间收盘,恒生指数涨0.11%,恒生科技指数涨0.2%。比特币屡创新高,港股加密货币概念股冲高,欧科云链涨超30%,雄岸科技涨超20%,新火科技控股涨超9%。', '港股午间收盘,恒生指数涨0.11%,恒生科技指数涨0.2%。比特币屡创新高,港股加密货币概念股冲高,欧科云链涨超30%,雄岸科技涨超20%,新火科技控股涨超9%。', 'https://news.10jqka.com.cn/20250714/c669606787.shtml', '港股,A股', '2025-07-14 12:01:55', '')]
-async def create_pool():
-    """Create the AsyncMy connection pool."""
-    return await asyncmy.create_pool(
-        host=MYSQL_HOST,
-        user=MYSQL_USER,
-        password=MYSQL_PASSWORD,
-        db=MYSQL_DB,
-        minsize=1,
-        maxsize=5,
-        autocommit=False,
-        connect_timeout=10,
-        read_timeout=30,
-        charset='utf8mb4'
-    )
-
-
-async def batch_insert(pool, chunk_size=100):
-    """Batch-insert the data."""
-    sql = """
-        INSERT INTO `articles`
-        (`article_id`, `title`, `digest`, `short`, `url`, `tag`, `ctime`, `source`)
-        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
-    """
-
-    total_items = len(value_tuples)
-    success_count = 0
-    start_time = datetime.now()
-
-    try:
-        async with pool.acquire() as conn:
-            async with conn.cursor() as cursor:
-                try:
-                    affected_rows = await cursor.executemany(sql, value_tuples)
-                    print(affected_rows)
-                    await conn.commit()
-                    success_count += len(value_tuples)
-                    logger.info(f"Inserted {len(value_tuples)} records, running total: {success_count}/{total_items}")
-                except asyncmy.errors.Error as e:
-                    await conn.rollback()
-                    logger.error(f"Batch insert failed: {e}, failed records: {len(value_tuples)}")
-                    raise
-
-    except Exception as e:
-        logger.error(f"Error during batch insert: {e}")
-        raise
-    finally:
-        elapsed = (datetime.now() - start_time).total_seconds()
-        logger.info(f"Batch insert finished. Succeeded: {success_count}, failed: {total_items - success_count}, elapsed: {elapsed:.2f}s")
-
-
-async def main():
-    try:
-        # Create the connection pool
-        pool = await create_pool()
-        logger.info("Database connection pool created")
-
-        # Run the batch insert
-        await batch_insert(pool, chunk_size=50)
-
-    except Exception as e:
-        logger.error(f"Main program error: {e}")
-    finally:
-        # Close the connection pool
-        if 'pool' in locals():
-            pool.close()
-            await pool.wait_closed()
-            logger.info("Database connection pool closed")
-
-
-if __name__ == '__main__':
-    asyncio.run(main())
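A detail worth noting in the removed script: batch_insert accepts a chunk_size argument but never uses it, so the whole value_tuples list goes to MySQL in a single executemany call. A chunked variant using the same asyncmy pool API would look roughly like this (a sketch, not part of the package):

# Sketch: chunked variant of the removed batch_insert; same asyncmy pool API,
# but the rows are split into chunk_size-sized slices instead of one big call.
async def batch_insert_chunked(pool, rows, sql, chunk_size=100):
    inserted = 0
    async with pool.acquire() as conn:
        async with conn.cursor() as cursor:
            for start in range(0, len(rows), chunk_size):
                chunk = rows[start:start + chunk_size]
                await cursor.executemany(sql, chunk)  # one round trip per chunk
                inserted += len(chunk)
            await conn.commit()  # commit once, after all chunks succeed
    return inserted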
tests/baidu_spider/items.py
DELETED
@@ -1,25 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-05-11 13:35
-# @Author : oscar
-# @Desc : None
-"""
-from crawlo.items import Field
-from crawlo.items.items import Item
-
-
-class BauDuItem(Item):
-    url = Field()
-    title = Field()
-
-
-class ArticleItem(Item):
-    article_id = Field()
-    title = Field()
-    digest = Field()
-    short = Field()
-    url = Field()
-    tag = Field()
-    ctime = Field()
-    source = Field()
tests/baidu_spider/middleware.py
DELETED
@@ -1,49 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-05-17 11:21
-# @Author : crawl-coder
-# @Desc : None
-"""
-import asyncio
-import random
-
-from crawlo.exceptions import IgnoreRequestError
-from crawlo.middleware import BaseMiddleware
-
-
-class TestMiddleWare(BaseMiddleware):
-
-    async def process_request(self, request, spider):
-        # Request pre-processing
-        # print('process_request', request, spider)
-        # if random.randint(1, 5) == 1:
-        #     raise IgnoreRequestError('invalid url')
-        pass
-
-    def process_response(self, request, response, spider):
-        # Response pre-processing
-        # print('process_response', response, response, spider)
-        return response
-
-    def process_exception(self, request, exception, spider):
-        # Exception pre-processing
-        # print('process_exception', request, exception, spider)
-        pass
-
-
-class TestMiddleWare2(BaseMiddleware):
-    def process_request(self, request, spider):
-        # Request pre-processing
-        # print('process_request2', request, spider)
-        pass
-
-    def process_response(self, request, response, spider):
-        # Response pre-processing
-        # print('process_response2', response, response, spider)
-        return response
-
-    def process_exception(self, request, exception, spider):
-        # Exception pre-processing
-        # print('process_exception2', request, exception, spider)
-        pass
tests/baidu_spider/pipeline.py
DELETED
@@ -1,55 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-import pymongo
-
-from motor.motor_asyncio import AsyncIOMotorClient
-
-from random import randint
-
-from crawlo.event import spider_closed
-from crawlo.exceptions import ItemDiscard
-from crawlo.utils.log import get_logger
-
-
-class TestPipeline(object):
-
-    async def process_item(self, item, spider):
-        if randint(1, 3) == 1:
-            raise ItemDiscard('duplicate data')
-        return item
-
-    @classmethod
-    def create_instance(cls, *args, **kwargs):
-        return cls()
-
-
-class MongoPipeline(object):
-
-    def __init__(self, conn, col):
-        self.conn = conn
-        self.col = col
-
-        self.logger = get_logger(self.__class__.__name__)
-
-    @classmethod
-    def create_instance(cls, crawler):
-        settings = crawler.settings
-        mongo_params = settings.get('MONGODB_PARAMS', None)
-        db_name = settings.get('MONGODB_DB', None)
-        project_name = settings.get('PROJECT_NAME', None)
-
-        conn = AsyncIOMotorClient(**mongo_params) if mongo_params else AsyncIOMotorClient()
-
-        col = conn[db_name][project_name]
-        o = cls(conn, col)
-        crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
-        return o
-
-    async def process_item(self, item, spider):
-        await self.col.insert_one(item.to_dict())
-        return item
-
-    async def spider_closed(self):
-        self.logger.info('MongoDB closed.')
-        self.conn.close()
-
tests/baidu_spider/request_fingerprints.txt
DELETED
@@ -1,9 +0,0 @@
-48e727ce8566d65e5233eaac29498b03e2908dd78a90dad7fdd7510e8ada9e32
-c5e78d04ca9b1113e6a8076792aaa7b09ff4b040bd790c684689745aa7edb1ae
-8287f5ad5e2f06687e88cc31d64fdbd3b1b56cee71fbc3344ad8cea852ea9dd3
-f84661b1d15a6e96c6a77a6484c173be3fb502f73e256e8f72f98982674a7992
-f5c1693afa1293e758331a8e95aa6277ffa49105ccd0d79115d8e85375863adc
-f6f2175b1ae909ac0dd41aa2ed735b8305dde6f92d51dd2e411a0c695cfc4843
-3ba6793c55838d267567f6b65b3406bbad30e89e187d3fbe88e6ae55db24dd9d
-488f1d28fe532f1113f634dfa58a2bccae1d34af5421e4064e2ae024d5010280
-dbf802098ea25af78c5751fdc750624296e79d9e1d968e33e5956860ebb5ecc7
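The nine entries above are SHA-256 hex digests, apparently the persisted request-deduplication fingerprints of the removed example project. The fingerprinting logic itself lives in crawlo/utils/request.py (reworked in this release, +267 -122); a fingerprint of this shape is typically derived roughly as follows (a generic sketch, not necessarily crawlo's exact recipe):

# Sketch of how a SHA-256 request fingerprint of this shape is commonly built
# (method + URL + body); crawlo's actual recipe in crawlo/utils/request.py
# may include or order fields differently.
import hashlib


def request_fingerprint(method: str, url: str, body: bytes = b"") -> str:
    h = hashlib.sha256()
    h.update(method.upper().encode())
    h.update(url.encode())
    h.update(body)
    return h.hexdigest()  # 64 hex characters, like the entries above


print(request_fingerprint("GET", "https://www.baidu.com/"))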
tests/baidu_spider/run.py
DELETED
@@ -1,27 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-02-05 13:12
-# @Author : oscar
-# @Desc : None
-"""
-import asyncio
-from crawlo.crawler import CrawlerProcess
-
-# from crawlo.utils import system as _
-from tests.baidu_spider.spiders.bai_du import BaiDuSpider
-from crawlo.utils.project import get_settings
-from tests.baidu_spider.spiders.sina import SinaSpider
-
-
-async def main():
-    settings = get_settings()
-    process = CrawlerProcess(settings)
-    # await process.crawl(BaiDuSpider)
-    await process.crawl(SinaSpider)
-
-    await process.start()
-
-if __name__ == '__main__':
-    asyncio.run(main())
-    # watched up to episode 18