crawlo 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +34 -34
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +285 -285
- crawlo/commands/startproject.py +196 -196
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +279 -279
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +171 -171
- crawlo/core/enhanced_engine.py +189 -189
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +166 -162
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +242 -242
- crawlo/downloader/aiohttp_downloader.py +212 -212
- crawlo/downloader/cffi_downloader.py +251 -251
- crawlo/downloader/httpx_downloader.py +259 -257
- crawlo/event.py +11 -11
- crawlo/exceptions.py +82 -78
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +34 -34
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +242 -242
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +248 -248
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +125 -125
- crawlo/mode_manager.py +200 -200
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +311 -311
- crawlo/network/response.py +271 -269
- crawlo/pipelines/__init__.py +22 -13
- crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +225 -0
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +116 -0
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/pipelines/redis_dedup_pipeline.py +163 -0
- crawlo/project.py +153 -153
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +307 -303
- crawlo/queue/redis_priority_queue.py +208 -191
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +245 -226
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +86 -86
- crawlo/templates/project/pipelines.py.tmpl +341 -335
- crawlo/templates/project/run.py.tmpl +251 -238
- crawlo/templates/project/settings.py.tmpl +250 -247
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +177 -177
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/controlled_spider_mixin.py +439 -335
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/METADATA +635 -567
- crawlo-1.1.3.dist-info/RECORD +113 -0
- examples/__init__.py +7 -7
- examples/controlled_spider_example.py +205 -0
- tests/__init__.py +7 -7
- tests/test_final_validation.py +153 -153
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_redis_config.py +28 -28
- tests/test_redis_queue.py +224 -224
- tests/test_request_serialization.py +70 -70
- tests/test_scheduler.py +241 -241
- crawlo-1.1.2.dist-info/RECORD +0 -108
- {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
crawlo/utils/url.py
CHANGED
|
@@ -1,40 +1,40 @@
|
|
|
1
|
-
from urllib.parse import urldefrag
|
|
2
|
-
from w3lib.url import add_or_replace_parameter
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def escape_ajax(url: str) -> str:
|
|
6
|
-
"""
|
|
7
|
-
根据Google AJAX爬取规范转换URL(处理哈希片段#!):
|
|
8
|
-
https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
|
9
|
-
|
|
10
|
-
规则说明:
|
|
11
|
-
1. 仅当URL包含 `#!` 时才转换(表示这是AJAX可爬取页面)
|
|
12
|
-
2. 将 `#!key=value` 转换为 `?_escaped_fragment_=key%3Dvalue`
|
|
13
|
-
3. 保留原始查询参数(如果有)
|
|
14
|
-
|
|
15
|
-
示例:
|
|
16
|
-
>>> escape_ajax("www.example.com/ajax.html#!key=value")
|
|
17
|
-
'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
|
|
18
|
-
>>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
|
|
19
|
-
'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
|
|
20
|
-
>>> escape_ajax("www.example.com/ajax.html#!")
|
|
21
|
-
'www.example.com/ajax.html?_escaped_fragment_='
|
|
22
|
-
|
|
23
|
-
非AJAX可爬取的URL(无#!)原样返回:
|
|
24
|
-
>>> escape_ajax("www.example.com/ajax.html#normal")
|
|
25
|
-
'www.example.com/ajax.html#normal'
|
|
26
|
-
"""
|
|
27
|
-
# 分离URL的基础部分和哈希片段
|
|
28
|
-
de_frag, frag = urldefrag(url)
|
|
29
|
-
|
|
30
|
-
# 仅处理以"!"开头的哈希片段(Google规范)
|
|
31
|
-
if not frag.startswith("!"):
|
|
32
|
-
return url # 不符合规则则原样返回
|
|
33
|
-
|
|
34
|
-
# 调用辅助函数添加 `_escaped_fragment_` 参数
|
|
35
|
-
return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
if __name__ == '__main__':
|
|
39
|
-
f = escape_ajax('http://example.com/page#!')
|
|
1
|
+
from urllib.parse import urldefrag
|
|
2
|
+
from w3lib.url import add_or_replace_parameter
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def escape_ajax(url: str) -> str:
    """
    Convert a URL according to Google's AJAX crawling scheme (``#!`` hash fragments):
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    Rules:
        1. Only URLs containing ``#!`` are converted (marks an AJAX-crawlable page).
        2. ``#!key=value`` becomes ``?_escaped_fragment_=key%3Dvalue``.
        3. Pre-existing query parameters are preserved.

    Examples:
        >>> escape_ajax("www.example.com/ajax.html#!key=value")
        'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
        >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
        'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
        >>> escape_ajax("www.example.com/ajax.html#!")
        'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not AJAX-crawlable (no ``#!``) are returned unchanged:
        >>> escape_ajax("www.example.com/ajax.html#normal")
        'www.example.com/ajax.html#normal'
    """
    # Split the URL into its base part and the hash fragment.
    base, fragment = urldefrag(url)

    # Only fragments starting with "!" follow the Google scheme; move the
    # remainder of the fragment into the `_escaped_fragment_` parameter.
    if fragment.startswith("!"):
        return add_or_replace_parameter(base, "_escaped_fragment_", fragment[1:])

    # Not an AJAX-crawlable URL: return it untouched.
    return url
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
if __name__ == '__main__':
    # Quick manual smoke check: an empty "#!" fragment becomes an empty
    # _escaped_fragment_ query parameter.
    converted = escape_ajax('http://example.com/page#!')
    print(converted)
|