crawlo 1.1.2__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of crawlo might be problematic. Click here for more details.
- crawlo/__init__.py +34 -34
- crawlo/__version__.py +1 -1
- crawlo/cli.py +40 -40
- crawlo/commands/__init__.py +13 -13
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +285 -285
- crawlo/commands/startproject.py +196 -196
- crawlo/commands/stats.py +188 -188
- crawlo/commands/utils.py +186 -186
- crawlo/config.py +279 -279
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +171 -171
- crawlo/core/enhanced_engine.py +189 -189
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +166 -162
- crawlo/crawler.py +1027 -1027
- crawlo/downloader/__init__.py +242 -242
- crawlo/downloader/aiohttp_downloader.py +212 -212
- crawlo/downloader/cffi_downloader.py +251 -251
- crawlo/downloader/httpx_downloader.py +259 -257
- crawlo/event.py +11 -11
- crawlo/exceptions.py +82 -78
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +34 -34
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +242 -242
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +53 -53
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/proxy.py +248 -248
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +125 -125
- crawlo/mode_manager.py +200 -200
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +311 -311
- crawlo/network/response.py +271 -269
- crawlo/pipelines/__init__.py +22 -13
- crawlo/pipelines/bloom_dedup_pipeline.py +157 -0
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +225 -0
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +116 -0
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/pipelines/redis_dedup_pipeline.py +163 -0
- crawlo/project.py +153 -153
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +307 -303
- crawlo/queue/redis_priority_queue.py +208 -191
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +245 -226
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +86 -86
- crawlo/templates/project/pipelines.py.tmpl +341 -335
- crawlo/templates/project/run.py.tmpl +251 -238
- crawlo/templates/project/settings.py.tmpl +250 -247
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/spider/spider.py.tmpl +177 -177
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/controlled_spider_mixin.py +439 -335
- crawlo/utils/date_tools.py +233 -233
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +343 -343
- crawlo/utils/log.py +128 -128
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +219 -219
- crawlo/utils/spider_loader.py +62 -62
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/METADATA +635 -567
- crawlo-1.1.3.dist-info/RECORD +113 -0
- examples/__init__.py +7 -7
- examples/controlled_spider_example.py +205 -0
- tests/__init__.py +7 -7
- tests/test_final_validation.py +153 -153
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_redis_config.py +28 -28
- tests/test_redis_queue.py +224 -224
- tests/test_request_serialization.py +70 -70
- tests/test_scheduler.py +241 -241
- crawlo-1.1.2.dist-info/RECORD +0 -108
- {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/WHEEL +0 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.1.2.dist-info → crawlo-1.1.3.dist-info}/top_level.txt +0 -0
crawlo/utils/url.py
CHANGED
|
@@ -1,40 +1,40 @@
|
|
|
1
|
-
from urllib.parse import urldefrag
|
|
2
|
-
from w3lib.url import add_or_replace_parameter
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
def escape_ajax(url: str) -> str:
|
|
6
|
-
"""
|
|
7
|
-
根据Google AJAX爬取规范转换URL(处理哈希片段#!):
|
|
8
|
-
https://developers.google.com/webmasters/ajax-crawling/docs/getting-started
|
|
9
|
-
|
|
10
|
-
规则说明:
|
|
11
|
-
1. 仅当URL包含 `#!` 时才转换(表示这是AJAX可爬取页面)
|
|
12
|
-
2. 将 `#!key=value` 转换为 `?_escaped_fragment_=key%3Dvalue`
|
|
13
|
-
3. 保留原始查询参数(如果有)
|
|
14
|
-
|
|
15
|
-
示例:
|
|
16
|
-
>>> escape_ajax("www.example.com/ajax.html#!key=value")
|
|
17
|
-
'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
|
|
18
|
-
>>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
|
|
19
|
-
'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
|
|
20
|
-
>>> escape_ajax("www.example.com/ajax.html#!")
|
|
21
|
-
'www.example.com/ajax.html?_escaped_fragment_='
|
|
22
|
-
|
|
23
|
-
非AJAX可爬取的URL(无#!)原样返回:
|
|
24
|
-
>>> escape_ajax("www.example.com/ajax.html#normal")
|
|
25
|
-
'www.example.com/ajax.html#normal'
|
|
26
|
-
"""
|
|
27
|
-
# 分离URL的基础部分和哈希片段
|
|
28
|
-
de_frag, frag = urldefrag(url)
|
|
29
|
-
|
|
30
|
-
# 仅处理以"!"开头的哈希片段(Google规范)
|
|
31
|
-
if not frag.startswith("!"):
|
|
32
|
-
return url # 不符合规则则原样返回
|
|
33
|
-
|
|
34
|
-
# 调用辅助函数添加 `_escaped_fragment_` 参数
|
|
35
|
-
return add_or_replace_parameter(de_frag, "_escaped_fragment_", frag[1:])
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
if __name__ == '__main__':
|
|
39
|
-
f = escape_ajax('http://example.com/page#!')
|
|
1
|
+
from urllib.parse import urldefrag
|
|
2
|
+
from w3lib.url import add_or_replace_parameter
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def escape_ajax(url: str) -> str:
    """
    Convert a URL according to Google's AJAX crawling scheme (``#!`` hash fragments):
    https://developers.google.com/webmasters/ajax-crawling/docs/getting-started

    Rules:
        1. Only URLs containing ``#!`` are converted (marks an AJAX-crawlable page).
        2. ``#!key=value`` becomes ``?_escaped_fragment_=key%3Dvalue``.
        3. Pre-existing query parameters are preserved.

    Examples:
        >>> escape_ajax("www.example.com/ajax.html#!key=value")
        'www.example.com/ajax.html?_escaped_fragment_=key%3Dvalue'
        >>> escape_ajax("www.example.com/ajax.html?k1=v1#!key=value")
        'www.example.com/ajax.html?k1=v1&_escaped_fragment_=key%3Dvalue'
        >>> escape_ajax("www.example.com/ajax.html#!")
        'www.example.com/ajax.html?_escaped_fragment_='

    URLs that are not AJAX-crawlable (no ``#!``) are returned unchanged:
        >>> escape_ajax("www.example.com/ajax.html#normal")
        'www.example.com/ajax.html#normal'
    """
    # Split the URL into its base part and the hash fragment.
    base, fragment = urldefrag(url)

    # Only fragments starting with "!" follow the Google scheme; move the
    # remainder of the fragment into the `_escaped_fragment_` parameter.
    if fragment.startswith("!"):
        return add_or_replace_parameter(base, "_escaped_fragment_", fragment[1:])

    # Not an AJAX-crawlable URL: return it untouched.
    return url
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
if __name__ == '__main__':
    # Quick manual smoke check: an empty "#!" fragment becomes an empty
    # _escaped_fragment_ query parameter.
    converted = escape_ajax('http://example.com/page#!')
    print(converted)
|