crawlo 1.0.3__py3-none-any.whl → 1.0.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +25 -9
- crawlo/__version__.py +1 -1
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +158 -158
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +57 -57
- crawlo/crawler.py +424 -242
- crawlo/downloader/__init__.py +78 -78
- crawlo/downloader/aiohttp_downloader.py +200 -259
- crawlo/downloader/cffi_downloader.py +277 -0
- crawlo/downloader/httpx_downloader.py +246 -187
- crawlo/event.py +11 -11
- crawlo/exceptions.py +73 -64
- crawlo/extension/__init__.py +31 -31
- crawlo/extension/log_interval.py +49 -49
- crawlo/extension/log_stats.py +44 -44
- crawlo/extension/logging_extension.py +35 -0
- crawlo/filters/__init__.py +37 -37
- crawlo/filters/aioredis_filter.py +150 -158
- crawlo/filters/memory_filter.py +202 -202
- crawlo/items/__init__.py +62 -62
- crawlo/items/items.py +115 -119
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +32 -32
- crawlo/middleware/download_delay.py +28 -28
- crawlo/middleware/middleware_manager.py +135 -140
- crawlo/middleware/proxy.py +246 -0
- crawlo/middleware/request_ignore.py +30 -30
- crawlo/middleware/response_code.py +18 -18
- crawlo/middleware/response_filter.py +26 -26
- crawlo/middleware/retry.py +90 -90
- crawlo/network/__init__.py +7 -7
- crawlo/network/request.py +203 -204
- crawlo/network/response.py +166 -166
- crawlo/pipelines/__init__.py +13 -13
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/mongo_pipeline.py +116 -116
- crawlo/pipelines/mysql_batch_pipline.py +273 -134
- crawlo/pipelines/mysql_pipeline.py +195 -195
- crawlo/pipelines/pipeline_manager.py +56 -56
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +169 -93
- crawlo/settings/setting_manager.py +99 -99
- crawlo/spider/__init__.py +41 -36
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +106 -106
- crawlo/task_manager.py +27 -27
- crawlo/templates/item_template.tmpl +21 -21
- crawlo/templates/project_template/main.py +32 -32
- crawlo/templates/project_template/setting.py +189 -189
- crawlo/templates/spider_template.tmpl +30 -30
- crawlo/utils/__init__.py +7 -7
- crawlo/utils/concurrency_manager.py +124 -124
- crawlo/utils/date_tools.py +233 -177
- crawlo/utils/db_helper.py +344 -0
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/log.py +129 -39
- crawlo/utils/pqueue.py +173 -173
- crawlo/utils/project.py +59 -59
- crawlo/utils/request.py +267 -122
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +5 -303
- crawlo/utils/url.py +39 -39
- {crawlo-1.0.3.dist-info → crawlo-1.0.5.dist-info}/METADATA +49 -48
- crawlo-1.0.5.dist-info/RECORD +84 -0
- {crawlo-1.0.3.dist-info → crawlo-1.0.5.dist-info}/top_level.txt +1 -0
- examples/__init__.py +0 -0
- examples/gxb/__init__.py +0 -0
- examples/gxb/items.py +36 -0
- examples/gxb/run.py +15 -0
- examples/gxb/settings.py +71 -0
- examples/gxb/spider/__init__.py +0 -0
- examples/gxb/spider/miit_spider.py +180 -0
- examples/gxb/spider/telecom_device_licenses.py +129 -0
- tests/__init__.py +7 -7
- tests/test_proxy_health_check.py +33 -0
- tests/test_proxy_middleware_integration.py +137 -0
- tests/test_proxy_providers.py +57 -0
- tests/test_proxy_stats.py +20 -0
- tests/test_proxy_strategies.py +60 -0
- crawlo/downloader/playwright_downloader.py +0 -161
- crawlo/filters/redis_filter.py +0 -120
- crawlo-1.0.3.dist-info/RECORD +0 -80
- tests/baidu_spider/__init__.py +0 -7
- tests/baidu_spider/demo.py +0 -94
- tests/baidu_spider/items.py +0 -25
- tests/baidu_spider/middleware.py +0 -49
- tests/baidu_spider/pipeline.py +0 -55
- tests/baidu_spider/request_fingerprints.txt +0 -9
- tests/baidu_spider/run.py +0 -27
- tests/baidu_spider/settings.py +0 -78
- tests/baidu_spider/spiders/__init__.py +0 -7
- tests/baidu_spider/spiders/bai_du.py +0 -61
- tests/baidu_spider/spiders/sina.py +0 -79
- {crawlo-1.0.3.dist-info → crawlo-1.0.5.dist-info}/WHEEL +0 -0
- {crawlo-1.0.3.dist-info → crawlo-1.0.5.dist-info}/entry_points.txt +0 -0
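Version 1.0.5 adds a curl_cffi-based downloader (crawlo/downloader/cffi_downloader.py), a proxy middleware (crawlo/middleware/proxy.py) with five new test modules, a db_helper utility, and a logging extension, while removing the Playwright downloader and the synchronous Redis filter. A minimal settings sketch for opting into the new components follows; it reuses the DOWNLOADER and MIDDLEWARES conventions visible in the deleted tests/baidu_spider/settings.py further down this diff, and the class names CurlCffiDownloader and ProxyMiddleware are assumptions, not confirmed by the diff.

# Hypothetical project settings sketch for crawlo 1.0.5.
# Only the module paths below appear in the diff; the class names are guesses.

# Previously available downloader (path confirmed in the old test settings):
# DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"

# New curl_cffi downloader added in 1.0.5 (class name assumed):
DOWNLOADER = "crawlo.downloader.cffi_downloader.CurlCffiDownloader"

MIDDLEWARES = [
    'crawlo.middleware.download_delay.DownloadDelayMiddleware',
    'crawlo.middleware.default_header.DefaultHeaderMiddleware',
    'crawlo.middleware.retry.RetryMiddleware',
    # New proxy middleware added in 1.0.5 (class name assumed):
    'crawlo.middleware.proxy.ProxyMiddleware',
]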
tests/baidu_spider/pipeline.py
DELETED
@@ -1,55 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-import pymongo
-
-from motor.motor_asyncio import AsyncIOMotorClient
-
-from random import randint
-
-from crawlo.event import spider_closed
-from crawlo.exceptions import ItemDiscard
-from crawlo.utils.log import get_logger
-
-
-class TestPipeline(object):
-
-    async def process_item(self, item, spider):
-        if randint(1, 3) == 1:
-            raise ItemDiscard('重复数据')
-        return item
-
-    @classmethod
-    def create_instance(cls, *args, **kwargs):
-        return cls()
-
-
-class MongoPipeline(object):
-
-    def __init__(self, conn, col):
-        self.conn = conn
-        self.col = col
-
-        self.logger = get_logger(self.__class__.__name__)
-
-    @classmethod
-    def create_instance(cls, crawler):
-        settings = crawler.settings
-        mongo_params = settings.get('MONGODB_PARAMS', None)
-        db_name = settings.get('MONGODB_DB', None)
-        project_name = settings.get('PROJECT_NAME', None)
-
-        conn = AsyncIOMotorClient(**mongo_params) if mongo_params else AsyncIOMotorClient()
-
-        col = conn[db_name][project_name]
-        o = cls(conn, col)
-        crawler.subscriber.subscribe(o.spider_closed, event=spider_closed)
-        return o
-
-    async def process_item(self, item, spider):
-        await self.col.insert_one(item.to_dict())
-        return item
-
-    async def spider_closed(self):
-        self.logger.info('MongoDB closed.')
-        self.conn.close()
-
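The deleted TestPipeline and MongoPipeline above were enabled through the PIPELINES list of the test project's settings (also removed in this release and shown further down); the framework's own crawlo/pipelines/mongo_pipeline.py is still present in 1.0.5. For reference, a registration sketch using exactly the entries from that settings file:

PIPELINES = [
    'crawlo.pipelines.console_pipeline.ConsolePipeline',
    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
    # 'baidu_spider.pipeline.TestPipeline',
    # 'baidu_spider.pipeline.MongoPipeline',
]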
tests/baidu_spider/request_fingerprints.txt
DELETED
@@ -1,9 +0,0 @@
-48e727ce8566d65e5233eaac29498b03e2908dd78a90dad7fdd7510e8ada9e32
-c5e78d04ca9b1113e6a8076792aaa7b09ff4b040bd790c684689745aa7edb1ae
-8287f5ad5e2f06687e88cc31d64fdbd3b1b56cee71fbc3344ad8cea852ea9dd3
-f84661b1d15a6e96c6a77a6484c173be3fb502f73e256e8f72f98982674a7992
-f5c1693afa1293e758331a8e95aa6277ffa49105ccd0d79115d8e85375863adc
-f6f2175b1ae909ac0dd41aa2ed735b8305dde6f92d51dd2e411a0c695cfc4843
-3ba6793c55838d267567f6b65b3406bbad30e89e187d3fbe88e6ae55db24dd9d
-488f1d28fe532f1113f634dfa58a2bccae1d34af5421e4064e2ae024d5010280
-dbf802098ea25af78c5751fdc750624296e79d9e1d968e33e5956860ebb5ecc7
tests/baidu_spider/run.py
DELETED
@@ -1,27 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-02-05 13:12
-# @Author : oscar
-# @Desc : None
-"""
-import asyncio
-from crawlo.crawler import CrawlerProcess
-
-# from crawlo.utils import system as _
-from tests.baidu_spider.spiders.bai_du import BaiDuSpider
-from crawlo.utils.project import get_settings
-from tests.baidu_spider.spiders.sina import SinaSpider
-
-
-async def main():
-    settings = get_settings()
-    process = CrawlerProcess(settings)
-    # await process.crawl(BaiDuSpider)
-    await process.crawl(SinaSpider)
-
-    await process.start()
-
-if __name__ == '__main__':
-    asyncio.run(main())
-    # Watched up to episode 18
tests/baidu_spider/settings.py
DELETED
@@ -1,78 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-
-PROJECT_NAME = 'baidu_spider'
-
-CONCURRENCY = 4
-
-USE_SESSION = True
-
-# Download delay
-DOWNLOAD_DELAY = 0.5
-RANDOMNESS = False
-
-# --------------------------------------------------- Shared MySQL configuration -----------------------------------------------------
-MYSQL_HOST = '43.139.14.225'
-MYSQL_PORT = 3306
-MYSQL_USER = 'picker'
-MYSQL_PASSWORD = 'kmcNbbz6TbSihttZ'
-MYSQL_DB = 'stock_pro'
-MYSQL_TABLE = 'articles'  # Optional; defaults to the spider name
-MYSQL_BATCH_SIZE = 500
-
-# asyncmy-specific settings
-MYSQL_POOL_MIN = 5  # Minimum connections in the pool
-MYSQL_POOL_MAX = 20  # Maximum connections in the pool
-
-# Choose the downloader
-# DOWNLOADER = "crawlo.downloader.httpx_downloader.HttpXDownloader"
-
-MIDDLEWARES = [
-    'crawlo.middleware.download_delay.DownloadDelayMiddleware',
-    'crawlo.middleware.default_header.DefaultHeaderMiddleware',
-    'crawlo.middleware.response_filter.ResponseFilterMiddleware',
-    'crawlo.middleware.retry.RetryMiddleware',
-    'crawlo.middleware.response_code.ResponseCodeMiddleware',
-    'crawlo.middleware.request_ignore.RequestIgnoreMiddleware',
-    # 'baidu_spider.middleware.TestMiddleWare',
-    # 'baidu_spider.middleware.TestMiddleWare2'
-]
-
-EXTENSIONS = [
-    'crawlo.extension.log_interval.LogIntervalExtension',
-    'crawlo.extension.log_stats.LogStats',
-]
-
-PIPELINES = [
-    'crawlo.pipelines.console_pipeline.ConsolePipeline',
-    'crawlo.pipelines.mysql_pipeline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
-    # 'crawlo.pipelines.mysql_batch_pipline.AsyncmyMySQLPipeline',  # or AiomysqlMySQLPipeline
-    # 'baidu_spider.pipeline.TestPipeline',
-    # 'baidu_spider.pipeline.MongoPipeline',
-]
-
-USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36'
-DEFAULT_HEADERS = {
-    "accept": "application/json, text/javascript, */*; q=0.01",
-    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
-    "cache-control": "no-cache",
-    "pragma": "no-cache",
-    "priority": "u=1, i",
-    "sec-ch-ua": "\"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\", \"Not.A/Brand\";v=\"99\"",
-    "sec-ch-ua-mobile": "?0",
-    "sec-ch-ua-platform": "\"macOS\"",
-    "sec-fetch-dest": "empty",
-    "sec-fetch-mode": "cors",
-    "sec-fetch-site": "same-origin",
-    # "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
-    "x-requested-with": "XMLHttpRequest"
-}
-
-# --------------------------------------DB ---------------------------------------------
-Mongo_Params = ''
-MONGODB_DB = 'news'
-
-CLEANUP_FP = True
-
-FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'
-# FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFileFilter'
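The deleted settings above point FILTER_CLASS at the async Redis filter. Note that 1.0.5 removes crawlo/filters/redis_filter.py entirely (see the file list), leaving the aioredis and in-memory filters as the deduplication backends. A short sketch of the surviving options, using the class names that appear in the old settings file:

# Redis-backed request deduplication (async):
FILTER_CLASS = 'crawlo.filters.aioredis_filter.AioRedisFilter'

# In-process alternative:
# FILTER_CLASS = 'crawlo.filters.memory_filter.MemoryFileFilter'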
tests/baidu_spider/spiders/bai_du.py
DELETED
@@ -1,61 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-02-05 13:05
-# @Author : oscar
-# @Desc : None
-"""
-import asyncio
-from crawlo import Request
-from crawlo.spider import Spider
-
-from items import BauDuItem
-
-
-class BaiDuSpider(Spider):
-    start_urls = ["https://www.baidu.com/", "https://www.baidu.com/"]
-
-    custom_settings = {
-        'CONCURRENCY': 1
-    }
-
-    name = "bai_du"
-
-    # headers = {
-    #     "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
-    # }
-    #
-    user_gent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
-
-    async def parse(self, response):
-        for i in range(5):
-            url = f"https://www.baidu.com"
-            # url = f"https://www.httpbin.org/404"
-            r = Request(url=url, callback=self.parse_page, dont_filter=True)
-            yield r
-
-    async def parse_page(self, response):
-        for i in range(5):
-            url = f"https://www.baidu.com"
-            meta = {'test': 'hhhh'}
-            r = Request(url=url, callback=self.parse_detail, meta=meta, dont_filter=False)
-            yield r
-
-    def parse_detail(self, response):
-        item = BauDuItem()
-        item['title'] = response.xpath('//title/text()').get()
-
-        item['url'] = response.url
-
-        yield item
-
-    async def spider_opened(self):
-        pass
-
-    async def spider_closed(self):
-        pass
-
-
-if __name__ == '__main__':
-    b = BaiDuSpider()
-    b.start_requests()
tests/baidu_spider/spiders/sina.py
DELETED
@@ -1,79 +0,0 @@
-#!/usr/bin/python
-# -*- coding:UTF-8 -*-
-"""
-# @Time : 2025-02-05 13:05
-# @Author : oscar
-# @Desc : None
-"""
-import time
-
-from crawlo import Request
-from crawlo.spider import Spider
-from crawlo.utils.date_tools import timestamp_to_datetime, format_datetime
-
-from tests.baidu_spider.items import ArticleItem
-
-
-class SinaSpider(Spider):
-    # Take the current timestamp and subtract 10 minutes (600 seconds)
-    current_time_minus_10min = int(time.time()) - 6000
-    # Build the URL
-    url = f'https://news.10jqka.com.cn/tapp/news/push/stock/?page=1&tag=&track=website&ctime={current_time_minus_10min}'
-
-    start_urls = [url]
-    name = 'sina'
-    # mysql_table = 'news_10jqka'
-
-    allowed_domains = ['*']
-
-    def start_requests(self):
-        for url in self.start_urls:
-            yield Request(url=url, callback=self.parse, dont_filter=True)
-
-    async def parse(self, response):
-        jsonp_str = response.json()
-        rows = jsonp_str.get('data', {}).get('list', [])
-        for row in rows:
-            article_id = row.get('id')
-            title = row.get('title')
-            digest = row.get('digest')
-            short = row.get('short')
-            detail_url = row.get('url')
-            tag = row.get('tag')
-            ctime = row.get('ctime')
-            source = row.get('source')
-            meta = {
-                'article_id': article_id,
-                'title': title,
-                'digest': digest,
-                'short': short,
-                'detail_url': detail_url,
-                'source': source,
-                'tag': tag,
-                'ctime': timestamp_to_datetime(int(ctime))
-            }
-
-            yield Request(url=detail_url, callback=self.parse_detail, encoding='gbk', meta=meta)
-
-    @staticmethod
-    async def parse_detail(response):
-        item = ArticleItem()
-        meta = response.meta
-        content = ''.join(response.xpath('//*[@id="contentApp"]/p/text()').extract()).strip()
-        ctime = meta.get('ctime')
-        item['article_id'] = meta.get('article_id')
-        item['title'] = meta.get('title')
-        item['digest'] = content
-        item['short'] = meta.get('short')
-        item['url'] = meta.get('detail_url')
-        item['tag'] = meta.get('tag').strip()
-        item['ctime'] = format_datetime(ctime)
-        item['source'] = meta.get('source')
-
-        yield item
-
-    async def spider_opened(self):
-        pass
-
-    async def spider_closed(self):
-        pass
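SinaSpider above relies on crawlo.utils.date_tools, which this release reworks substantially (+233 -177). A hedged usage sketch based solely on how the deleted spider calls these helpers; the exact signatures and return types in 1.0.5 are not confirmed by the diff:

from crawlo.utils.date_tools import timestamp_to_datetime, format_datetime

ctime = 1738732800                      # example Unix timestamp, as the news API returns it
dt = timestamp_to_datetime(int(ctime))  # called this way in SinaSpider.parse()
print(format_datetime(dt))              # called this way in SinaSpider.parse_detail()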