reyfetch 1.0.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- reyfetch/__init__.py +21 -0
- reyfetch/rali.py +990 -0
- reyfetch/rall.py +19 -0
- reyfetch/rbaidu.py +467 -0
- reyfetch/rbase.py +243 -0
- reyfetch/rdouban.py +565 -0
- reyfetch/rgeneral.py +158 -0
- reyfetch/rsina.py +239 -0
- reyfetch/rtoutiao.py +71 -0
- reyfetch/rweibo.py +90 -0
- reyfetch-1.0.35.dist-info/METADATA +30 -0
- reyfetch-1.0.35.dist-info/RECORD +14 -0
- reyfetch-1.0.35.dist-info/WHEEL +4 -0
- reyfetch-1.0.35.dist-info/licenses/LICENSE +7 -0
reyfetch/rbase.py
ADDED
@@ -0,0 +1,243 @@
|
|
1
|
+
# !/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
"""
|
5
|
+
@Time : 2023-12-29 23:14:18
|
6
|
+
@Author : Rey
|
7
|
+
@Contact : reyxbo@163.com
|
8
|
+
@Explain : Base methods.
|
9
|
+
"""
|
10
|
+
|
11
|
+
|
12
|
+
from typing import Any, Literal
|
13
|
+
from types import MethodType
|
14
|
+
from threading import get_ident as threading_get_ident
|
15
|
+
from selenium.webdriver import Edge, Chrome, EdgeOptions, ChromeOptions
|
16
|
+
from reydb.rdb import Database
|
17
|
+
from reykit.rbase import Base
|
18
|
+
from reykit.rnet import join_url
|
19
|
+
|
20
|
+
|
21
|
+
# Names exported by a star import of this module.
__all__ = (
    'FetchBase',
    'FetchRequest',
    'FetchCrawl',
    'FetchBrowser',
    'crawl_page',
    'FetchRequestWithDatabase',
    'FetchRequestDatabaseRecord'
)
|
30
|
+
|
31
|
+
|
32
|
+
class FetchBase(Base):
    """
    Base type of the fetch types, derived from `Base`.
    """
|
36
|
+
|
37
|
+
|
38
|
+
class FetchRequest(FetchBase):
    """
    Fetch type for request API.
    """
|
42
|
+
|
43
|
+
|
44
|
+
class FetchCrawl(FetchBase):
    """
    Fetch type for crawl Web.
    """
|
48
|
+
|
49
|
+
|
50
|
+
class FetchBrowser(FetchBase):
|
51
|
+
"""
|
52
|
+
Control browser fetch type.
|
53
|
+
"""
|
54
|
+
|
55
|
+
|
56
|
+
def __init__(
|
57
|
+
self,
|
58
|
+
driver: Literal['edge', 'chrome'] = 'edge',
|
59
|
+
headless: bool = False
|
60
|
+
) -> None:
|
61
|
+
"""
|
62
|
+
Build instance attributes.
|
63
|
+
|
64
|
+
Parameters
|
65
|
+
----------
|
66
|
+
driver : Browser driver type.
|
67
|
+
- `Literal['edge']`: Edge browser.
|
68
|
+
- `Literal['chrome']`: Chrome browser.
|
69
|
+
headless : Whether use headless mode.
|
70
|
+
"""
|
71
|
+
|
72
|
+
# Parameter.
|
73
|
+
match driver:
|
74
|
+
case 'edge':
|
75
|
+
driver_type = Edge
|
76
|
+
driver_option_type = EdgeOptions
|
77
|
+
case 'chrome':
|
78
|
+
driver_type = Chrome
|
79
|
+
driver_option_type = ChromeOptions
|
80
|
+
|
81
|
+
# Option.
|
82
|
+
options = driver_option_type()
|
83
|
+
|
84
|
+
## Headless.
|
85
|
+
if headless:
|
86
|
+
options.add_argument('--headless')
|
87
|
+
|
88
|
+
# Driver.
|
89
|
+
self.driver = driver_type(options)
|
90
|
+
|
91
|
+
|
92
|
+
def request(
|
93
|
+
self,
|
94
|
+
url: str,
|
95
|
+
params: dict[str, Any] | None = None
|
96
|
+
) -> None:
|
97
|
+
"""
|
98
|
+
Request URL.
|
99
|
+
|
100
|
+
Parameters
|
101
|
+
----------
|
102
|
+
url : URL.
|
103
|
+
params : URL parameters.
|
104
|
+
"""
|
105
|
+
|
106
|
+
# Parameter.
|
107
|
+
params = params or {}
|
108
|
+
url = join_url(url, params)
|
109
|
+
|
110
|
+
# Request.
|
111
|
+
self.driver.get(url)
|
112
|
+
|
113
|
+
|
114
|
+
@property
|
115
|
+
def page(self) -> str:
|
116
|
+
"""
|
117
|
+
Return page elements document.
|
118
|
+
|
119
|
+
Returns
|
120
|
+
-------
|
121
|
+
Page elements document.
|
122
|
+
"""
|
123
|
+
|
124
|
+
# Parameter.
|
125
|
+
page_source = self.driver.page_source
|
126
|
+
|
127
|
+
return page_source
|
128
|
+
|
129
|
+
|
130
|
+
__call__ = request
|
131
|
+
|
132
|
+
|
133
|
+
def crawl_page(
    url: str,
    params: dict[str, Any] | None = None
) -> str:
    """
    Crawl page elements document, use a temporary headless browser.

    Parameters
    ----------
    url : URL.
    params : URL parameters.

    Returns
    -------
    Page elements document.
    """

    # Parameter.
    browser = FetchBrowser(headless=True)

    try:

        # Request.
        browser.request(url, params)

        # Page.
        page = browser.page

    # Quit, otherwise every call leaves a browser process running.
    finally:
        browser.driver.quit()

    return page
|
160
|
+
|
161
|
+
|
162
|
+
class FetchRequestWithDatabase(FetchRequest):
    """
    Request API fetch type with database method.
    The database can be created by the `self.build_db` method.
    """

    db: Database | None
    db_names: dict[str, str]
    build_db: MethodType
|
171
|
+
|
172
|
+
|
173
|
+
class FetchRequestDatabaseRecord(FetchRequest):
    """
    Request API fetch type of record into the database, can multi threaded.
    Record data is stored by thread ID, each thread builds its own record.
    """


    def __init__(
        self,
        api: FetchRequestWithDatabase | None = None,
        database: str | None = None,
        table: str | None = None
    ) -> None:
        """
        Build instance attributes.

        Parameters
        ----------
        api : `API` instance.
            - `None`: Not record.
        database : Index `API.db_names` database name.
            Stored in `self.database`, not read by the methods of this type.
        table : Index `API.db_names` table name.
        """

        # Build.
        self.api = api
        self.database = database
        self.table = table

        ## Key is thread ID, value is record data of the thread.
        self.data: dict[int, dict[str, Any]] = {}


    def __setitem__(self, key: str, value: Any) -> None:
        """
        Update record data parameter of current thread.
        Do nothing when no `API` instance or it has no database.

        Parameters
        ----------
        key : Parameter key.
        value : Parameter value.
        """

        # Check.

        ## Parameter `api` can be `None`, check it before reading its attribute.
        if (
            self.api is None
            or self.api.db is None
        ):
            return

        # Parameter.
        thread_id = threading_get_ident()
        record = self.data.setdefault(thread_id, {})

        # Update.
        record[key] = value


    def record(self) -> None:
        """
        Insert record of current thread to table of database, then delete it.
        Do nothing when no `API` instance or it has no database.
        """

        # Check.

        ## Parameter `api` can be `None`, check it before reading its attribute.
        if (
            self.api is None
            or self.api.db is None
        ):
            return

        # Parameter.
        thread_id = threading_get_ident()
        record = self.data.setdefault(thread_id, {})
        table = self.api.db_names[self.table]

        # Insert.
        self.api.db.execute.insert(table, record)

        # Delete.

        ## Only after the insert succeeded, a failed insert keeps the data.
        del self.data[thread_id]
|