fb_scraper_request 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fb_graphql_scraper/__init__.py +14 -0
- fb_graphql_scraper/base/__init__.py +0 -0
- fb_graphql_scraper/base/base_page.py +90 -0
- fb_graphql_scraper/example.py +26 -0
- fb_graphql_scraper/facebook_graphql_scraper.py +304 -0
- fb_graphql_scraper/pages/__init__.py +0 -0
- fb_graphql_scraper/pages/page_optional.py +127 -0
- fb_graphql_scraper/tests/__init__.py +0 -0
- fb_graphql_scraper/utils/__init__.py +0 -0
- fb_graphql_scraper/utils/locator.py +54 -0
- fb_graphql_scraper/utils/parser.py +118 -0
- fb_graphql_scraper/utils/utils.py +318 -0
- fb_scraper_request-0.2.0.dist-info/METADATA +292 -0
- fb_scraper_request-0.2.0.dist-info/RECORD +17 -0
- fb_scraper_request-0.2.0.dist-info/WHEEL +5 -0
- fb_scraper_request-0.2.0.dist-info/licenses/LICENSE +19 -0
- fb_scraper_request-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Facebook GraphQL Scraper - Simple API to scrape public Facebook posts without login.
|
|
2
|
+
|
|
3
|
+
Example:
|
|
4
|
+
from facebook_graphql_scraper import FacebookGraphqlScraper
|
|
5
|
+
|
|
6
|
+
fb = FacebookGraphqlScraper()
|
|
7
|
+
result = fb.get_user_posts("vietgiaitri", days_limit=3)
|
|
8
|
+
print(result["data"])
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from .facebook_graphql_scraper import FacebookGraphqlScraper
|
|
12
|
+
|
|
13
|
+
__version__ = "0.2.0"
|
|
14
|
+
__all__ = ["FacebookGraphqlScraper"]
|
|
File without changes
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
|
3
|
+
from selenium.webdriver.chrome.service import Service
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BasePage:
    """Owns the Selenium(-wire) Chrome driver used by the Selenium-based flow.

    Args:
        driver_path: Path to a chromedriver executable, or to a Chrome
            browser binary (detected by ``_looks_like_chrome_binary``);
            may be empty/None, in which case Selenium resolves the driver.
        open_browser: When False the browser runs headless.
    """

    def __init__(self, driver_path: str, open_browser: bool = False):
        chrome_options = self._build_options(open_browser)
        normalized_driver_path = self._normalize_path(driver_path)

        # A path that looks like the Chrome *binary* configures the browser
        # location; otherwise a non-empty path is treated as the chromedriver.
        if self._looks_like_chrome_binary(normalized_driver_path):
            chrome_options.binary_location = normalized_driver_path
            service = Service()
        elif normalized_driver_path:
            service = Service(normalized_driver_path)
        else:
            # No path given: let Selenium locate a driver itself.
            service = Service()

        self.driver = self._build_driver(service=service, chrome_options=chrome_options)
        self.driver.maximize_window()

    @staticmethod
    def _build_options(open_browser: bool) -> ChromeOptions:
        """Build Chrome options; headless unless ``open_browser`` is True."""
        options = ChromeOptions()
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-notifications")
        # Reduce the obvious automation fingerprint.
        options.add_argument("--disable-blink-features=AutomationControlled")
        if not open_browser:
            options.add_argument("--headless=new")
        # Skip image downloads to speed up page loads.
        options.add_argument("--blink-settings=imagesEnabled=false")
        return options

    @staticmethod
    def _normalize_path(path: str | None) -> str | None:
        """Return ``path`` stripped, with shell-escaped spaces unescaped; None if empty."""
        if not path:
            return None
        # In notebook strings users often escape spaces (e.g. "Google\ Chrome").
        return path.replace("\\ ", " ").strip()

    @staticmethod
    def _looks_like_chrome_binary(path: str | None) -> bool:
        """True when ``path`` ends like a Chrome browser binary rather than a driver."""
        if not path:
            return False
        normalized = path.lower()
        return normalized.endswith("/google chrome") or normalized.endswith("/chrome")

    @staticmethod
    def _build_driver(service: Service, chrome_options: ChromeOptions):
        """Create a selenium-wire Chrome driver, patching a known TLS table issue.

        selenium-wire is imported lazily so it is only required when a
        driver is actually built.
        """
        from seleniumwire import webdriver

        try:
            return webdriver.Chrome(service=service, options=chrome_options)
        except AttributeError as exc:
            if "VERSION_CHOICES" not in str(exc):
                raise
            # Some selenium-wire/pyOpenSSL combinations expose this at runtime.
            BasePage._patch_seleniumwire_tls_version_choices()
            return webdriver.Chrome(service=service, options=chrome_options)

    @staticmethod
    def _patch_seleniumwire_tls_version_choices() -> None:
        """Rebuild mitmproxy's ``tls.VERSION_CHOICES`` table when it is missing.

        Newer pyOpenSSL releases dropped constants that selenium-wire's
        vendored mitmproxy expects, so the mapping is reconstructed from
        whatever the installed OpenSSL binding still provides.
        """
        from OpenSSL import SSL
        from seleniumwire.thirdparty.mitmproxy.net import tls

        if hasattr(tls, "VERSION_CHOICES"):
            return

        basic_options = SSL.OP_CIPHER_SERVER_PREFERENCE
        if hasattr(SSL, "OP_NO_COMPRESSION"):
            basic_options |= SSL.OP_NO_COMPRESSION

        # Fall back across pyOpenSSL versions for the negotiate-everything method.
        default_method = getattr(SSL, "SSLv23_METHOD", getattr(SSL, "TLS_METHOD", None))
        default_options = basic_options
        if hasattr(SSL, "OP_NO_SSLv2"):
            default_options |= SSL.OP_NO_SSLv2
        if hasattr(SSL, "OP_NO_SSLv3"):
            default_options |= SSL.OP_NO_SSLv3

        version_choices = {
            "all": (default_method, basic_options),
            "secure": (default_method, default_options),
        }

        # Only advertise TLS versions the local OpenSSL build still exposes.
        for name in ("TLSv1", "TLSv1_1", "TLSv1_2"):
            method_name = f"{name}_METHOD"
            method = getattr(SSL, method_name, None)
            if method is not None:
                version_choices[name] = (method, basic_options)

        tls.VERSION_CHOICES = version_choices
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from fb_graphql_scraper.facebook_graphql_scraper import FacebookGraphqlScraper as fb_graphql_scraper
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
## Example.1 - without logging in
|
|
6
|
+
if __name__ == "__main__":
|
|
7
|
+
facebook_user_name = "love.yuweishao"
|
|
8
|
+
facebook_user_id = "100044253168423"
|
|
9
|
+
days_limit = 100 # Number of days within which to scrape posts
|
|
10
|
+
driver_path = "/Users/hongshangren/Downloads/chromedriver-mac-arm64_136/chromedriver"
|
|
11
|
+
fb_spider = fb_graphql_scraper(driver_path=driver_path, open_browser=False)
|
|
12
|
+
res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_id, days_limit=days_limit,display_progress=True)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
## Example.2 - login in your facebook account to collect data
|
|
16
|
+
# if __name__ == "__main__":
|
|
17
|
+
# facebook_user_name = "love.yuweishao"
|
|
18
|
+
# facebook_user_id = "100044253168423"
|
|
19
|
+
# fb_account = "facebook_account"
|
|
20
|
+
# fb_pwd = "facebook_paswword"
|
|
21
|
+
# days_limit = 30 # Number of days within which to scrape posts
|
|
22
|
+
# driver_path = "/Users/hongshangren/Downloads/chromedriver-mac-arm64_136/chromedriver"
|
|
23
|
+
# fb_spider = fb_graphql_scraper(fb_account=fb_account,fb_pwd=fb_pwd, driver_path=driver_path, open_browser=False)
|
|
24
|
+
# res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_name, days_limit=days_limit,display_progress=True)
|
|
25
|
+
# print(res)
|
|
26
|
+
|
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import re
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import requests
|
|
5
|
+
from fb_graphql_scraper.utils.parser import RequestsParser
|
|
6
|
+
from fb_graphql_scraper.utils.locator import *
|
|
7
|
+
from fb_graphql_scraper.utils.utils import *
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class FacebookSettings:
    """Facebook GraphQL Scraper - No login required.

    Holds the shared scraper state: result containers, pagination
    stop-condition trackers, the GraphQL ``doc_id``, and the response parser.

    Example:
        from fb_graphql_scraper.facebook_graphql_scraper import FacebookGraphqlScraper as fb_graphql_scraper

        if __name__ == "__main__":
            facebook_user_name = "love.yuweishao"
            days_limit = 30
            fb_spider = fb_graphql_scraper()
            res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_name, days_limit=days_limit, display_progress=True)
            print(res)
    """

    def __init__(self):
        self._set_container()
        self._set_stop_point()
        # GraphQL document id for the profile-timeline query.
        self.doc_id = "26420831597536910"
        self.requests_parser = RequestsParser()

    def _set_container(self):
        """(Re)initialize the per-run result containers."""
        self.post_id_list = []
        self.reaction_count_list = []
        self.profile_feed = []
        self.res = {
            "post_caption": [],
            "post_date": [],
            "post_likes": [],
            "comment_share_type": [],
            "comment_share_value": [],
        }

    def _set_stop_point(self):
        """(Re)initialize the pagination stop-condition trackers."""
        self.pre_diff_days = float("-inf")
        self.counts_of_same_diff_days = 0
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class FacebookGraphqlScraper(FacebookSettings):
    """Requests-based scraper for a public Facebook profile's timeline."""

    def __init__(self):
        super().__init__()

    def get_user_id_from_username(self, username: str) -> tuple:
        """Resolve Facebook username to numeric user ID and extract profile info.

        Returns:
            tuple: ``(user_id, profile_feed)`` — ``user_id`` is the numeric id
            as a string (or the input unchanged when resolution fails);
            ``profile_feed`` may contain the profile name and a
            "<n> followers" entry scraped from the public profile page.
        """
        # Check if already numeric ID
        if username.isdigit():
            return username, []

        # Try to get user ID from profile page
        url = f"https://www.facebook.com/{username}?locale=en_us"
        headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
        }

        profile_feed = []

        try:
            response = requests.get(
                url, headers=headers, allow_redirects=True, timeout=10
            )
            # Look for user ID in page content; first matching pattern wins.
            patterns = [
                r'"userID":"(\d+)"',
                r'"actorID":"(\d+)"',
                r'"id":"(\d+)"',
                r'"profile_owner":"(\d+)"',
                r"entity_id=(\d+)",
                r'"owner":{"__typename":"User","id":"(\d+)"}',
            ]
            user_id = username
            for pattern in patterns:
                match = re.search(pattern, response.text)
                if match:
                    user_id = match.group(1)
                    print(f"Resolved '{username}' -> '{user_id}'")
                    break

            # Extract profile name
            name_patterns = [
                r'"name":"([^"]+)","__typename":"User"',
                r'"pageName":"([^"]+)"',
                r"<title>([^<]+)</title>",
            ]
            for pattern in name_patterns:
                match = re.search(pattern, response.text)
                if match:
                    name = match.group(1).replace(" | Facebook", "").strip()
                    if name:
                        profile_feed.append(name)
                    break

            # Extract followers count if available
            follower_patterns = [
                r"(\d+(?:[,.]\d+)?)\s*followers",
                r'"follower_count":(\d+)',
                r'"followers":\{"count":(\d+)\}',
            ]
            for pattern in follower_patterns:
                match = re.search(pattern, response.text, re.IGNORECASE)
                if match:
                    followers = match.group(1)
                    profile_feed.append(f"{followers} followers")
                    break

            return user_id, profile_feed

        except Exception as e:
            # Network/parse failure: fall back to the raw input name.
            print(f"Error resolving user ID: {e}")
            return username, profile_feed

    def format_data(self, res_in, fb_username_or_userid, new_reactions):
        """Flatten parsed posts into plain dicts and de-duplicate by post_id."""
        final_res = pd.json_normalize(res_in)
        final_res["context"] = self.requests_parser.context_list
        final_res["username_or_userid"] = fb_username_or_userid
        # NOTE: the "owing_profile" key spelling is kept as-is for
        # backward compatibility with existing consumers of the output.
        final_res["owing_profile"] = self.requests_parser.owning_profile
        final_res["sub_reactions"] = new_reactions
        final_res["post_url"] = "https://www.facebook.com/" + final_res["post_id"]
        final_res["time"] = self.requests_parser.creation_list
        # Creation time arrives as a unix timestamp in seconds.
        final_res["published_date"] = pd.to_datetime(final_res["time"], unit="s")
        final_res["published_date2"] = final_res["published_date"].dt.strftime(
            "%Y-%m-%d"
        )
        final_res = final_res[
            [
                "post_id",
                "post_url",
                "username_or_userid",
                "owing_profile",
                "published_date",
                "published_date2",
                "time",
                "reaction_count.count",
                "comment_rendering_instance.comments.total_count",
                "share_count.count",
                "sub_reactions",
                "context",
                "video_view_count",
            ]
        ].to_dict(orient="records")
        # Keep only the first occurrence of each post_id.
        filtered_post_id = []
        filtered_data = []
        for each_data in list(final_res):
            if each_data["post_id"] not in filtered_post_id:
                filtered_data.append(each_data)
                filtered_post_id.append(each_data["post_id"])
        return filtered_data

    def process_reactions(self, res_in):
        """Run each post's ``top_reactions`` edges through the parser."""
        reactions_out = []
        for each_res in res_in:
            each_reactions = each_res["top_reactions"]["edges"]
            processed_reactions = self.requests_parser.process_reactions(
                reactions_in=each_reactions
            )
            reactions_out.append(processed_reactions)
        return reactions_out

    def get_user_posts(
        self,
        fb_username_or_userid: str,
        days_limit: int = 61,
        display_progress: bool = True,
    ) -> dict:
        """Scrape posts from the past ``days_limit`` days for a user/page.

        Returns the dict produced by :meth:`requests_flow` (formatted posts,
        profile info, and the raw parser output).
        """
        self.requests_parser._clean_res()
        self._set_container()
        self._set_stop_point()

        # Auto-resolve username to user ID and extract profile info
        user_id, profile_feed = self.get_user_id_from_username(fb_username_or_userid)

        print(f"Collecting posts for {user_id} (doc_id: {self.doc_id})")
        if profile_feed:
            print(f"Profile info: {profile_feed}")

        return self.requests_flow(
            doc_id=self.doc_id,
            fb_username_or_userid=user_id,
            days_limit=days_limit,
            profile_feed=profile_feed,
            display_progress=display_progress,
        )

    def requests_flow(
        self,
        doc_id: str,
        fb_username_or_userid: str,
        days_limit: int,
        profile_feed: list,
        display_progress=True,
    ):
        """
        Fetch more posts from a user's Facebook profile using the requests module.

        Flow:
        1. Get the document ID of the target Facebook profile.
        2. Use the requests module to fetch data from the profile.
        3. Continuously fetch data by checking for new posts until the specified days limit is reached.

        Args:
            doc_id (str): The document ID of the target Facebook account.
            fb_username_or_userid (str): The Facebook username or user ID of the target account.
            days_limit (int): The number of days for which to fetch posts (limits the time range of retrieved posts).
            profile_feed (list): A list containing the posts retrieved from the target profile.

        Helper Functions:
        1. get_before_time:
            Retrieves Facebook posts from a specified time period before the current date.

        2. get_payload:
            Prepares the payload for the next round of requests to the server.

        3. get_next_page_status:
            Checks whether the target Facebook user has more posts available for retrieval.

        4. compare_timestamp:
            Verifies whether a retrieved post falls within the specified time period for collection.
        """
        self.requests_parser._clean_res()  # Clear all arrays used to store the results
        self._set_container()  # Reset the arrays that hold post data
        self._set_stop_point()  # Set/reset stop conditions: no more posts available, or target day range fully collected
        url = "https://www.facebook.com/api/graphql/"
        before_time = get_before_time()
        loop_limit = 5000
        is_first_time = True
        # Extract data
        for i in range(loop_limit):
            if is_first_time:
                payload_in = get_payload(
                    doc_id_in=doc_id,
                    id_in=fb_username_or_userid,
                    before_time=before_time,  # input before_time
                )
                print("playload_in:", payload_in)
                response = requests.post(
                    url=url,
                    data=payload_in,
                )
                data = response.content
                decoded_data = data.decode("utf-8")
                body_content = decoded_data.split("\n")
                print(body_content[:5])
                self.requests_parser.parse_body(body_content=body_content)
                is_first_time = False

            # If this is not the first request, use 'get_next_cursor' to pull
            # the end cursor out of the previous response and scrape the next round.
            elif not is_first_time:
                next_cursor = get_next_cursor(body_content_in=body_content)
                payload_in = get_next_payload(
                    doc_id_in=doc_id,
                    id_in=fb_username_or_userid,
                    before_time=before_time,  # input before_time
                    cursor_in=next_cursor,
                )

                response = requests.post(
                    url=url,
                    data=payload_in,
                )
                body = response.content
                decoded_body = body.decode("utf-8")
                body_content = decoded_body.split("\n")
                self.requests_parser.parse_body(body_content=body_content)

            # Check progress
            next_page_status = get_next_page_status(body_content=body_content)

            # Advance the paging cursor to the oldest post seen so far.
            before_time = str(self.requests_parser.creation_list[-1])
            if not next_page_status:
                print("There are no more posts.")
                break

            if compare_timestamp(
                timestamp=int(before_time),
                days_limit=days_limit,
                display_progress=display_progress,
            ):
                print(
                    f"The scraper has successfully retrieved posts from the past {str(days_limit)} days."
                )
                break

        res_out = self.requests_parser.collect_posts()
        new_reactions = self.process_reactions(res_in=res_out)
        # create result
        final_res = self.format_data(
            res_in=res_out,
            fb_username_or_userid=fb_username_or_userid,
            new_reactions=new_reactions,
        )
        return {
            "fb_username_or_userid": fb_username_or_userid,
            "profile": profile_feed,
            "data": final_res,
            "raw_data": self.requests_parser.res_new,
        }
|
|
File without changes
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from fb_graphql_scraper.utils.locator import *
|
|
3
|
+
from selenium.webdriver.common.by import By
|
|
4
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
5
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
6
|
+
from selenium.webdriver.common.action_chains import ActionChains
|
|
7
|
+
from selenium.webdriver.common.keys import Keys
|
|
8
|
+
import time
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PageOptional(object):
    """Selenium page-interaction helpers: login, scrolling, "see more" clicks.

    Logs in automatically during construction when both ``fb_account`` and
    ``fb_pwd`` are supplied.
    """

    def __init__(self, driver=None, fb_account: str = None, fb_pwd: str = None):
        self.locator = PageLocators
        self.xpath_elements = PageXpath
        self.class_elements = PageClass
        self.page_text = PageText
        self.driver = driver
        self.fb_account = fb_account
        self.fb_pwd = fb_pwd

        # Log in to the account when credentials were supplied.
        if self.fb_account and self.fb_pwd:
            login_page_url = "https://www.facebook.com/login"
            self.driver.get(url=login_page_url)
            self.login_page()

    def login_page(self):
        """Attempt login with the stored credentials; failures are only logged."""
        try:
            self.login_account(user=self.fb_account,
                               password=self.fb_pwd,
                               )
            time.sleep(5)  # give Facebook time to complete the login redirect
        except Exception as e:
            print(f"Login faield, message: {e}")

    def clean_requests(self):
        """Drop selenium-wire's captured requests (``del driver.requests``)."""
        print(f"Before cleaning driver requests, the number of requests are: {len(self.driver.requests)}")
        try:
            print("Try to clear driver requests..")
            del self.driver.requests
            print(f"Clear, the number of requests are: {len(self.driver.requests)}")
        except Exception as e:
            print(f"Clear unsuccessfully, message: {e}")

    def get_in_url(self):
        # NOTE(review): `self.url` is never assigned in this class — confirm a
        # subclass or caller sets it before this method is used.
        self.driver.get(url=self.url)

    def login_account(self, user: str, password: str):
        """Fill the login form fields (by element name) and submit with ENTER."""
        user_element = self.driver.find_element(By.NAME, "email")
        user_element.send_keys(user)
        password_element = self.driver.find_element(By.NAME, "pass")
        password_element.send_keys(password)
        password_element.send_keys(Keys.ENTER)

    def scroll_window(self):
        """Scroll to the bottom of the page."""
        self.driver.execute_script(
            "window.scrollTo(0,document.body.scrollHeight)")

    def scroll_window_with_parameter(self, parameter_in: str):
        """Scroll down by ``parameter_in`` pixels."""
        self.driver.execute_script(f"window.scrollBy(0, {parameter_in});")

    def set_browser_zoom_percent(self, zoom_percent: int):
        """Set the page zoom, e.g. 50 for 50%."""
        zoom_percent = str(zoom_percent)
        self.driver.execute_script(
            f"document.body.style.zoom='{zoom_percent}%'")

    def move_to_element(self, element_in):
        """Move the mouse cursor onto ``element_in``."""
        ActionChains(self.driver).move_to_element(element_in).perform()

    def load_next_page(self, url:str, clear_limit:int=20):
        """>> Move on to target facebook user page,
        before moving, clean driver's requests first,
        or driver would store previous account's data.
        Args: url (str): user(kol) links"""
        i = 0
        # Retry clearing up to clear_limit times; new requests can race in.
        while i <= clear_limit:
            self.clean_requests()
            if len(self.driver.requests) == 0:
                print("Clear all driver requests already!")
                break
            i += 1
        self.driver.get(url=url)

    def click_display_button(self):
        """Repeatedly click "see more" buttons located via ``PageLocators.DISPLAY_MORE``."""
        # NOTE(review): `find_elements` normally takes (by, value); here the
        # locator tuple is passed as a single argument — confirm this is
        # intended (vs. `find_elements(*self.locator.DISPLAY_MORE)`).
        elements = self.driver.find_elements(self.locator.DISPLAY_MORE)
        for _ in range(10):
            for each_element in elements:
                if each_element.text == self.page_text.DISPLAY_MORE or each_element.text == self.page_text.DISPLAY_MORE2:
                    self.move_to_element(element_in=each_element)
                    self.scroll_window_with_parameter(parameter_in="500")
                    try:
                        each_element.click()
                        # Re-query: clicking mutates the DOM and stales elements.
                        elements = self.driver.find_elements(
                            self.locator.DISPLAY_MORE)
                    except Exception as e:
                        print(
                            f"Click display more unsucessfully, error message:\n{e}")

    def click_display_button2(self):
        """Variant of ``click_display_button`` that builds the XPath inline."""
        display_more_xpath = f"//div[@class='{PageClass.DISPLAY_MORE}' and @role='{PageRoleValue.DISPLAY_MORE}' and text()='{PageText.DISPLAY_MORE}']"
        elements = self.driver.find_elements(By.XPATH, display_more_xpath)
        for _ in range(10):
            for each_element in elements:
                if each_element.text == self.page_text.DISPLAY_MORE or each_element.text == self.page_text.DISPLAY_MORE2:
                    self.move_to_element(element_in=each_element)
                    self.scroll_window_with_parameter(parameter_in="500")
                    try:
                        each_element.click()
                        elements = self.driver.find_elements(
                            self.locator.DISPLAY_MORE)
                    except Exception as e:
                        print(
                            f"Click display more unsucessfully, error message:\n{e}")

    def click_reject_login_button(self):
        """Dismiss the login prompt overlay if it appears within 10 seconds."""
        try:
            reject_login_button = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((self.locator.CLOSELOGIN)))
            reject_login_button.click()
        except Exception as e:
            print(f"Click reject button failed, message:{e}")

    def quit_driver(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def close_driver(self):
        """Close the current browser window only."""
        self.driver.close()
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from selenium.webdriver.common.by import By
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class PageXpath(object):
    """Absolute XPaths into Facebook's markup; brittle by nature."""

    # "Close login dialog" icon.
    CLOSE_LOGIN_BUTTON = "/html/body/div[1]/div/div[1]/div/div[5]/div/div/div[1]/div/div[2]/div/div/div/div[1]/div/i"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PageClass(object):
    """CSS class-name bundles copied from Facebook's obfuscated markup.

    These change whenever Facebook redeploys its frontend, so treat them as
    fragile selectors rather than stable identifiers.
    """

    # "See more" button on a post.
    DISPLAY_MORE = "x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f"
    # Post container element.
    CONTENTS = "x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z"
    # Post caption (two markup variants observed).
    CAPTION = "x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u x1yc453h"
    CAPTION2 = "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs x126k92a"
    # Post timestamp link. (The original defined this twice with an identical
    # value; the dead duplicate assignment was removed.)
    POSTDATE = "x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g xt0b8zv xo1l8bm"
    # Reaction (likes) counter.
    LIKES = "x6s0dn4 x78zum5 x1iyjqo2 x6ikm8r x10wlt62"
    # Comment/share count containers (parent row and child cell).
    COMMENT_SHARE_PARENTS = "x9f619 x1n2onr6 x1ja2u2z x78zum5 x2lah0s x1qughib x1qjc9v5 xozqiw3 x1q0g3np xykv574 xbmpl8g x4cne27 xifccgj"
    COMMENT_SHARE_CHILD = "x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class SoupElement(object):
    """Placeholder for BeautifulSoup-based selectors; currently unused."""
    pass
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PageText(object):
    """Visible button labels matched against element text (zh-TW UI strings)."""

    # "See more" / "Show more" labels on a Traditional-Chinese Facebook UI.
    DISPLAY_MORE = "查看更多"
    DISPLAY_MORE2 = "顯示更多"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class PageRoleValue(object):
    """ARIA role attribute values used when building XPath selectors."""

    # role of the "See more" control.
    DISPLAY_MORE = "button"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class PageLocators(object):
    """Selenium locators for the scraper's page interactions."""

    # (By, selector) tuples usable with driver.find_element(*locator).
    CLOSELOGIN = (
        By.XPATH, "//div[@aria-label='Close' or @aria-label='Dismiss' or @aria-label='關閉']|//div[@role='button' and (contains(@aria-label, 'Close') or contains(@aria-label, 'Dismiss'))]|/html/body/div[2]/div/div[1]/div/div[5]/div/div/div[1]/div/div[2]/div/div/div/div[1]/div/i")
    DISPLAY_MORE = (
        By.XPATH, f"//div[@class='{PageClass.DISPLAY_MORE}' and @role='{PageRoleValue.DISPLAY_MORE}' and text()='{PageText.DISPLAY_MORE}']")

    # NOTE(review): the four constants below are parenthesized plain strings,
    # not (By, value) tuples like the locators above — confirm callers expect
    # bare XPath strings here.
    LOGGINUSR1 = (
        "/html/body/div[1]/div[1]/div[1]/div/div/div/div[2]/div/div[1]/form/div[1]/div[1]/input"
    )
    LOGGINPWD1 = (
        "/html/body/div[1]/div[1]/div[1]/div/div/div/div[2]/div/div[1]/form/div[1]/div[2]/div/input"
    )

    LOGGINUSR2 = (
        "/html/body/div[1]/div/div[1]/div/div[5]/div/div/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[3]/div/label/div/div/input")
    LOGGINPWD2 = (
        "/html/body/div[1]/div/div[1]/div/div[5]/div/div/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[4]/div/label/div/div/input")

    # version.3: facebook login page (matches login_account's By.NAME lookups)
    LOGGINUSR3 = (By.NAME, "email")
    LOGGINPWD3 = (By.NAME, "pass")