fb_scraper_request 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,118 @@
1
+ # -*- coding: utf-8 -*-
2
+ import pandas as pd
3
+ from seleniumwire.utils import decode
4
+ import json
5
+ from urllib.parse import parse_qs, unquote
6
+ from fb_graphql_scraper.utils.utils import *
7
+
8
+
9
class RequestsParser(object):
    """Parse intercepted Facebook GraphQL responses into post records.

    Accumulates feedback blocks, post text, creation times and owning-profile
    info from raw GraphQL response bodies, and converts the collected feedback
    into a pandas DataFrame.
    """

    def __init__(self) -> None:
        # Localized (zh-TW) reaction labels and their English equivalents,
        # index-aligned (讚=like, 哈=haha, 怒=angry, 大心=love, ...).
        self.reaction_names = ["讚", "哈", "怒", "大心", "加油", "哇", "嗚"]
        self.en_reaction_names = ["like", "haha", "angry", "love", "care", "sorry", "wow"]

    def get_graphql_body_content(self, req_response, req_url):
        """Decode a captured GraphQL response body into a list of JSON lines.

        Args:
            req_response: selenium-wire response object (has .body/.headers).
            req_url: URL of the intercepted request.

        Returns:
            list[str]: newline-separated JSON documents, or None when the
            request is not a GraphQL call or has no response.
        """
        target_url = "https://www.facebook.com/api/graphql/"
        if req_response and req_url == target_url:
            body = decode(
                req_response.body,
                req_response.headers.get('Content-Encoding', 'identity'),
            )
            return body.decode("utf-8").split("\n")
        return None

    def _clean_res(self):
        """Reset all accumulator lists before parsing a new batch."""
        self.res_new = []
        self.feedback_list = []
        self.context_list = []
        self.creation_list = []
        self.author_id_list = []
        self.author_id_list2 = []
        self.owning_profile = []

    def parse_body(self, body_content):
        """Extract feedback/message/creation/profile info from JSON lines.

        Irrelevant or malformed documents are skipped silently — the GraphQL
        stream interleaves many documents we do not care about.
        """
        for each_body in body_content:
            try:
                json_data = json.loads(each_body)
            except json.JSONDecodeError:
                # split("\n") commonly yields an empty trailing line; the
                # original code crashed here because loads() was outside
                # the try block.
                continue
            self.res_new.append(json_data)
            try:
                each_res = json_data['data']['node'].copy()
                each_feedback = find_feedback_with_subscription_target_id(
                    each_res
                )
                if each_feedback:
                    self.feedback_list.append(each_feedback)
                    message_text = find_message_text(json_data)
                    creation_time = find_creation(json_data)
                    owing_profile = find_owning_profile(json_data)
                    # Keep context_list aligned with feedback_list even when
                    # the post has no (or empty) text.
                    self.context_list.append(message_text if message_text else None)
                    if creation_time:
                        self.creation_list.append(creation_time)
                    self.owning_profile.append(owing_profile)
            except Exception:
                # Documents without data.node are expected; skip them.
                # (Intentionally does not log — the stream is noisy.)
                pass

    def collect_posts(self):
        """Flatten collected feedback blocks into a list of post dicts.

        Returns:
            list[dict]: one record per feedback block, with nested count
            structures kept as returned by the API.
        """
        res_out = []
        for each in self.feedback_list:
            res_out.append({
                "post_id": each['subscription_target_id'],
                "reaction_count": each['reaction_count'],
                "top_reactions": each['top_reactions'],
                "share_count": each['share_count'],
                "comment_rendering_instance": each['comment_rendering_instance'],
                "video_view_count": each['video_view_count'],
            })
        return res_out

    def convert_res_to_df(self, res_in):
        """Normalize post dicts into a DataFrame with a fixed column set.

        Args:
            res_in: list of dicts as produced by collect_posts().

        Returns:
            pandas.DataFrame restricted to the flattened count columns.
        """
        df_res = pd.json_normalize(res_in)
        return df_res[[
            'post_id',
            'reaction_count.count',
            'comment_rendering_instance.comments.total_count',
            'share_count.count',
            'top_reactions.edges',
            'video_view_count',
        ]]

    def process_reactions(self, reactions_in) -> dict:
        """Map each reaction's localized name to its count.

        Args:
            reactions_in: iterable of edges shaped like
                {'node': {'localized_name': str}, 'reaction_count': int}.

        Returns:
            dict: {"like": value, "haha": value, ...} keyed by the
            localized reaction name present in the response.
        """
        reaction_hash = {}
        for each_react in reactions_in:
            reaction_hash[each_react['node']['localized_name']] = \
                each_react['reaction_count']
        return reaction_hash

    def extract_first_payload(self, payload: str):
        """Decode a urlencoded GraphQL request payload.

        Takes the first value for each key and JSON-decodes the 'variables'
        field.

        Args:
            payload: raw urlencoded request body.

        Returns:
            dict with decoded keys/values; 'variables' is a parsed dict.
        """
        parsed_data = parse_qs(payload)
        # parse_qs already percent-decodes; the extra unquote is a no-op for
        # well-formed payloads but kept for doubly-encoded inputs.
        decoded_data = {
            unquote(k): [unquote(v) for v in vals]
            for k, vals in parsed_data.items()
        }
        # Keep only the first value per key.
        first_payload = {k: v[0] for k, v in decoded_data.items()}
        first_payload['variables'] = json.loads(first_payload['variables'])
        return first_payload
@@ -0,0 +1,318 @@
1
+ # -*- coding: utf-8 -*-
2
+ import concurrent.futures as futures
3
+ import requests
4
+ import re
5
+ from bs4 import BeautifulSoup
6
+ from datetime import datetime, timedelta
7
+ import pytz
8
+ import time
9
+ import json
10
+
11
+
12
# if key: 'subscription_target_id' in feedback, store this feedback
def find_feedback_with_subscription_target_id(data):
    """Depth-first search for a 'feedback' dict holding 'subscription_target_id'.

    Args:
        data: arbitrarily nested dict/list structure from a GraphQL response.

    Returns:
        The first matching feedback dict, or None when nothing matches.
    """
    if isinstance(data, dict):
        feedback = data.get('feedback')
        if isinstance(feedback, dict) and 'subscription_target_id' in feedback:
            return feedback
        # No match at this level: recurse into every value.
        for value in data.values():
            found = find_feedback_with_subscription_target_id(value)
            if found:
                return found
    elif isinstance(data, list):
        # Lists: recurse into each element in order.
        for element in data:
            found = find_feedback_with_subscription_target_id(element)
            if found:
                return found
    return None
35
+
36
+
37
def find_message_text(data):
    """Recursively locate story.message.text in a nested structure.

    Args:
        data: arbitrarily nested dict/list from a GraphQL response.

    Returns:
        The first story.message.text string found, or None.
    """
    if isinstance(data, dict):
        story = data.get('story')
        if isinstance(story, dict):
            message = story.get('message')
            if isinstance(message, dict) and 'text' in message:
                return message['text']
        # Not found here: keep searching every nested value.
        for value in data.values():
            hit = find_message_text(value)
            if hit:
                return hit
    elif isinstance(data, list):
        for element in data:
            hit = find_message_text(element)
            if hit:
                return hit
    # Nothing matched anywhere in this subtree.
    return None
61
+
62
+
63
def find_creation(data):
    """Recursively locate story.creation_time in a nested structure.

    Args:
        data: arbitrarily nested dict/list from a GraphQL response.

    Returns:
        The first creation_time value found, or None.
    """
    if isinstance(data, dict):
        story = data.get('story')
        if isinstance(story, dict) and 'creation_time' in story:
            return story['creation_time']
        # Descend into every value until something matches.
        for value in data.values():
            hit = find_creation(value)
            if hit:
                return hit
    elif isinstance(data, list):
        for element in data:
            hit = find_creation(element)
            if hit:
                return hit
    return None
86
+
87
+
88
def find_actors(data):
    """Recursively locate story.actors['id'] in a nested structure.

    NOTE(review): this indexes actors with 'id' directly — if the API
    returns 'actors' as a list (the plural name suggests it might), this
    raises TypeError. Behavior kept as-is; confirm against real responses.

    Args:
        data: arbitrarily nested dict/list from a GraphQL response.

    Returns:
        The first story.actors['id'] value found, or None.
    """
    if isinstance(data, dict):
        story = data.get('story')
        if isinstance(story, dict) and 'actors' in story:
            return story['actors']['id']
        for value in data.values():
            hit = find_actors(value)
            if hit:
                return hit
    elif isinstance(data, list):
        for element in data:
            hit = find_actors(element)
            if hit:
                return hit
    return None
111
+
112
+
113
def find_owning_profile(data):
    """Recursively locate the first dict-valued 'owning_profile' entry.

    Args:
        data: arbitrarily nested dict/list from a GraphQL response.

    Returns:
        The owning_profile dict, or None when absent.
    """
    if isinstance(data, dict):
        profile = data.get('owning_profile')
        if isinstance(profile, dict):
            return profile
        # Keep searching deeper when this level has no usable profile.
        for value in data.values():
            hit = find_owning_profile(value)
            if hit:
                return hit
    elif isinstance(data, list):
        for element in data:
            hit = find_owning_profile(element)
            if hit:
                return hit
    return None
136
+
137
+
138
def timeout(timelimit):
    """Decorator factory: run the wrapped function with a wall-clock limit.

    Args:
        timelimit: seconds to wait for the wrapped call to complete.

    Raises:
        TimeoutError: when the call does not finish within *timelimit*.
    """
    def decorator(func):
        def decorated(*args, **kwargs):
            # Run the call in a single-worker pool so the wait can be bounded
            # via Future.result(timeout).
            with futures.ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(func, *args, **kwargs)
                try:
                    result = future.result(timelimit)
                except futures.TimeoutError:
                    print('Time out!')
                    # 'from None' suppresses the futures.TimeoutError context.
                    raise TimeoutError from None
                else:
                    pass
                # HACK: clears executor-private state so a still-running
                # worker thread does not make the 'with' block (and
                # interpreter shutdown) wait on it. Relies on CPython
                # implementation details (_threads / thread._threads_queues)
                # — may break on other versions; confirm before upgrading.
                executor._threads.clear()
                futures.thread._threads_queues.clear()
            return result
        return decorated
    return decorator
155
+
156
+
157
def get_current_time(timezone="Asia/Taipei"):
    """Return the current time localized to *timezone*.

    Args:
        timezone: IANA timezone name understood by pytz.

    Returns:
        An aware datetime in the requested timezone.
    """
    target_timezone = pytz.timezone(timezone)
    # datetime.utcnow() is deprecated (Python 3.12); construct an aware UTC
    # datetime directly — equivalent to utcnow().replace(tzinfo=pytz.utc).
    return datetime.now(pytz.utc).astimezone(target_timezone)
163
+
164
+
165
def days_difference_from_now(tmp_creation_array: list) -> int:
    """Return whole days between the earliest post timestamp and now.

    Args:
        tmp_creation_array: POSIX timestamps (seconds) of collected posts.

    Returns:
        int: days elapsed since the oldest timestamp (local time).
    """
    earliest = datetime.fromtimestamp(min(tmp_creation_array))
    return (datetime.now() - earliest).days
179
+
180
+
181
def is_date_exceed_limit(max_days_ago, days_limit: int = 61):
    """Return True when the post age (in days) exceeds the allowed limit."""
    return max_days_ago > days_limit
185
+
186
def pause(pause_time: int = 1):
    """Block the calling thread for *pause_time* seconds (rate limiting)."""
    time.sleep(pause_time)
188
+
189
+
190
def _build_timeline_variables(id_in, before_time, cursor_in):
    """Build the GraphQL 'variables' dict shared by first/next-page payloads.

    Key order matters: json.dumps preserves insertion order, and both
    callers previously emitted this exact ordering.
    """
    return {
        "afterTime": None,
        "beforeTime": before_time,
        "count": 3,
        "cursor": cursor_in,
        "feedLocation": "TIMELINE",
        "feedbackSource": 0,
        "focusCommentID": None,
        "memorializedSplitTimeFilter": None,
        "omitPinnedPost": True,
        "postedBy": {"group": "OWNER"},
        "privacy": {"exclusivity": "INCLUSIVE", "filter": "ALL"},
        "privacySelectorRenderLocation": "COMET_STREAM",
        "renderLocation": "timeline",
        "scale": 3,
        "stream_count": 1,
        "taggedInOnly": False,
        "useDefaultActor": False,
        "id": id_in,
        "__relay_internal__pv__CometImmersivePhotoCanUserDisable3DMotionrelayprovider": False,
        "__relay_internal__pv__IsWorkUserrelayprovider": False,
        "__relay_internal__pv__IsMergQAPollsrelayprovider": False,
        "__relay_internal__pv__CometUFIReactionsEnableShortNamerelayprovider": False,
        "__relay_internal__pv__CometUFIShareActionMigrationrelayprovider": False,
        "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider": False,
        "__relay_internal__pv__StoriesTrayShouldShowMetadatarelayprovider": False,
        "__relay_internal__pv__StoriesRingrelayprovider": False,
        "__relay_internal__pv__EventCometCardImage_prefetchEventImagerelayprovider": False,
    }


def get_payload(doc_id_in: str, id_in: str, before_time: str = None):
    """Build the request payload for the first timeline page (no cursor).

    Args:
        doc_id_in: GraphQL persisted-query document id.
        id_in: target profile/page id.
        before_time: optional upper-bound timestamp for the feed.

    Returns:
        dict with JSON-encoded 'variables' and the 'doc_id'.
    """
    return {
        "variables": json.dumps(_build_timeline_variables(id_in, before_time, None)),
        "doc_id": doc_id_in,
    }


def get_next_payload(
    doc_id_in: str,
    id_in: str,
    before_time: str,
    cursor_in: str
):
    """Build the request payload for a subsequent timeline page.

    Identical to get_payload() except the pagination cursor is set.

    Args:
        doc_id_in: GraphQL persisted-query document id.
        id_in: target profile/page id.
        before_time: upper-bound timestamp for the feed.
        cursor_in: end_cursor returned by the previous page.

    Returns:
        dict with JSON-encoded 'variables' and the 'doc_id'.
    """
    return {
        "variables": json.dumps(_build_timeline_variables(id_in, before_time, cursor_in)),
        "doc_id": doc_id_in,
    }
267
+
268
def get_next_cursor(body_content_in):
    """Scan JSON lines from last to first for data.page_info.end_cursor.

    Args:
        body_content_in: list of JSON document strings (one per line).

    Returns:
        The first end_cursor found searching backwards, or None.
    """
    for line in reversed(body_content_in):
        try:
            doc = json.loads(line)
        except json.JSONDecodeError:
            # Blank/truncated trailing lines are common in split("\n")
            # output; the original only caught AttributeError and crashed
            # here (sibling get_next_page_status already tolerates this).
            continue
        try:
            return doc.get("data").get("page_info").get("end_cursor")
        except AttributeError:
            # data/page_info missing on this document; keep scanning.
            continue
    return None
277
+
278
def get_next_page_status(body_content):
    """Return data.page_info.has_next_page from the first parsable document.

    Args:
        body_content: list of JSON document strings.

    Returns:
        The has_next_page flag, or True when no document exposes it —
        the API sometimes omits the flag, and the caller treats "unknown"
        as "keep paginating" (to be improved).
    """
    for raw in body_content:
        try:
            document = json.loads(raw)
            return document.get("data").get("page_info").get("has_next_page")
        except Exception:
            # Unparsable lines or documents without page_info: skip.
            pass
    return True
288
+
289
+
290
def compare_timestamp(timestamp: int, days_limit: int, display_progress: bool) -> bool:
    """Check whether *timestamp* falls before the collection cutoff (UTC).

    Args:
        timestamp: POSIX timestamp of the post.
        days_limit: collection window in days, counted back from today.
        display_progress: when True, print how many days remain to collect.

    Returns:
        True when the post date precedes today-minus-days_limit.
    """
    # Local import keeps this fix self-contained (timezone is stdlib).
    from datetime import timezone

    # utcfromtimestamp()/utcnow() are deprecated (Python 3.12); use aware
    # UTC datetimes — the resulting .date() values are identical.
    timestamp_date = datetime.fromtimestamp(timestamp, tz=timezone.utc).date()
    current_date = datetime.now(timezone.utc).date()
    past_date = current_date - timedelta(days=days_limit)
    if display_progress:
        days_remaining = (timestamp_date - past_date).days
        if days_remaining > 0:
            print(f"{days_remaining} more days of posts to collect.")
        else:
            print("Target days reached or exceeded.")
    return timestamp_date < past_date
301
+
302
+
303
def get_before_time(time_zone='Asia/Taipei'):
    """Return the current POSIX timestamp as a string.

    Note: the timestamp value is timezone-independent; *time_zone* only
    affects how 'now' is obtained, matching the original implementation.

    Args:
        time_zone: IANA timezone name understood by pytz.

    Returns:
        str: integer POSIX timestamp.
    """
    now_local = datetime.now(pytz.timezone(time_zone))
    return str(int(now_local.timestamp()))
308
+
309
def get_posts_image(post_id: str):
    """Fetch image URLs embedded in a post via the public post plugin.

    Loads the Facebook post-plugin page for the post (no login required;
    open the URL in a browser to see what gets scraped) and collects any
    tag whose src points at the scontent CDN.

    Args:
        post_id: ID of the target post.

    Returns:
        list[str]: scontent image URLs found in the plugin page.

    Raises:
        requests.HTTPError: when the plugin page request fails.
    """
    # NOTE(review): the 'toolbox003' page slug is hard-coded; the plugin
    # appears to resolve the post by ID regardless — confirm for posts
    # belonging to other pages.
    url = (
        "https://www.facebook.com/plugins/post.php?href="
        f"https%3A%2F%2Fwww.facebook.com%2Ftoolbox003%2Fposts%2F{post_id}"
        "&show_text=true&width=800"
    )
    response = requests.get(url=url)
    # Original read response.status_code as a no-op; fail loudly instead of
    # silently scraping an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    pattern = re.compile(r"^https://scontent")
    return [tag['src'] for tag in soup.find_all(src=pattern)]