fb_scraper_request 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/PKG-INFO +2 -1
  2. fb_scraper_request-0.2.5/fb_scraper_request/base/__init__.py +0 -0
  3. fb_scraper_request-0.2.5/fb_scraper_request/base/base_page.py +90 -0
  4. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request/facebook_graphql_scraper.py +0 -1
  5. fb_scraper_request-0.2.5/fb_scraper_request/pages/__init__.py +0 -0
  6. fb_scraper_request-0.2.5/fb_scraper_request/pages/page_optional.py +131 -0
  7. fb_scraper_request-0.2.5/fb_scraper_request/tests/__init__.py +0 -0
  8. fb_scraper_request-0.2.5/fb_scraper_request/utils/__init__.py +0 -0
  9. fb_scraper_request-0.2.5/fb_scraper_request/utils/parser.py +131 -0
  10. fb_scraper_request-0.2.5/fb_scraper_request/utils/utils.py +318 -0
  11. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/PKG-INFO +2 -1
  12. fb_scraper_request-0.2.5/fb_scraper_request.egg-info/SOURCES.txt +19 -0
  13. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/requires.txt +1 -0
  14. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/pyproject.toml +5 -3
  15. fb_scraper_request-0.2.3/fb_scraper_request.egg-info/SOURCES.txt +0 -11
  16. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/LICENSE +0 -0
  17. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/README.md +0 -0
  18. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request/__init__.py +0 -0
  19. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request/example.py +0 -0
  20. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/dependency_links.txt +0 -0
  21. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/top_level.txt +0 -0
  22. {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fb_scraper_request
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Facebook GraphQL Scraper - No login required, simple API to scrape public Facebook posts
5
5
  Author-email: Nguyen Minh Quang <quangforwork1203@gmail.com>
6
6
  License: MIT
@@ -24,6 +24,7 @@ License-File: LICENSE
24
24
  Requires-Dist: requests>=2.28.0
25
25
  Requires-Dist: pytz
26
26
  Requires-Dist: pip>=26.0.1
27
+ Requires-Dist: beautifulsoup4>=4.12.0
27
28
  Provides-Extra: dev
28
29
  Requires-Dist: build>=0.8.0; extra == "dev"
29
30
  Requires-Dist: twine>=4.0.0; extra == "dev"
@@ -0,0 +1,90 @@
1
+ # -*- coding:utf-8 -*-
2
+ from selenium.webdriver.chrome.options import Options as ChromeOptions
3
+ from selenium.webdriver.chrome.service import Service
4
+
5
+
6
class BasePage:
    """Create and own a selenium-wire Chrome driver.

    ``driver_path`` may point at a chromedriver executable or at a Chrome
    browser binary. A falsy value lets Selenium use its default ``Service()``.
    """

    # Lower-cased, forward-slash path endings treated as a Chrome *browser*
    # binary rather than a chromedriver executable.
    _CHROME_BINARY_SUFFIXES = ("/google chrome", "/chrome", "/chrome.exe")

    def __init__(self, driver_path: str, open_browser: bool = False):
        """Build the Chrome options/service pair and start the driver.

        Args:
            driver_path: Path to chromedriver or to a Chrome binary; may be
                empty.
            open_browser: When False (default) Chrome is started headless.
        """
        chrome_options = self._build_options(open_browser)
        normalized_driver_path = self._normalize_path(driver_path)

        if self._looks_like_chrome_binary(normalized_driver_path):
            # The path is the browser itself: hand it to Chrome options and
            # use a default Service for the driver.
            chrome_options.binary_location = normalized_driver_path
            service = Service()
        elif normalized_driver_path:
            service = Service(normalized_driver_path)
        else:
            service = Service()

        self.driver = self._build_driver(service=service, chrome_options=chrome_options)
        self.driver.maximize_window()

    # NOTE: annotations below are quoted on purpose. ``str | None`` is
    # evaluated when the function is defined and raises TypeError on
    # Python 3.9, which this package declares as supported.
    @staticmethod
    def _build_options(open_browser: bool) -> "ChromeOptions":
        """Return the Chrome options used for every session."""
        options = ChromeOptions()
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-blink-features=AutomationControlled")
        if not open_browser:
            options.add_argument("--headless=new")
        options.add_argument("--blink-settings=imagesEnabled=false")
        return options

    @staticmethod
    def _normalize_path(path: "str | None") -> "str | None":
        """Un-escape ``\\ `` sequences and trim whitespace.

        Returns None for a missing, empty or whitespace-only path.
        """
        if not path:
            return None
        # In notebook strings users often escape spaces (e.g. "Google\ Chrome").
        cleaned = path.replace("\\ ", " ").strip()
        return cleaned or None

    @staticmethod
    def _looks_like_chrome_binary(path: "str | None") -> bool:
        """Tell whether ``path`` names a Chrome browser binary.

        Comparison is case-insensitive and accepts both ``/`` and ``\\`` as
        separators, so ``...\\Application\\chrome.exe`` is recognised too.
        ``chromedriver`` does not match because a separator must directly
        precede the binary name.
        """
        if not path:
            return False
        normalized = path.lower().replace("\\", "/")
        return normalized.endswith(BasePage._CHROME_BINARY_SUFFIXES)

    @staticmethod
    def _build_driver(service: "Service", chrome_options: "ChromeOptions"):
        """Start selenium-wire's Chrome, patching a known TLS attribute gap once."""
        from seleniumwire import webdriver

        try:
            return webdriver.Chrome(service=service, options=chrome_options)
        except AttributeError as exc:
            if "VERSION_CHOICES" not in str(exc):
                raise
            # Some selenium-wire/pyOpenSSL combinations expose this at runtime.
            BasePage._patch_seleniumwire_tls_version_choices()
            return webdriver.Chrome(service=service, options=chrome_options)

    @staticmethod
    def _patch_seleniumwire_tls_version_choices() -> None:
        """Install ``tls.VERSION_CHOICES`` on selenium-wire's bundled mitmproxy.

        Does nothing when the attribute already exists. Only constants that
        the installed pyOpenSSL actually provides are used.
        """
        from OpenSSL import SSL
        from seleniumwire.thirdparty.mitmproxy.net import tls

        if hasattr(tls, "VERSION_CHOICES"):
            return

        basic_options = SSL.OP_CIPHER_SERVER_PREFERENCE
        if hasattr(SSL, "OP_NO_COMPRESSION"):
            basic_options |= SSL.OP_NO_COMPRESSION

        default_method = getattr(SSL, "SSLv23_METHOD", getattr(SSL, "TLS_METHOD", None))
        default_options = basic_options
        if hasattr(SSL, "OP_NO_SSLv2"):
            default_options |= SSL.OP_NO_SSLv2
        if hasattr(SSL, "OP_NO_SSLv3"):
            default_options |= SSL.OP_NO_SSLv3

        version_choices = {
            "all": (default_method, basic_options),
            "secure": (default_method, default_options),
        }

        for name in ("TLSv1", "TLSv1_1", "TLSv1_2"):
            method = getattr(SSL, f"{name}_METHOD", None)
            if method is not None:
                version_choices[name] = (method, basic_options)

        tls.VERSION_CHOICES = version_choices
@@ -3,7 +3,6 @@ import re
3
3
  import requests
4
4
  from datetime import datetime
5
5
  from fb_scraper_request.utils.parser import RequestsParser
6
- from fb_scraper_request.utils.locator import *
7
6
  from fb_scraper_request.utils.utils import *
8
7
 
9
8
 
@@ -0,0 +1,131 @@
1
+ # -*- coding: utf-8 -*-
2
+ from fb_scraper_request.utils.locator import *
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.webdriver.support.ui import WebDriverWait
5
+ from selenium.webdriver.support import expected_conditions as EC
6
+ from selenium.webdriver.common.action_chains import ActionChains
7
+ from selenium.webdriver.common.keys import Keys
8
+ import time
9
+
10
+
11
class PageOptional(object):
    """Browser-side helpers (login, scrolling, clicking) around a Selenium driver."""

    def __init__(self, driver=None, fb_account: str = None, fb_pwd: str = None):
        """Store the driver and, when both credentials are given, log in.

        Args:
            driver: Selenium (selenium-wire) driver used by every helper.
            fb_account: Facebook account name / e-mail; optional.
            fb_pwd: Facebook password; optional.
        """
        self.locator = PageLocators
        self.xpath_elements = PageXpath
        self.class_elements = PageClass
        self.page_text = PageText
        self.driver = driver
        self.fb_account = fb_account
        self.fb_pwd = fb_pwd

        # Log in only when both credentials were supplied.
        if self.fb_account and self.fb_pwd:
            login_page_url = "https://www.facebook.com/login"
            self.driver.get(url=login_page_url)
            self.login_page()

    def login_page(self):
        """Submit the stored credentials; failures are printed, not raised."""
        try:
            self.login_account(
                user=self.fb_account,
                password=self.fb_pwd,
            )
            time.sleep(5)
        except Exception as e:
            print(f"Login faield, message: {e}")

    def clean_requests(self):
        """Drop the requests captured by the driver, printing progress."""
        print(
            f"Before cleaning driver requests, the number of requests are: {len(self.driver.requests)}"
        )
        try:
            print("Try to clear driver requests..")
            del self.driver.requests
            print(f"Clear, the number of requests are: {len(self.driver.requests)}")
        except Exception as e:
            print(f"Clear unsuccessfully, message: {e}")

    def get_in_url(self, url: str = None):
        """Navigate the driver to ``url``.

        Args:
            url: Target address. When omitted, ``self.url`` is used if a
                caller has set it (this class never sets it itself).

        Raises:
            ValueError: If neither ``url`` nor ``self.url`` is available.
        """
        target = url if url is not None else getattr(self, "url", None)
        if not target:
            raise ValueError("get_in_url() needs a URL: pass `url` or set `self.url` first")
        self.driver.get(url=target)

    def login_account(self, user: str, password: str):
        """Type the credentials into the ``email``/``pass`` inputs and press Enter."""
        user_element = self.driver.find_element(By.NAME, "email")
        user_element.send_keys(user)
        password_element = self.driver.find_element(By.NAME, "pass")
        password_element.send_keys(password)
        password_element.send_keys(Keys.ENTER)

    def scroll_window(self):
        """Scroll to the bottom of the document."""
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

    def scroll_window_with_parameter(self, parameter_in: str):
        """Scroll vertically by ``parameter_in`` pixels."""
        self.driver.execute_script(f"window.scrollBy(0, {parameter_in});")

    def set_browser_zoom_percent(self, zoom_percent: int):
        """Set the page body's CSS zoom to ``zoom_percent`` percent."""
        zoom_percent = str(zoom_percent)
        self.driver.execute_script(f"document.body.style.zoom='{zoom_percent}%'")

    def move_to_element(self, element_in):
        """Move the pointer over ``element_in``."""
        ActionChains(self.driver).move_to_element(element_in).perform()

    def load_next_page(self, url: str, clear_limit: int = 20):
        """Clear captured requests, then open ``url``.

        Requests are cleared first so data captured for the previous page is
        not mixed with the next one. Clearing is attempted until the request
        list is empty, at most ``clear_limit + 1`` times.

        Args:
            url: Page to open (user / KOL link).
            clear_limit: Upper bound on extra clearing attempts.
        """
        for _ in range(clear_limit + 1):
            self.clean_requests()
            if len(self.driver.requests) == 0:
                print("Clear all driver requests already!")
                break
        self.driver.get(url=url)

    def _click_display_more(self, find_elements):
        """Click every "display more" element returned by ``find_elements``.

        ``find_elements`` is called again at the start of each of the 10
        passes so elements invalidated by an earlier click are not reused.
        Any per-element failure (including stale references) is printed and
        the loop continues.
        """
        wanted_texts = (self.page_text.DISPLAY_MORE, self.page_text.DISPLAY_MORE2)
        for _ in range(10):
            for each_element in find_elements():
                try:
                    if each_element.text not in wanted_texts:
                        continue
                    self.move_to_element(element_in=each_element)
                    self.scroll_window_with_parameter(parameter_in="500")
                    each_element.click()
                except Exception as e:
                    print(f"Click display more unsucessfully, error message:\n{e}")

    def click_display_button(self):
        """Expand truncated posts located via ``PageLocators.DISPLAY_MORE``."""
        # NOTE(review): the locator is assumed to be a ``(by, value)`` pair,
        # as CLOSELOGIN is used in click_reject_login_button, so it is
        # unpacked into find_elements' two arguments. Confirm against
        # PageLocators.
        self._click_display_more(
            lambda: self.driver.find_elements(*self.locator.DISPLAY_MORE)
        )

    def click_display_button2(self):
        """Expand truncated posts located via a class/role/text XPath."""
        display_more_xpath = f"//div[@class='{PageClass.DISPLAY_MORE}' and @role='{PageRoleValue.DISPLAY_MORE}' and text()='{PageText.DISPLAY_MORE}']"
        # The same XPath is used for every re-query so each pass looks at the
        # same kind of element.
        self._click_display_more(
            lambda: self.driver.find_elements(By.XPATH, display_more_xpath)
        )

    def click_reject_login_button(self):
        """Wait up to 10 s for the close-login button and click it; failures are printed."""
        try:
            reject_login_button = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((self.locator.CLOSELOGIN))
            )
            reject_login_button.click()
        except Exception as e:
            print(f"Click reject button failed, message:{e}")

    def quit_driver(self):
        """Quit the driver."""
        self.driver.quit()

    def close_driver(self):
        """Close the current window."""
        self.driver.close()
@@ -0,0 +1,131 @@
1
+ # -*- coding: utf-8 -*-
2
+ import json
3
+ from urllib.parse import parse_qs, unquote
4
+ from fb_scraper_request.utils.utils import *
5
+
6
+
7
class RequestsParser(object):
    """Parse Facebook GraphQL responses into per-post records."""

    def __init__(self) -> None:
        # Reaction display names (Traditional Chinese UI).
        self.reaction_names = ["讚", "哈", "怒", "大心", "加油", "哇", "嗚"]
        # NOTE(review): if this list is meant to be parallel to
        # ``reaction_names``, the last two entries look swapped. Confirm
        # against the callers before relying on positional pairing.
        self.en_reaction_names = [
            "like",
            "haha",
            "angry",
            "love",
            "care",
            "sorry",
            "wow",
        ]
        # Create the result lists up front so parse_body()/collect_posts()
        # work on a fresh instance without a prior _clean_res() call.
        self._clean_res()

    def get_graphql_body_content(self, req_response, req_url):
        """Return the decoded response body split into lines.

        Returns None unless ``req_response`` is truthy and ``req_url`` is
        exactly the GraphQL endpoint.
        """
        target_url = "https://www.facebook.com/api/graphql/"
        if req_response and req_url == target_url:
            response = req_response
            # NOTE(review): ``decode`` is expected to come from the star
            # import of utils, but the utils module shown alongside this
            # file does not define it. Verify where it is provided.
            body = decode(
                response.body, response.headers.get("Content-Encoding", "identity")
            )
            body_content = body.decode("utf-8").split("\n")
            return body_content
        return None

    def _clean_res(self):
        """Reset every accumulated result list."""
        self.res_new = []
        self.feedback_list = []
        self.context_list = []
        self.creation_list = []
        self.author_id_list = []
        self.author_id_list2 = []
        self.owning_profile = []

    def parse_body(self, body_content):
        """Accumulate feedback, text, creation time and owner from body lines.

        Each non-blank line is parsed as JSON and appended to ``res_new``.
        Lines without a ``data.node`` mapping contribute nothing further.
        Text, creation time and owner are recorded only for lines that
        contain a feedback object.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
        """
        for each_body in body_content:
            # Splitting the body on "\n" commonly leaves empty fragments.
            if not each_body or not each_body.strip():
                continue
            json_data = json.loads(each_body)
            self.res_new.append(json_data)
            try:
                each_res = json_data["data"]["node"].copy()
            except (KeyError, TypeError, AttributeError):
                # Not a node payload (e.g. page_info or error lines).
                continue

            each_feedback = find_feedback_with_subscription_target_id(each_res)
            if not each_feedback:
                continue

            self.feedback_list.append(each_feedback)
            message_text = find_message_text(json_data)
            creation_time = find_creation(json_data)
            owing_profile = find_owning_profile(json_data)
            # Keep one context entry per feedback, None when there is no text.
            self.context_list.append(message_text if message_text else None)
            if creation_time:
                self.creation_list.append(creation_time)
            self.owning_profile.append(owing_profile)

    def collect_posts(self):
        """Return one summary dict per collected feedback object.

        ``post_id`` is required; the other fields are None when the feedback
        object does not carry them.
        """
        optional_fields = (
            "reaction_count",
            "top_reactions",
            "share_count",
            "comment_rendering_instance",
            "video_view_count",
        )
        res_out = []
        for each in self.feedback_list:
            record = {"post_id": each["subscription_target_id"]}
            for field in optional_fields:
                record[field] = each.get(field)
            res_out.append(record)
        return res_out

    def convert_res_to_df(self, res_in):
        """Project each item onto the selected fields (plain dicts, no pandas)."""
        selected_fields = [
            "post_id",
            "reaction_count",
            "comment_rendering_instance",
            "share_count",
            "top_reactions",
            "video_view_count",
        ]
        return [
            {field: item.get(field) for field in selected_fields}
            for item in res_in
        ]

    def process_reactions(self, reactions_in) -> dict:
        """Map each reaction's localized name to its count.

        Args:
            reactions_in: Iterable of dicts with ``node.localized_name`` and
                ``reaction_count``.

        Returns:
            dict: ``{localized_name: reaction_count}``.
        """
        return {
            each_react["node"]["localized_name"]: each_react["reaction_count"]
            for each_react in reactions_in
        }

    def extract_first_payload(self, payload: str):
        """Parse a form-encoded request body into a flat dict.

        The first value of every key is kept and ``variables`` is decoded
        from JSON. ``parse_qs`` already percent-decodes keys and values, so
        no second decoding pass is applied; one would corrupt values that
        contain a literal ``%``. The payload is not printed.

        Raises:
            KeyError: If the payload has no ``variables`` field.
        """
        parsed_data = parse_qs(payload)
        first_payload = {key: values[0] for key, values in parsed_data.items()}
        first_payload["variables"] = json.loads(first_payload["variables"])
        return first_payload
@@ -0,0 +1,318 @@
1
+ # -*- coding: utf-8 -*-
2
+ import concurrent.futures as futures
3
+ import requests
4
+ import re
5
+ from bs4 import BeautifulSoup
6
+ from datetime import datetime, timedelta
7
+ import pytz
8
+ import time
9
+ import json
10
+
11
+
12
+ # if key: 'subscription_target_id' in feedback, store this feedback
13
def find_feedback_with_subscription_target_id(data):
    """Depth-first search for a ``feedback`` dict holding ``subscription_target_id``.

    Args:
        data: Arbitrarily nested dicts/lists (decoded JSON).

    Returns:
        The first matching ``feedback`` dict, or None when there is none.
    """
    if isinstance(data, dict):
        feedback = data.get('feedback')
        # Membership is tested on the dict itself; no key list is built.
        if isinstance(feedback, dict) and 'subscription_target_id' in feedback:
            return feedback
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        # Scalars cannot contain a feedback object.
        return None

    for child in children:
        result = find_feedback_with_subscription_target_id(child)
        if result:
            return result
    return None
35
+
36
+
37
def find_message_text(data):
    """Return ``story.message.text`` from the first place it is found.

    The structure is searched depth-first. A dict whose ``story`` has a
    ``message`` dict containing ``text`` yields that text directly;
    otherwise the search continues and the first truthy nested result wins.
    Returns None when nothing matches.
    """
    if isinstance(data, list):
        children = data
    elif isinstance(data, dict):
        story = data.get('story')
        if isinstance(story, dict):
            message = story.get('message')
            if isinstance(message, dict) and 'text' in message:
                return message['text']
        # No direct hit on this level: look inside every value.
        children = data.values()
    else:
        return None

    for child in children:
        found = find_message_text(child)
        if found:
            return found
    # Nothing matched anywhere below this node.
    return None
61
+
62
+
63
def find_creation(data):
    """Return ``story.creation_time`` from the first place it is found.

    The structure is searched depth-first. A dict whose ``story`` is a dict
    with a ``creation_time`` key yields that value directly; otherwise the
    first truthy nested result is returned. Returns None when nothing
    matches.
    """
    if isinstance(data, list):
        children = data
    elif isinstance(data, dict):
        story = data.get('story')
        if isinstance(story, dict) and 'creation_time' in story:
            return story['creation_time']
        # No creation time on this level: descend into the values.
        children = data.values()
    else:
        return None

    for child in children:
        found = find_creation(child)
        if found:
            return found
    return None
86
+
87
+
88
def find_actors(data):
    """Return the actor id from the first ``story.actors`` entry found.

    ``actors`` is accepted either as a dict with an ``id`` key or as a
    list/tuple whose first dict element carries ``id``.
    NOTE(review): the payload shape is not visible in this module. A list is
    presumably what the API returns, and the previous code failed on it.
    Confirm against a captured response.

    Args:
        data: Arbitrarily nested dicts/lists (decoded JSON).

    Returns:
        The actor id, or None when no usable ``actors`` entry exists.
    """
    if isinstance(data, dict):
        story = data.get('story')
        if isinstance(story, dict) and 'actors' in story:
            actors = story['actors']
            if isinstance(actors, (list, tuple)):
                # Use the first actor that actually has an id.
                actors = next(
                    (a for a in actors if isinstance(a, dict) and 'id' in a),
                    None,
                )
            if isinstance(actors, dict) and 'id' in actors:
                return actors['id']
        # No usable actors on this level: keep searching the values.
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return None

    for child in children:
        result = find_actors(child)
        if result:
            return result
    return None
111
+
112
+
113
def find_owning_profile(data):
    """Return the first ``owning_profile`` dict found.

    The structure is searched depth-first. A dict whose ``owning_profile``
    value is itself a dict yields that value directly; otherwise the first
    truthy nested result is returned. Returns None when nothing matches.
    """
    if isinstance(data, list):
        children = data
    elif isinstance(data, dict):
        profile = data.get('owning_profile')
        if isinstance(profile, dict):
            return profile
        # No profile dict on this level: descend into the values.
        children = data.values()
    else:
        return None

    for child in children:
        found = find_owning_profile(child)
        if found:
            return found
    return None
136
+
137
+
138
def timeout(timelimit):
    """Decorator factory: fail a call that runs longer than ``timelimit`` seconds.

    The wrapped function runs in a single worker thread. If no result is
    available in time, ``'Time out!'`` is printed and ``TimeoutError`` is
    raised to the caller straight away. The worker itself cannot be killed
    and keeps running in the background until the function returns.

    Args:
        timelimit: Seconds to wait for the result (None waits forever).

    Raises:
        TimeoutError: From the decorated call when the limit is exceeded.
    """
    import functools

    def decorator(func):
        @functools.wraps(func)
        def decorated(*args, **kwargs):
            # Deliberately not a ``with`` block: leaving one calls
            # shutdown(wait=True), which would block until ``func`` finishes
            # and defeat the timeout.
            executor = futures.ThreadPoolExecutor(max_workers=1)
            try:
                future = executor.submit(func, *args, **kwargs)
                try:
                    return future.result(timelimit)
                except futures.TimeoutError:
                    print('Time out!')
                    raise TimeoutError from None
            finally:
                # Release the executor without waiting for the worker.
                executor.shutdown(wait=False)
        return decorated
    return decorator
155
+
156
+
157
def get_current_time(timezone="Asia/Taipei"):
    """Return the current time as an aware datetime in ``timezone``.

    Args:
        timezone: Time zone name passed to ``pytz.timezone``.

    Returns:
        datetime: Timezone-aware "now" in the requested zone.
    """
    # datetime.now(tz) yields the same instant as converting a UTC "now",
    # without the deprecated datetime.utcnow().
    return datetime.now(pytz.timezone(timezone))
163
+
164
+
165
def days_difference_from_now(tmp_creation_array: list) -> int:
    """Return the whole days elapsed since the earliest timestamp.

    Args:
        tmp_creation_array (list): Epoch timestamps in seconds; the smallest
            one is used.

    Returns:
        int: Number of days between that timestamp and now (local time).
    """
    earliest = min(tmp_creation_array)
    now = datetime.now()
    elapsed = now - datetime.fromtimestamp(earliest)
    return elapsed.days
179
+
180
+
181
def is_date_exceed_limit(max_days_ago, days_limit: int = 61):
    """Return True when ``max_days_ago`` is strictly greater than ``days_limit``."""
    return True if max_days_ago > days_limit else False
185
+
186
def pause(pause_time: int = 1):
    """Sleep for ``pause_time`` seconds (default 1)."""
    seconds = pause_time
    time.sleep(seconds)
188
+
189
+
190
def _build_timeline_payload(doc_id_in, id_in, before_time, cursor_in):
    """Build the GraphQL form payload shared by the first and follow-up requests.

    The two requests differ only in ``cursor``; key order is kept fixed so
    the serialized ``variables`` string is stable.
    """
    variables_dict = {
        "afterTime": None,
        "beforeTime": before_time,
        "count": 3,
        "cursor": cursor_in,
        "feedLocation": "TIMELINE",
        "feedbackSource": 0,
        "focusCommentID": None,
        "memorializedSplitTimeFilter": None,
        "omitPinnedPost": True,
        "postedBy": {"group": "OWNER"},
        "privacy": {"exclusivity": "INCLUSIVE", "filter": "ALL"},
        "privacySelectorRenderLocation": "COMET_STREAM",
        "renderLocation": "timeline",
        "scale": 3,
        "stream_count": 1,
        "taggedInOnly": False,
        "useDefaultActor": False,
        "id": id_in,
        "__relay_internal__pv__CometImmersivePhotoCanUserDisable3DMotionrelayprovider": False,
        "__relay_internal__pv__IsWorkUserrelayprovider": False,
        "__relay_internal__pv__IsMergQAPollsrelayprovider": False,
        "__relay_internal__pv__CometUFIReactionsEnableShortNamerelayprovider": False,
        "__relay_internal__pv__CometUFIShareActionMigrationrelayprovider": False,
        "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider": False,
        "__relay_internal__pv__StoriesTrayShouldShowMetadatarelayprovider": False,
        "__relay_internal__pv__StoriesRingrelayprovider": False,
        "__relay_internal__pv__EventCometCardImage_prefetchEventImagerelayprovider": False,
    }
    return {
        "variables": json.dumps(variables_dict),
        "doc_id": doc_id_in,
    }


def get_payload(doc_id_in: str, id_in: str, before_time: str = None):
    """Return the payload for the first timeline request (no cursor).

    Args:
        doc_id_in: GraphQL document id, sent as ``doc_id``.
        id_in: Value placed in the ``id`` variable.
        before_time: Value placed in ``beforeTime``; None leaves it null.

    Returns:
        dict: ``{"variables": <JSON string>, "doc_id": doc_id_in}``.
    """
    return _build_timeline_payload(doc_id_in, id_in, before_time, None)


def get_next_payload(
    doc_id_in: str,
    id_in: str,
    before_time: str,
    cursor_in: str
):
    """Return the payload for a follow-up request continuing at ``cursor_in``.

    Args:
        doc_id_in: GraphQL document id, sent as ``doc_id``.
        id_in: Value placed in the ``id`` variable.
        before_time: Value placed in ``beforeTime``.
        cursor_in: Value placed in ``cursor``.

    Returns:
        dict: ``{"variables": <JSON string>, "doc_id": doc_id_in}``.
    """
    return _build_timeline_payload(doc_id_in, id_in, before_time, cursor_in)
267
+
268
def get_next_cursor(body_content_in):
    """Return ``data.page_info.end_cursor`` from the last line that has page info.

    Lines are examined from last to first. Lines that are blank, not valid
    JSON, or lack ``data.page_info`` are skipped.

    Args:
        body_content_in: Sequence of response-body lines (JSON strings).

    Returns:
        The ``end_cursor`` value of the first usable line from the end
        (None if that line has no such key), or None when no line has page
        info.
    """
    for raw_line in reversed(body_content_in):
        try:
            return json.loads(raw_line).get("data").get("page_info").get("end_cursor")
        except (AttributeError, ValueError, TypeError):
            # ValueError covers json.JSONDecodeError for blank/garbled lines.
            continue
    return None
277
+
278
def get_next_page_status(body_content):
    """Return ``data.page_info.has_next_page`` from the first line that has page info.

    Lines that are blank, not valid JSON, or lack ``data.page_info`` are
    skipped.

    Args:
        body_content: Iterable of response-body lines (JSON strings).

    Returns:
        The ``has_next_page`` value, or True when no line carries page info.
    """
    for each_body in body_content:
        try:
            return json.loads(each_body).get("data").get("page_info").get("has_next_page")
        except (AttributeError, ValueError, TypeError):
            # Only "not a page_info line" failures are skipped.
            continue
    # The API response sometimes omits page info; assume more pages exist so
    # the caller keeps going.
    return True
288
+
289
+
290
def compare_timestamp(timestamp: int, days_limit: int, display_progress: bool) -> bool:
    """Tell whether ``timestamp`` is older than ``days_limit`` days (UTC dates).

    Args:
        timestamp: Epoch seconds of the post.
        days_limit: How many days back posts are wanted.
        display_progress: When True, print how many days remain.

    Returns:
        bool: True when the post's UTC date is before today minus
        ``days_limit`` days.
    """
    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    # Aware UTC datetimes replace the deprecated utcfromtimestamp()/utcnow();
    # the resulting dates are the same.
    timestamp_date = datetime.fromtimestamp(timestamp, tz=timezone.utc).date()
    current_date = datetime.now(timezone.utc).date()
    past_date = current_date - timedelta(days=days_limit)
    if display_progress:
        days_remaining = (timestamp_date - past_date).days
        if days_remaining > 0:
            print(f"{days_remaining} more days of posts to collect.")
        else:
            print("Target days reached or exceeded.")
    return timestamp_date < past_date
301
+
302
+
303
def get_before_time(time_zone='Asia/Taipei'):
    """Return the current epoch time, in whole seconds, as a string.

    ``time_zone`` is resolved with ``pytz.timezone`` and used to build the
    aware "now" whose timestamp is taken.
    """
    zone = pytz.timezone(time_zone)
    epoch_seconds = int(datetime.now(zone).timestamp())
    return str(epoch_seconds)
308
+
309
def get_posts_image(post_id: str, request_timeout: float = 10):
    """Return the ``scontent`` image URLs shown in a post's embed plugin page.

    Opening the plugin URL in a browser shows the markup being scraped.

    Args:
        post_id: Post identifier inserted into the plugin URL.
        request_timeout: Seconds to wait for the HTTP response, so an
            unresponsive server cannot hang the caller.

    Returns:
        list: ``src`` attribute values that start with ``https://scontent``;
        empty when the page has none.
    """
    # NOTE(review): the page slug ("toolbox003") is hard-coded. Presumably
    # the plugin resolves the post from its id alone; confirm.
    url = f"https://www.facebook.com/plugins/post.php?href=https%3A%2F%2Fwww.facebook.com%2Ftoolbox003%2Fposts%2F{post_id}&show_text=true&width=800"
    response = requests.get(url=url, timeout=request_timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    pattern = re.compile(r"^https://scontent")
    return [tag['src'] for tag in soup.find_all(src=pattern)]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fb_scraper_request
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Facebook GraphQL Scraper - No login required, simple API to scrape public Facebook posts
5
5
  Author-email: Nguyen Minh Quang <quangforwork1203@gmail.com>
6
6
  License: MIT
@@ -24,6 +24,7 @@ License-File: LICENSE
24
24
  Requires-Dist: requests>=2.28.0
25
25
  Requires-Dist: pytz
26
26
  Requires-Dist: pip>=26.0.1
27
+ Requires-Dist: beautifulsoup4>=4.12.0
27
28
  Provides-Extra: dev
28
29
  Requires-Dist: build>=0.8.0; extra == "dev"
29
30
  Requires-Dist: twine>=4.0.0; extra == "dev"
@@ -0,0 +1,19 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ fb_scraper_request/__init__.py
5
+ fb_scraper_request/example.py
6
+ fb_scraper_request/facebook_graphql_scraper.py
7
+ fb_scraper_request.egg-info/PKG-INFO
8
+ fb_scraper_request.egg-info/SOURCES.txt
9
+ fb_scraper_request.egg-info/dependency_links.txt
10
+ fb_scraper_request.egg-info/requires.txt
11
+ fb_scraper_request.egg-info/top_level.txt
12
+ fb_scraper_request/base/__init__.py
13
+ fb_scraper_request/base/base_page.py
14
+ fb_scraper_request/pages/__init__.py
15
+ fb_scraper_request/pages/page_optional.py
16
+ fb_scraper_request/tests/__init__.py
17
+ fb_scraper_request/utils/__init__.py
18
+ fb_scraper_request/utils/parser.py
19
+ fb_scraper_request/utils/utils.py
@@ -1,6 +1,7 @@
1
1
  requests>=2.28.0
2
2
  pytz
3
3
  pip>=26.0.1
4
+ beautifulsoup4>=4.12.0
4
5
 
5
6
  [dev]
6
7
  build>=0.8.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "fb_scraper_request"
3
- version = "0.2.3"
3
+ version = "0.2.5"
4
4
  description = "Facebook GraphQL Scraper - No login required, simple API to scrape public Facebook posts"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.9"
@@ -25,6 +25,7 @@ dependencies = [
25
25
  "requests>=2.28.0",
26
26
  "pytz",
27
27
  "pip>=26.0.1",
28
+ "beautifulsoup4>=4.12.0",
28
29
  ]
29
30
 
30
31
  [project.optional-dependencies]
@@ -42,5 +43,6 @@ Issues = "https://github.com/DOCUTEE/fb_crawl_request/issues"
42
43
  requires = ["setuptools>=45", "wheel"]
43
44
  build-backend = "setuptools.build_meta"
44
45
 
45
- [tool.setuptools]
46
- packages = ["fb_scraper_request"]
46
+ [tool.setuptools.packages.find]
47
+ where = ["."]
48
+ include = ["fb_scraper_request*"]
@@ -1,11 +0,0 @@
1
- LICENSE
2
- README.md
3
- pyproject.toml
4
- fb_scraper_request/__init__.py
5
- fb_scraper_request/example.py
6
- fb_scraper_request/facebook_graphql_scraper.py
7
- fb_scraper_request.egg-info/PKG-INFO
8
- fb_scraper_request.egg-info/SOURCES.txt
9
- fb_scraper_request.egg-info/dependency_links.txt
10
- fb_scraper_request.egg-info/requires.txt
11
- fb_scraper_request.egg-info/top_level.txt