fb_scraper_request 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/PKG-INFO +2 -1
- fb_scraper_request-0.2.5/fb_scraper_request/base/__init__.py +0 -0
- fb_scraper_request-0.2.5/fb_scraper_request/base/base_page.py +90 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request/facebook_graphql_scraper.py +0 -1
- fb_scraper_request-0.2.5/fb_scraper_request/pages/__init__.py +0 -0
- fb_scraper_request-0.2.5/fb_scraper_request/pages/page_optional.py +131 -0
- fb_scraper_request-0.2.5/fb_scraper_request/tests/__init__.py +0 -0
- fb_scraper_request-0.2.5/fb_scraper_request/utils/__init__.py +0 -0
- fb_scraper_request-0.2.5/fb_scraper_request/utils/parser.py +131 -0
- fb_scraper_request-0.2.5/fb_scraper_request/utils/utils.py +318 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/PKG-INFO +2 -1
- fb_scraper_request-0.2.5/fb_scraper_request.egg-info/SOURCES.txt +19 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/requires.txt +1 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/pyproject.toml +5 -3
- fb_scraper_request-0.2.3/fb_scraper_request.egg-info/SOURCES.txt +0 -11
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/LICENSE +0 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/README.md +0 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request/__init__.py +0 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request/example.py +0 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/dependency_links.txt +0 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/top_level.txt +0 -0
- {fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fb_scraper_request
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Facebook GraphQL Scraper - No login required, simple API to scrape public Facebook posts
|
|
5
5
|
Author-email: Nguyen Minh Quang <quangforwork1203@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -24,6 +24,7 @@ License-File: LICENSE
|
|
|
24
24
|
Requires-Dist: requests>=2.28.0
|
|
25
25
|
Requires-Dist: pytz
|
|
26
26
|
Requires-Dist: pip>=26.0.1
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
27
28
|
Provides-Extra: dev
|
|
28
29
|
Requires-Dist: build>=0.8.0; extra == "dev"
|
|
29
30
|
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
File without changes
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# -*- coding:utf-8 -*-
|
|
2
|
+
from selenium.webdriver.chrome.options import Options as ChromeOptions
|
|
3
|
+
from selenium.webdriver.chrome.service import Service
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BasePage:
    """Create and hold a selenium-wire Chrome driver.

    The constructor builds Chrome options, decides whether ``driver_path``
    points at a chromedriver executable or at a Chrome browser binary, starts
    the driver and maximizes its window. The driver is exposed as
    ``self.driver``.

    All annotations in this class are written as strings so they are never
    evaluated when the class is defined; ``str | None`` evaluated eagerly
    raises ``TypeError`` on Python 3.9.
    """

    # Lower-cased, forward-slash path endings that identify a Chrome browser
    # binary (as opposed to a chromedriver executable).
    _CHROME_BINARY_SUFFIXES = ("/google chrome", "/chrome", "/chrome.exe")

    def __init__(self, driver_path: str, open_browser: bool = False):
        """Start Chrome.

        Args:
            driver_path: Path to chromedriver, or to a Chrome browser binary,
                or an empty value to let ``Service()`` use its defaults.
            open_browser: When False the browser runs headless with image
                loading disabled.
        """
        chrome_options = self._build_options(open_browser)
        normalized_driver_path = self._normalize_path(driver_path)

        is_browser_binary = self._looks_like_chrome_binary(normalized_driver_path)
        if is_browser_binary:
            # The path names the browser itself, not the driver.
            chrome_options.binary_location = normalized_driver_path

        if normalized_driver_path and not is_browser_binary:
            service = Service(normalized_driver_path)
        else:
            service = Service()

        self.driver = self._build_driver(service=service, chrome_options=chrome_options)
        self.driver.maximize_window()

    @staticmethod
    def _build_options(open_browser: bool) -> "ChromeOptions":
        """Return the Chrome options used for every session."""
        options = ChromeOptions()
        options.add_argument("--disable-blink-features")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-blink-features=AutomationControlled")
        if not open_browser:
            options.add_argument("--headless=new")
            options.add_argument("--blink-settings=imagesEnabled=false")
        return options

    @staticmethod
    def _normalize_path(path: "str | None") -> "str | None":
        """Strip the path and undo shell-style escaped spaces.

        Returns None for a missing, empty or whitespace-only path.
        """
        if not path:
            return None
        # In notebook strings users often escape spaces (e.g. "Google\ Chrome").
        cleaned = path.replace("\\ ", " ").strip()
        return cleaned or None

    @staticmethod
    def _looks_like_chrome_binary(path: "str | None") -> bool:
        """Return True when ``path`` ends like a Chrome browser executable.

        The comparison is case-insensitive and treats backslashes as path
        separators so Windows paths ending in ``chrome.exe`` are recognised.
        """
        if not path:
            return False
        normalized = path.lower().replace("\\", "/")
        return normalized.endswith(BasePage._CHROME_BINARY_SUFFIXES)

    @staticmethod
    def _build_driver(service: "Service", chrome_options: "ChromeOptions"):
        """Instantiate the selenium-wire Chrome driver.

        If construction fails with an AttributeError mentioning
        ``VERSION_CHOICES``, the TLS table is patched and construction is
        retried once; any other AttributeError is re-raised.
        """
        from seleniumwire import webdriver

        try:
            return webdriver.Chrome(service=service, options=chrome_options)
        except AttributeError as exc:
            if "VERSION_CHOICES" not in str(exc):
                raise
            # Some selenium-wire/pyOpenSSL combinations expose this at runtime.
            BasePage._patch_seleniumwire_tls_version_choices()
            return webdriver.Chrome(service=service, options=chrome_options)

    @staticmethod
    def _patch_seleniumwire_tls_version_choices() -> None:
        """Install ``VERSION_CHOICES`` on selenium-wire's bundled tls module.

        Does nothing when the attribute already exists. Every OpenSSL
        constant is looked up defensively because not all of them are
        present in every pyOpenSSL build.
        """
        from OpenSSL import SSL
        from seleniumwire.thirdparty.mitmproxy.net import tls

        if hasattr(tls, "VERSION_CHOICES"):
            return

        basic_options = SSL.OP_CIPHER_SERVER_PREFERENCE
        if hasattr(SSL, "OP_NO_COMPRESSION"):
            basic_options |= SSL.OP_NO_COMPRESSION

        default_method = getattr(SSL, "SSLv23_METHOD", getattr(SSL, "TLS_METHOD", None))
        default_options = basic_options
        for flag_name in ("OP_NO_SSLv2", "OP_NO_SSLv3"):
            if hasattr(SSL, flag_name):
                default_options |= getattr(SSL, flag_name)

        version_choices = {
            "all": (default_method, basic_options),
            "secure": (default_method, default_options),
        }

        for name in ("TLSv1", "TLSv1_1", "TLSv1_2"):
            method = getattr(SSL, f"{name}_METHOD", None)
            if method is not None:
                version_choices[name] = (method, basic_options)

        tls.VERSION_CHOICES = version_choices
|
|
File without changes
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
from fb_scraper_request.utils.locator import *
|
|
3
|
+
from selenium.webdriver.common.by import By
|
|
4
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
5
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
6
|
+
from selenium.webdriver.common.action_chains import ActionChains
|
|
7
|
+
from selenium.webdriver.common.keys import Keys
|
|
8
|
+
import time
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class PageOptional(object):
    """Browser-level helpers (login, scrolling, clicking) around a driver.

    ``PageLocators``, ``PageXpath``, ``PageClass``, ``PageText`` and
    ``PageRoleValue`` are expected to come from the module's star import of
    the locator module; their contents are not visible here.
    """

    def __init__(self, driver=None, fb_account: str = None, fb_pwd: str = None):
        """Store the driver and, when both credentials are given, log in.

        Args:
            driver: The WebDriver instance every method operates on.
            fb_account: Facebook account name, or None to skip login.
            fb_pwd: Facebook password, or None to skip login.
        """
        self.locator = PageLocators
        self.xpath_elements = PageXpath
        self.class_elements = PageClass
        self.page_text = PageText
        self.driver = driver
        self.fb_account = fb_account
        self.fb_pwd = fb_pwd

        # Log in only when both credentials were supplied.
        if self.fb_account and self.fb_pwd:
            login_page_url = "https://www.facebook.com/login"
            self.driver.get(url=login_page_url)
            self.login_page()

    def login_page(self):
        """Submit the stored credentials; failures are printed, not raised."""
        try:
            self.login_account(
                user=self.fb_account,
                password=self.fb_pwd,
            )
            time.sleep(5)
        except Exception as e:
            print(f"Login faield, message: {e}")

    def clean_requests(self):
        """Delete the driver's captured requests, printing counts around it."""
        print(
            f"Before cleaning driver requests, the number of requests are: {len(self.driver.requests)}"
        )
        try:
            print("Try to clear driver requests..")
            del self.driver.requests
            print(f"Clear, the number of requests are: {len(self.driver.requests)}")
        except Exception as e:
            print(f"Clear unsuccessfully, message: {e}")

    def get_in_url(self):
        """Navigate to ``self.url``.

        ``url`` is not assigned by ``__init__``; the caller must set it first.

        Raises:
            AttributeError: If ``self.url`` is missing or empty.
        """
        url = getattr(self, "url", None)
        if not url:
            raise AttributeError(
                "PageOptional.url is not set; assign it before calling "
                "get_in_url() or use load_next_page(url=...)"
            )
        self.driver.get(url=url)

    def login_account(self, user: str, password: str):
        """Type the credentials into the login form and press Enter."""
        user_element = self.driver.find_element(By.NAME, "email")
        user_element.send_keys(user)
        password_element = self.driver.find_element(By.NAME, "pass")
        password_element.send_keys(password)
        password_element.send_keys(Keys.ENTER)

    def scroll_window(self):
        """Scroll to the bottom of the document."""
        self.driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")

    def scroll_window_with_parameter(self, parameter_in: str):
        """Scroll vertically by ``parameter_in`` pixels."""
        self.driver.execute_script(f"window.scrollBy(0, {parameter_in});")

    def set_browser_zoom_percent(self, zoom_percent: int):
        """Set the page body's CSS zoom to ``zoom_percent`` percent."""
        zoom_percent = str(zoom_percent)
        self.driver.execute_script(f"document.body.style.zoom='{zoom_percent}%'")

    def move_to_element(self, element_in):
        """Move the mouse pointer onto ``element_in``."""
        ActionChains(self.driver).move_to_element(element_in).perform()

    def load_next_page(self, url: str, clear_limit: int = 20):
        """Clear captured requests, then open ``url``.

        Requests are cleared first so data captured for a previous page is
        not mixed with the next one. Clearing is attempted up to
        ``clear_limit + 1`` times and stops early once none remain.

        Args:
            url: Link of the user page to open.
            clear_limit: Highest retry index for clearing requests.
        """
        for _ in range(clear_limit + 1):
            self.clean_requests()
            if len(self.driver.requests) == 0:
                print("Clear all driver requests already!")
                break
        self.driver.get(url=url)

    def _find_by_locator(self, locator):
        """Find elements for ``locator``.

        A ``(by, value)`` pair is unpacked into the two arguments
        ``find_elements`` takes; any other value is passed through unchanged.
        NOTE(review): PageLocators is not visible here - confirm its entries
        are ``(by, value)`` pairs.
        """
        if isinstance(locator, (tuple, list)):
            return self.driver.find_elements(*locator)
        return self.driver.find_elements(locator)

    def _click_display_more(self, find_elements, passes: int = 10):
        """Click every element whose text is one of the "display more" labels.

        Args:
            find_elements: Zero-argument callable returning the candidate
                elements; it is called again after each successful click so
                the next pass works on a fresh list.
            passes: Number of passes made over the candidate list.
        """
        elements = find_elements()
        labels = (self.page_text.DISPLAY_MORE, self.page_text.DISPLAY_MORE2)
        for _ in range(passes):
            for each_element in elements:
                if each_element.text in labels:
                    self.move_to_element(element_in=each_element)
                    self.scroll_window_with_parameter(parameter_in="500")
                    try:
                        each_element.click()
                        elements = find_elements()
                    except Exception as e:
                        print(f"Click display more unsucessfully, error message:\n{e}")

    def click_display_button(self):
        """Expand "display more" elements found through the page locator."""
        self._click_display_more(
            lambda: self._find_by_locator(self.locator.DISPLAY_MORE)
        )

    def click_display_button2(self):
        """Expand "display more" elements found through a class/role/text XPath.

        The same XPath is used for the initial lookup and for every re-query.
        """
        display_more_xpath = f"//div[@class='{PageClass.DISPLAY_MORE}' and @role='{PageRoleValue.DISPLAY_MORE}' and text()='{PageText.DISPLAY_MORE}']"
        self._click_display_more(
            lambda: self.driver.find_elements(By.XPATH, display_more_xpath)
        )

    def click_reject_login_button(self):
        """Wait up to 10 seconds for the close-login button and click it.

        Failures (including the wait timing out) are printed, not raised.
        """
        try:
            reject_login_button = WebDriverWait(self.driver, 10).until(
                EC.visibility_of_element_located((self.locator.CLOSELOGIN))
            )
            reject_login_button.click()
        except Exception as e:
            print(f"Click reject button failed, message:{e}")

    def quit_driver(self):
        """Quit the driver."""
        self.driver.quit()

    def close_driver(self):
        """Close the driver's current window."""
        self.driver.close()
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import json
|
|
3
|
+
from urllib.parse import parse_qs, unquote
|
|
4
|
+
from fb_scraper_request.utils.utils import *
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class RequestsParser(object):
    """Parse captured Facebook GraphQL responses into per-post lists."""

    def __init__(self) -> None:
        # self.driver = driver
        self.reaction_names = ["讚", "哈", "怒", "大心", "加油", "哇", "嗚"]
        self.en_reaction_names = [
            "like",
            "haha",
            "angry",
            "love",
            "care",
            "sorry",
            "wow",
        ]
        # Create the result lists up front so parse_body() can be called
        # without a prior explicit _clean_res().
        self._clean_res()

    def get_graphql_body_content(self, req_response, req_url):
        """Return the decoded response body split into lines, or None.

        Only responses whose URL equals the GraphQL endpoint are handled.
        NOTE(review): ``decode`` is expected to arrive through the module's
        star import; confirm that the utils module actually provides it.
        """
        target_url = "https://www.facebook.com/api/graphql/"
        if req_response and req_url == target_url:
            response = req_response
            body = decode(
                response.body, response.headers.get("Content-Encoding", "identity")
            )
            body_content = body.decode("utf-8").split("\n")
            return body_content
        return None

    def _clean_res(self):
        """Reset every result list to empty."""
        self.res_new = []
        self.feedback_list = []
        self.context_list = []
        self.creation_list = []
        self.author_id_list = []
        self.author_id_list2 = []
        self.owning_profile = []

    def parse_body(self, body_content):
        """Parse each body line as JSON and collect post fields.

        Blank lines and lines that are not valid JSON are skipped. Every
        parsed document is appended to ``res_new``; documents without a
        ``data.node`` mapping contribute nothing else.

        NOTE(review): ``creation_list`` only receives truthy creation times
        while ``context_list`` and ``owning_profile`` receive one entry per
        node, so the lists can differ in length - confirm callers expect it.
        """
        for each_body in body_content:
            if not each_body or not each_body.strip():
                continue
            try:
                json_data = json.loads(each_body)
            except ValueError:
                # Not a JSON document (JSONDecodeError is a ValueError).
                continue
            self.res_new.append(json_data)

            try:
                each_res = json_data["data"]["node"].copy()
            except (KeyError, TypeError, AttributeError):
                # No usable "data.node" mapping in this document.
                continue

            each_feedback = find_feedback_with_subscription_target_id(each_res)
            if each_feedback:
                self.feedback_list.append(each_feedback)
                message_text = find_message_text(json_data)
                creation_time = find_creation(json_data)
                owing_profile = find_owning_profile(json_data)
                self.context_list.append(message_text if message_text else None)
                if creation_time:
                    self.creation_list.append(creation_time)
                self.owning_profile.append(owing_profile)

    def collect_posts(self):
        """Return one summary dict per collected feedback object.

        Fields other than the post id that are absent from a feedback object
        are reported as None.
        """
        optional_fields = (
            "reaction_count",
            "top_reactions",
            "share_count",
            "comment_rendering_instance",
            "video_view_count",
        )
        res_out = []
        for each in self.feedback_list:
            post = {"post_id": each["subscription_target_id"]}
            for field in optional_fields:
                post[field] = each.get(field)
            res_out.append(post)
        return res_out

    def convert_res_to_df(self, res_in):
        """Project each item onto the selected fields (plain dicts, no pandas)."""
        selected_fields = [
            "post_id",
            "reaction_count",
            "comment_rendering_instance",
            "share_count",
            "top_reactions",
            "video_view_count",
        ]
        return [
            {field: item.get(field) for field in selected_fields}
            for item in res_in
        ]

    def process_reactions(self, reactions_in) -> dict:
        """Map each reaction's localized name to its count.

        Args:
            reactions_in: Iterable of dicts with ``node.localized_name`` and
                ``reaction_count``; None is treated as empty.

        Returns:
            dict: ``{localized_name: reaction_count}``.
        """
        return {
            each_react["node"]["localized_name"]: each_react["reaction_count"]
            for each_react in (reactions_in or [])
        }

    def extract_first_payload(self, payload: str):
        """Parse a form-encoded payload into a dict of first values.

        The ``variables`` entry is additionally parsed from JSON.

        Raises:
            KeyError: If the payload has no ``variables`` field.
        """
        parsed_data = parse_qs(payload)
        print("Parsed data:", parsed_data)  # Debug: Show the parsed data
        # NOTE(review): parse_qs already percent-decodes; this second
        # unquote() decodes again - confirm payloads are double-encoded.
        decoded_data = {
            unquote(k): [unquote(v) for v in vals] for k, vals in parsed_data.items()
        }  # decode keys and values
        first_payload = {
            k: v[0] for k, v in decoded_data.items()
        }  # keep only the first value of each key
        payload_variables = json.loads(first_payload["variables"])
        first_payload["variables"] = payload_variables
        print(first_payload)
        return first_payload
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
import concurrent.futures as futures
|
|
3
|
+
import requests
|
|
4
|
+
import re
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
from datetime import datetime, timedelta
|
|
7
|
+
import pytz
|
|
8
|
+
import time
|
|
9
|
+
import json
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# if key: 'subscription_target_id' in feedback, store this feedback
|
|
13
|
+
def find_feedback_with_subscription_target_id(data):
    """Return the first ``feedback`` dict that has a ``subscription_target_id``.

    The structure is searched depth-first: a dict is checked for a matching
    ``feedback`` entry before its values are searched, and a list is searched
    element by element.

    Args:
        data: Nested dicts/lists (decoded JSON); other types never match.

    Returns:
        The matching feedback dict, or None when there is none.
    """
    if isinstance(data, dict):
        feedback = data.get('feedback')
        # Membership is tested on the dict itself; no key list is built.
        if isinstance(feedback, dict) and 'subscription_target_id' in feedback:
            return feedback
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return None

    for child in children:
        result = find_feedback_with_subscription_target_id(child)
        if result:
            return result

    # No matching feedback anywhere below this node.
    return None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def find_message_text(data):
    """Return the first ``story.message.text`` value found in ``data``.

    A dict whose ``story`` is a dict holding a dict ``message`` with a
    ``text`` key yields that text immediately (whatever its value).
    Otherwise the dict's values, or a list's items, are searched in order
    and the first truthy result is returned. None is returned when nothing
    matches.
    """
    if isinstance(data, list):
        candidates = data
    elif isinstance(data, dict):
        story = data.get('story')
        message = story.get('message') if isinstance(story, dict) else None
        if isinstance(message, dict) and 'text' in message:
            return message['text']
        candidates = data.values()
    else:
        return None

    # Lazily walk the children and stop at the first truthy hit.
    return next(
        (found for found in map(find_message_text, candidates) if found),
        None,
    )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def find_creation(data):
    """Return the first ``story.creation_time`` value found in ``data``.

    A dict whose ``story`` is a dict containing ``creation_time`` yields that
    value immediately. Otherwise the dict's values, or a list's items, are
    searched in order and the first truthy result is returned. None is
    returned when nothing matches or ``data`` is neither a dict nor a list.
    """
    if not isinstance(data, (dict, list)):
        return None

    if isinstance(data, dict):
        story = data.get('story')
        if isinstance(story, dict) and 'creation_time' in story:
            return story['creation_time']
        children = data.values()
    else:
        children = data

    for child in children:
        hit = find_creation(child)
        if hit:
            return hit
    return None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def find_actors(data):
    """Return the first actor id found under a ``story.actors`` entry.

    ``actors`` may be a single dict with an ``id`` key or a list of such
    dicts; for a list the first element carrying an ``id`` wins. When a
    ``story`` has no usable actor id the search continues through the
    remaining nested values instead of raising.

    Args:
        data: Nested dicts/lists (decoded JSON); other types never match.

    Returns:
        The actor id, or None when none is found.
    """
    if isinstance(data, dict):
        story = data.get('story')
        if isinstance(story, dict) and 'actors' in story:
            actors = story['actors']
            if isinstance(actors, dict):
                if 'id' in actors:
                    return actors['id']
            elif isinstance(actors, list):
                # presumably a list of actor objects; take the first id
                for actor in actors:
                    if isinstance(actor, dict) and 'id' in actor:
                        return actor['id']
        children = data.values()
    elif isinstance(data, list):
        children = data
    else:
        return None

    for child in children:
        result = find_actors(child)
        if result:
            return result
    return None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def find_owning_profile(data):
    """Return the first dict stored under an ``owning_profile`` key.

    A dict whose ``owning_profile`` value is itself a dict yields that value
    immediately. Otherwise the dict's values, or a list's items, are searched
    in order and the first truthy result is returned. None is returned when
    nothing matches.
    """
    if isinstance(data, dict):
        profile = data.get('owning_profile')
        if isinstance(profile, dict):
            return profile
        pending = data.values()
    elif isinstance(data, list):
        pending = data
    else:
        return None

    for node in pending:
        match = find_owning_profile(node)
        if match:
            return match
    return None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def timeout(timelimit):
    """Decorator factory: fail a call that takes longer than ``timelimit``.

    The wrapped function runs in a single worker thread. If no result is
    available within ``timelimit`` seconds, ``'Time out!'`` is printed and
    ``TimeoutError`` is raised to the caller.

    The executor is shut down with ``wait=False`` so the caller is released
    as soon as the limit expires; a thread cannot be killed, so the timed-out
    call keeps running in the background until it finishes on its own.

    Args:
        timelimit: Maximum number of seconds to wait for the result.

    Raises:
        TimeoutError: When the wrapped call exceeds ``timelimit``.
    """
    import functools

    def decorator(func):
        @functools.wraps(func)
        def decorated(*args, **kwargs):
            executor = futures.ThreadPoolExecutor(max_workers=1)
            try:
                future = executor.submit(func, *args, **kwargs)
                return future.result(timelimit)
            except futures.TimeoutError:
                print('Time out!')
                raise TimeoutError from None
            finally:
                # Never block here: waiting for the worker would defeat the
                # purpose of the timeout.
                executor.shutdown(wait=False)
        return decorated
    return decorator
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def get_current_time(timezone="Asia/Taipei"):
    """Return the current time as an aware datetime in ``timezone``.

    The aware value is obtained directly from ``datetime.now(tz)`` rather
    than by tagging a naive ``utcnow()`` result, which is deprecated.

    Args:
        timezone: Time zone name understood by ``pytz.timezone``.

    Returns:
        datetime: Timezone-aware current time in the requested zone.
    """
    target_timezone = pytz.timezone(timezone)
    return datetime.now(target_timezone)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def days_difference_from_now(tmp_creation_array: list) -> int:
    """Return the whole days between the earliest timestamp and now.

    Args:
        tmp_creation_array (list): Epoch timestamps in seconds; must be
            non-empty, since ``min`` is taken over it.

    Returns:
        int: Number of whole days elapsed, using naive local time.
    """
    earliest = min(tmp_creation_array)
    now = datetime.now()
    return (now - datetime.fromtimestamp(earliest)).days
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def is_date_exceed_limit(max_days_ago, days_limit: int = 61):
    """Return True when ``max_days_ago`` is strictly greater than ``days_limit``."""
    return bool(max_days_ago > days_limit)
|
|
185
|
+
|
|
186
|
+
def pause(pause_time: int = 1):
    """Block the current thread for ``pause_time`` seconds."""
    seconds = pause_time
    time.sleep(seconds)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def get_payload(doc_id_in: str, id_in: str, before_time: str = None):
    """Build the form payload for the first timeline request.

    Args:
        doc_id_in: Value sent as ``doc_id``.
        id_in: Value sent as the ``id`` variable.
        before_time: Value sent as the ``beforeTime`` variable, or None.

    Returns:
        dict: ``{"variables": <JSON string>, "doc_id": doc_id_in}``; the
        ``cursor`` variable is always null for the first request.
    """
    # Relay provider switches, all sent as false; the full key is
    # "__relay_internal__pv__<name>relayprovider".
    relay_flags = (
        "CometImmersivePhotoCanUserDisable3DMotion",
        "IsWorkUser",
        "IsMergQAPolls",
        "CometUFIReactionsEnableShortName",
        "CometUFIShareActionMigration",
        "StoriesArmadilloReplyEnabled",
        "StoriesTrayShouldShowMetadata",
        "StoriesRing",
        "EventCometCardImage_prefetchEventImage",
    )
    variables = dict(
        afterTime=None,
        beforeTime=before_time,
        count=3,
        cursor=None,
        feedLocation="TIMELINE",
        feedbackSource=0,
        focusCommentID=None,
        memorializedSplitTimeFilter=None,
        omitPinnedPost=True,
        postedBy={"group": "OWNER"},
        privacy={"exclusivity": "INCLUSIVE", "filter": "ALL"},
        privacySelectorRenderLocation="COMET_STREAM",
        renderLocation="timeline",
        scale=3,
        stream_count=1,
        taggedInOnly=False,
        useDefaultActor=False,
        id=id_in,
    )
    for flag in relay_flags:
        variables[f"__relay_internal__pv__{flag}relayprovider"] = False

    return {"variables": json.dumps(variables), "doc_id": doc_id_in}
|
|
226
|
+
|
|
227
|
+
def get_next_payload(
    doc_id_in: str,
    id_in: str,
    before_time: str,
    cursor_in: str
):
    """Build the form payload for a follow-up (paginated) timeline request.

    Args:
        doc_id_in: Value sent as ``doc_id``.
        id_in: Value sent as the ``id`` variable.
        before_time: Value sent as the ``beforeTime`` variable.
        cursor_in: Pagination cursor sent as the ``cursor`` variable.

    Returns:
        dict: ``{"variables": <JSON string>, "doc_id": doc_id_in}``.
    """
    request_variables = {
        "afterTime": None,
        "beforeTime": before_time,
        "count": 3,
        "cursor": cursor_in,
        "feedLocation": "TIMELINE",
        "feedbackSource": 0,
        "focusCommentID": None,
        "memorializedSplitTimeFilter": None,
        "omitPinnedPost": True,
        "postedBy": {"group": "OWNER"},
        "privacy": {"exclusivity": "INCLUSIVE", "filter": "ALL"},
        "privacySelectorRenderLocation": "COMET_STREAM",
        "renderLocation": "timeline",
        "scale": 3,
        "stream_count": 1,
        "taggedInOnly": False,
        "useDefaultActor": False,
        "id": id_in,
    }
    # Relay provider switches are appended last, all set to false.
    request_variables.update(dict.fromkeys(
        (
            "__relay_internal__pv__CometImmersivePhotoCanUserDisable3DMotionrelayprovider",
            "__relay_internal__pv__IsWorkUserrelayprovider",
            "__relay_internal__pv__IsMergQAPollsrelayprovider",
            "__relay_internal__pv__CometUFIReactionsEnableShortNamerelayprovider",
            "__relay_internal__pv__CometUFIShareActionMigrationrelayprovider",
            "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider",
            "__relay_internal__pv__StoriesTrayShouldShowMetadatarelayprovider",
            "__relay_internal__pv__StoriesRingrelayprovider",
            "__relay_internal__pv__EventCometCardImage_prefetchEventImagerelayprovider",
        ),
        False,
    ))
    encoded_variables = json.dumps(request_variables)
    return {"variables": encoded_variables, "doc_id": doc_id_in}
|
|
267
|
+
|
|
268
|
+
def get_next_cursor(body_content_in):
    """Return ``data.page_info.end_cursor`` from the last line that has it.

    Lines are examined from last to first. Lines that are blank, are not
    valid JSON, or lack a ``data.page_info`` mapping are skipped.

    Args:
        body_content_in: Sequence of response body lines (JSON strings).

    Returns:
        The ``end_cursor`` value of the last line with a ``page_info``
        mapping (None if that mapping has no ``end_cursor``), or None when
        no line qualifies.
    """
    for raw_line in reversed(body_content_in):
        try:
            json_tail = json.loads(raw_line)
            return json_tail.get("data").get("page_info").get("end_cursor")
        except (AttributeError, TypeError, ValueError):
            # ValueError covers JSONDecodeError for blank/invalid lines;
            # AttributeError covers missing "data"/"page_info" levels.
            continue
    return None
|
|
277
|
+
|
|
278
|
+
def get_next_page_status(body_content):
    """Return ``data.page_info.has_next_page`` from the first line that has it.

    Lines that are blank, are not valid JSON, or lack a ``data.page_info``
    mapping are skipped.

    Args:
        body_content: Iterable of response body lines (JSON strings).

    Returns:
        The ``has_next_page`` value of the first line with a ``page_info``
        mapping. When no line carries the information, True is returned so
        the caller keeps paginating.
    """
    for each_body in body_content:
        try:
            tmp_json = json.loads(each_body)
            return tmp_json.get("data").get("page_info").get("has_next_page")
        except (AttributeError, TypeError, ValueError):
            # Unparseable line or no "data.page_info"; try the next one.
            continue
    # Sometimes the API response carries no "has_next_page" info; assume
    # there is a next page in that case.
    return True
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def compare_timestamp(timestamp: int, days_limit: int, display_progress: bool) -> bool:
    """Tell whether ``timestamp`` is older than ``days_limit`` days (UTC dates).

    Both the timestamp and the current time are converted to UTC calendar
    dates with timezone-aware calls; the naive ``utcfromtimestamp`` and
    ``utcnow`` helpers are deprecated.

    Args:
        timestamp: Epoch timestamp in seconds.
        days_limit: Number of days back from today that is still in range.
        display_progress: When True, print how many days remain to collect.

    Returns:
        bool: True when the timestamp's date is before today minus
        ``days_limit`` days.
    """
    from datetime import timezone

    timestamp_date = datetime.fromtimestamp(timestamp, tz=timezone.utc).date()
    current_date = datetime.now(timezone.utc).date()
    past_date = current_date - timedelta(days=days_limit)
    if display_progress:
        days_remaining = (timestamp_date - past_date).days
        if days_remaining > 0:
            print(f"{days_remaining} more days of posts to collect.")
        else:
            print("Target days reached or exceeded.")
    return timestamp_date < past_date
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def get_before_time(time_zone='Asia/Taipei'):
    """Return the current epoch time, in whole seconds, as a string.

    Args:
        time_zone: Time zone name passed to ``pytz.timezone`` to build the
            aware "now" value the timestamp is taken from.
    """
    zone = pytz.timezone(time_zone)
    epoch_seconds = int(datetime.now(zone).timestamp())
    return str(epoch_seconds)
|
|
308
|
+
|
|
309
|
+
def get_posts_image(post_id: str, page_name: str = "toolbox003", request_timeout: float = 10):
    """Return the ``https://scontent...`` image URLs of a post's embed page.

    The post is fetched through Facebook's ``plugins/post.php`` embed
    endpoint; opening the built URL in a browser shows the page being parsed.

    Args:
        post_id: Id of the post, interpolated into the embed URL.
        page_name: Page segment of the post URL; defaults to the value that
            used to be hard-coded.
        request_timeout: Seconds to wait for the HTTP response, so a stalled
            connection cannot hang the caller forever.

    Returns:
        list: ``src`` attribute of every tag whose ``src`` starts with
        ``https://scontent``.
    """
    url = f"https://www.facebook.com/plugins/post.php?href=https%3A%2F%2Fwww.facebook.com%2F{page_name}%2Fposts%2F{post_id}&show_text=true&width=800"
    response = requests.get(url=url, timeout=request_timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    pattern = re.compile(r"^https://scontent")
    all_src_links = [tag['src'] for tag in soup.find_all(src=pattern)]
    return all_src_links
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: fb_scraper_request
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: Facebook GraphQL Scraper - No login required, simple API to scrape public Facebook posts
|
|
5
5
|
Author-email: Nguyen Minh Quang <quangforwork1203@gmail.com>
|
|
6
6
|
License: MIT
|
|
@@ -24,6 +24,7 @@ License-File: LICENSE
|
|
|
24
24
|
Requires-Dist: requests>=2.28.0
|
|
25
25
|
Requires-Dist: pytz
|
|
26
26
|
Requires-Dist: pip>=26.0.1
|
|
27
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
27
28
|
Provides-Extra: dev
|
|
28
29
|
Requires-Dist: build>=0.8.0; extra == "dev"
|
|
29
30
|
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
fb_scraper_request/__init__.py
|
|
5
|
+
fb_scraper_request/example.py
|
|
6
|
+
fb_scraper_request/facebook_graphql_scraper.py
|
|
7
|
+
fb_scraper_request.egg-info/PKG-INFO
|
|
8
|
+
fb_scraper_request.egg-info/SOURCES.txt
|
|
9
|
+
fb_scraper_request.egg-info/dependency_links.txt
|
|
10
|
+
fb_scraper_request.egg-info/requires.txt
|
|
11
|
+
fb_scraper_request.egg-info/top_level.txt
|
|
12
|
+
fb_scraper_request/base/__init__.py
|
|
13
|
+
fb_scraper_request/base/base_page.py
|
|
14
|
+
fb_scraper_request/pages/__init__.py
|
|
15
|
+
fb_scraper_request/pages/page_optional.py
|
|
16
|
+
fb_scraper_request/tests/__init__.py
|
|
17
|
+
fb_scraper_request/utils/__init__.py
|
|
18
|
+
fb_scraper_request/utils/parser.py
|
|
19
|
+
fb_scraper_request/utils/utils.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "fb_scraper_request"
|
|
3
|
-
version = "0.2.3"
|
|
3
|
+
version = "0.2.5"
|
|
4
4
|
description = "Facebook GraphQL Scraper - No login required, simple API to scrape public Facebook posts"
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
requires-python = ">=3.9"
|
|
@@ -25,6 +25,7 @@ dependencies = [
|
|
|
25
25
|
"requests>=2.28.0",
|
|
26
26
|
"pytz",
|
|
27
27
|
"pip>=26.0.1",
|
|
28
|
+
"beautifulsoup4>=4.12.0",
|
|
28
29
|
]
|
|
29
30
|
|
|
30
31
|
[project.optional-dependencies]
|
|
@@ -42,5 +43,6 @@ Issues = "https://github.com/DOCUTEE/fb_crawl_request/issues"
|
|
|
42
43
|
requires = ["setuptools>=45", "wheel"]
|
|
43
44
|
build-backend = "setuptools.build_meta"
|
|
44
45
|
|
|
45
|
-
[tool.setuptools]
|
|
46
|
-
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
where = ["."]
|
|
48
|
+
include = ["fb_scraper_request*"]
|
|
@@ -1,11 +0,0 @@
|
|
|
1
|
-
LICENSE
|
|
2
|
-
README.md
|
|
3
|
-
pyproject.toml
|
|
4
|
-
fb_scraper_request/__init__.py
|
|
5
|
-
fb_scraper_request/example.py
|
|
6
|
-
fb_scraper_request/facebook_graphql_scraper.py
|
|
7
|
-
fb_scraper_request.egg-info/PKG-INFO
|
|
8
|
-
fb_scraper_request.egg-info/SOURCES.txt
|
|
9
|
-
fb_scraper_request.egg-info/dependency_links.txt
|
|
10
|
-
fb_scraper_request.egg-info/requires.txt
|
|
11
|
-
fb_scraper_request.egg-info/top_level.txt
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{fb_scraper_request-0.2.3 → fb_scraper_request-0.2.5}/fb_scraper_request.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|