fbcrawl 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fbcrawl-0.0.1/LICENSE +21 -0
- fbcrawl-0.0.1/PKG-INFO +14 -0
- fbcrawl-0.0.1/README.md +0 -0
- fbcrawl-0.0.1/pyproject.toml +23 -0
- fbcrawl-0.0.1/setup.cfg +4 -0
- fbcrawl-0.0.1/src/fbcrawl/__init__.py +0 -0
- fbcrawl-0.0.1/src/fbcrawl/auth.py +65 -0
- fbcrawl-0.0.1/src/fbcrawl/drivers.py +43 -0
- fbcrawl-0.0.1/src/fbcrawl/file_ops.py +12 -0
- fbcrawl-0.0.1/src/fbcrawl/request_parsers/__init__.py +0 -0
- fbcrawl-0.0.1/src/fbcrawl/request_parsers/comments.py +203 -0
- fbcrawl-0.0.1/src/fbcrawl/request_parsers/posts.py +203 -0
- fbcrawl-0.0.1/src/fbcrawl/request_parsers/requests_decoder.py +81 -0
- fbcrawl-0.0.1/src/fbcrawl/scraper.py +55 -0
- fbcrawl-0.0.1/src/fbcrawl/scrapers/post.py +78 -0
- fbcrawl-0.0.1/src/fbcrawl.egg-info/PKG-INFO +14 -0
- fbcrawl-0.0.1/src/fbcrawl.egg-info/SOURCES.txt +17 -0
- fbcrawl-0.0.1/src/fbcrawl.egg-info/dependency_links.txt +1 -0
- fbcrawl-0.0.1/src/fbcrawl.egg-info/top_level.txt +1 -0
fbcrawl-0.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Securely Innovations
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
fbcrawl-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fbcrawl
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A small example package
|
|
5
|
+
Author-email: Farhan Ahmed <jattfarhan10@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Securely-Innovations/FbScraper
|
|
8
|
+
Project-URL: Issues, https://github.com/Securely-Innovations/FbScraper/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Dynamic: license-file
|
fbcrawl-0.0.1/README.md
ADDED
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools >= 77.0.3"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "fbcrawl"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Farhan Ahmed", email="jattfarhan10@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = "A small example package"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
]
|
|
18
|
+
license = "MIT"
|
|
19
|
+
license-files = ["LICEN[CS]E*"]
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
Homepage = "https://github.com/Securely-Innovations/FbScraper"
|
|
23
|
+
Issues = "https://github.com/Securely-Innovations/FbScraper/issues"
|
fbcrawl-0.0.1/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from seleniumwire import webdriver
|
|
3
|
+
from selenium.webdriver.common.by import By
|
|
4
|
+
from selenium.common.exceptions import TimeoutException
|
|
5
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
6
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
7
|
+
|
|
8
|
+
class Authenticator:
    """Logs a selenium-wire driver into Facebook, reusing a saved session when possible."""

    def __init__(self, driver, email, password) -> None:
        self.driver = driver
        self.email = email
        self.password = password
        # Elements whose presence proves we are looking at a logged-in feed.
        self.xpaths = {"create_post_btn": "//div[@aria-label='Create a post']"}

    def authenticate(self):
        """Log in with the stored credentials unless the session is already live."""
        if self.is_logged_in():
            print("[+] Already logged in (session restored)")
            return
        print("[*] Session not authenticated, logging in...")
        self.perform_login()

    def is_logged_in(self, timeout=5):
        """
        Detect whether Facebook session is already authenticated
        """
        locator = (By.XPATH, self.xpaths['create_post_btn'])
        try:
            WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located(locator))
        except TimeoutException:
            return False
        return True

    def perform_login(self):
        """
        Login using email/password if logged out
        """
        try:
            email_box = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "email")))
            password_box = self.driver.find_element(By.ID, "pass")

            for field, value in ((email_box, self.email), (password_box, self.password)):
                field.clear()
                field.send_keys(value)

            self.driver.find_element(By.NAME, "login").click()
            # Give the operator a chance to clear 2FA / checkpoint screens by hand.
            input("Press Enter after completing any additional verification steps...")
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.XPATH, self.xpaths['create_post_btn'])))
            print("[+] Logged in successfully")
        except TimeoutException:
            raise RuntimeError("[-] Login failed or Facebook UI changed")
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from seleniumwire import webdriver
|
|
3
|
+
from selenium.webdriver.common.by import By
|
|
4
|
+
from selenium.common.exceptions import TimeoutException
|
|
5
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
6
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Init:
    """Factory for selenium-wire Chrome driver instances with a persistent profile."""

    def get_driver(self, profile_path: str = "", proxy: dict | None = None) -> webdriver.Chrome:
        """Initializes Selenium-wire driver instance

        Args:
            profile_path (str): Path to Store Chrome Profile, helps save auth session. By Default it saves a profile in current directory
            proxy (dict | None): a proxy dict with following structure:
                "proxy": {
                    "http": "http://username:password@host:port",
                    "https": "https://username:password@host:port",
                    "no_proxy": "localhost,127.0.0.1"
                }

        Returns:
            webdriver.Chrome: Selenium-wire driver Object
        """
        # BUG FIX: the default used to be a shared mutable dict (`proxy: dict = {}`);
        # None is the safe sentinel and preserves the old falsy behaviour.
        if proxy is None:
            proxy = {}
        if not profile_path:
            profile_path = os.path.join(os.getcwd(), "chrome_profile")

        os.makedirs(profile_path, exist_ok=True)

        options = webdriver.ChromeOptions()
        options.add_argument(f"--user-data-dir={profile_path}")
        options.add_argument("--profile-directory=Default")
        options.add_argument("--disable-notifications")
        # options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36")
        if proxy:
            # BUG FIX: the original passed `seleniumwire_options=options` (the
            # ChromeOptions object), so the proxy dict was silently ignored.
            seleniumwire_options = {'proxy': proxy}
            driver = webdriver.Chrome(options=options,
                                      seleniumwire_options=seleniumwire_options)
        else:
            driver = webdriver.Chrome(options=options)
        return driver
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
|
|
4
|
+
def load_json(json_path='posts.json'):
    """Load previously scraped data from *json_path*.

    Returns an empty dict when the file does not exist, so a fresh run and
    a resumed run can be handled uniformly by callers.
    """
    # EAFP: trying the open directly avoids the exists()/open() race of the
    # original `os.path.exists` check.
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
|
|
9
|
+
|
|
10
|
+
def save_json(file_name, data):
    """Serialize *data* to *file_name* as indented, non-ASCII-escaped JSON."""
    payload = json.dumps(data, indent=4, ensure_ascii=False)
    with open(file_name, 'w', encoding='utf-8') as handle:
        handle.write(payload)
|
|
File without changes
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
|
|
2
|
+
import json
|
|
3
|
+
from request_parsers.requests_decoder import get_response
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class CommentsParser:
    """Parses Facebook GraphQL comment/reply responses captured by selenium-wire.

    Results accumulate into two flat id-keyed dicts: ``self.comments``
    (top-level comments) and ``self.replies`` (replies at any depth).
    """

    def __init__(self, driver):
        self.replies = {}   # reply_id -> extracted reply data
        self.comments = {}  # comment_id -> extracted comment data
        self.driver = driver
        # Eagerly drain whatever comment/reply responses the driver captured.
        self.fetch_comments_api()

    def fetch_comments(self):
        """Return the accumulated ``(comments, replies)`` dictionaries."""
        return self.comments, self.replies

    def fetch_comments_api(self):
        """Consume captured GraphQL responses and index comments/replies by id."""
        for payload in get_response(self.driver, _type=['comments', 'replies']):
            if payload.get('comments'):
                total = f"{len(payload['comments'])} Comments"
                self.comments |= self.parse_comments(payload['comments'])
            else:  # Comment Replies
                total = f"{len(payload['replies'])} Replies"
                self.replies |= self.parse_replies(payload['replies'])
            print(f"Fetched {total}")

    def parse_comments(self, edges):
        """Extract every comment edge in *edges* into an id-keyed dict."""
        comments = {}
        for edge in edges:
            comments |= self.extract_comment_info(edge)
        return comments

    def extract_comment_info(self, comment_edge: dict) -> dict:
        """
        Extracts important fields from a Facebook GraphQL comment edge.

        Returns a single-entry dict mapping the comment id to its data.
        Dumps the raw edge to a debug file and raises ValueError when the
        post id or comment id cannot be located.
        """
        node = comment_edge.get("node", {})
        # The API frequently returns explicit nulls; coerce them to empty dicts
        # (the original only guarded `feedback`, crashing on other null fields).
        feedback = node.get("feedback") or {}
        author = node.get("author") or {}
        body = node.get("body", {})
        parent_feedback = node.get("parent_feedback") or {}
        if not parent_feedback:
            parent_feedback = node.get("comet_comment_author_name_and_badges_renderer", {}).get('comment', {}).get("parent_feedback", {})

        attachments = []
        for tmp_attachment in node.get("attachments") or []:  # guard against null
            tmp = tmp_attachment.get('style_type_renderer', {}).get('attachment', {})
            if tmp is None:
                continue
            media = tmp.get('media')
            attachments.append({
                "type": media if media is None else media.get("__typename"),
                "url": tmp.get("url"),
                "photo_img": tmp.get('photo_image', {}).get('uri'),
            })

        # Reaction breakdown keyed by reaction id.
        reactions = {}
        for r in (feedback.get("top_reactions") or {}).get("edges", []):
            reactions[r["node"].get("id")] = r.get("reaction_count", 0)

        post_id = parent_feedback.get("share_fbid")
        if post_id is None:
            with open("debug_missing_post_id.json", "w", encoding="utf-8") as f:
                json.dump(comment_edge, f, indent=4)
            raise ValueError("Post ID not found in parent feedback.")
        comment_id = node.get("id")
        if comment_id is None:
            with open("debug_missing_comment_id.json", "w", encoding="utf-8") as f:
                json.dump(comment_edge, f, indent=4)
            raise ValueError("Comment ID not found in comment node.")

        extracted = {
            # Comment identifiers
            "id": comment_id,

            # Author info
            "author_id": author.get("id"),
            "author_name": author.get("name"),
            "author_profile_url": author.get("url"),
            "author_gender": author.get("gender"),

            # Content
            "text": None if body is None else body.get("text"),
            "language": (node.get("translatability_for_viewer") or {}).get("source_dialect"),

            # Timing
            "created_time": node.get("created_time"),

            # Engagement
            "reaction_total": (feedback.get("reactors") or {}).get("count"),
            "reaction_breakdown": reactions,
            "reply_count": (feedback.get("replies_fields") or {}).get("total_count"),

            # Relationships
            "post_id": post_id,
            "post_author_id": (parent_feedback.get("owning_profile") or {}).get("id"),

            # URLs
            "comment_url": feedback.get("url"),
            # NOTE: key spelling ("attachements") kept for backward compatibility.
            "attachements": attachments,
            # Replies (wired up by combine_replies_with_comments)
            "replies": {}
        }
        # Inline replies are parsed immediately into the flat reply index.
        reply_edges = (feedback.get("replies_connection") or {}).get("edges", [])
        if reply_edges:
            self.replies |= self.parse_replies(reply_edges)

        return {comment_id: extracted}

    def extract_reply_info(self, edge: dict) -> dict:
        """Extract important fields from a GraphQL reply edge (any nesting depth)."""
        node = edge.get("node", {})
        author = node.get("author") or {}
        # Null-guard feedback here too (the original only did so for comments).
        feedback = node.get("feedback") or {}
        body = node.get("body", {})

        # Reaction breakdown keyed by reaction id.
        reactions = {}
        for r in (feedback.get("top_reactions") or {}).get("edges", []):
            reactions[r["node"].get("id")] = r.get("reaction_count", 0)

        parent_comment_id = (node.get("comment_direct_parent") or {}).get("id")
        if parent_comment_id is None:
            with open("debug_missing_parent_comment_id.json", "w", encoding="utf-8") as f:
                json.dump(edge, f, indent=4)
            raise ValueError("Parent comment ID not found in reply.")
        reply_id = node.get("id")
        if reply_id is None:
            with open("debug_missing_reply_id.json", "w", encoding="utf-8") as f:
                json.dump(edge, f, indent=4, ensure_ascii=True)
            raise ValueError("Reply ID not found in reply node.")

        extracted = {
            "id": reply_id,
            "depth": node.get("depth"),

            "author_id": author.get("id"),
            "author_name": author.get("name"),
            "author_profile": author.get("url"),
            "author_gender": author.get("gender"),

            "text": None if body is None else body.get("text"),
            "language": (node.get("translatability_for_viewer") or {}).get("source_dialect"),

            "created_time": node.get("created_time"),

            "parent_comment_id": parent_comment_id,

            "reply_count": (feedback.get("replies_fields") or {}).get("count"),
            "reaction_count": (feedback.get("reactors") or {}).get("count"),
            "reaction_breakdown": reactions,

            "comment_url": feedback.get("url"),
            "replies": {}
        }
        # Nested sub-replies are flattened into self.replies as well.
        reply_edges = (feedback.get("replies_connection") or {}).get("edges", [])
        if reply_edges:
            self.replies |= self.parse_replies(reply_edges)

        return {reply_id: extracted}

    def parse_replies(self, edges: dict) -> dict:
        """Extract every reply edge in *edges* into an id-keyed dict."""
        replies = {}
        for edge in edges:
            replies |= self.extract_reply_info(edge)
        return replies

    def combine_replies_with_comments(self):
        """
        Combining replies to their respective parent comments and replies.
        Supports multi-level nesting (comment -> reply -> sub-reply).
        """
        # Unified lookup so a parent can be a top-level comment or a reply.
        all_nodes = {**self.comments, **self.replies}
        print(f"Total comments: {len(self.comments)}, Total replies: {len(self.replies)}")
        for reply_id, reply_data in self.replies.items():
            parent_id = reply_data.get('parent_comment_id')
            if parent_id and parent_id in all_nodes:
                # Nesting works through shared references, so sub-replies land
                # inside their parents in self.comments automatically.
                all_nodes[parent_id]['replies'][reply_id] = reply_data
            else:
                # The parent was not captured in the current API fetch cycle.
                print(f"Warning: Parent ID {parent_id} for reply {reply_id} not found.")
        # self.comments is updated in place (objects are shared references).
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
|
|
2
|
+
import json
|
|
3
|
+
import traceback
|
|
4
|
+
from request_parsers.requests_decoder import get_response
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class PostParser:
    """Parses Facebook GraphQL post payloads captured by selenium-wire."""

    def __init__(self, driver):
        self.posts = {}     # public post id -> extracted post data
        self.driver = driver

    def fetch_post(self) -> dict:
        """Return all posts parsed so far, keyed by their public post id."""
        return self.posts

    def fetch_post_apis(self):
        """Drain both single-post and timeline-feed responses."""
        self.fetch_post_api('post')
        self.fetch_post_api('posts')

    def fetch_post_api(self, post_type: str = 'post') -> None:
        """Fetch data from post API

        Args:
            post_type (str, optional): There are two types of post data (post, posts). Defaults to 'post'.
        """
        for post in get_response(self.driver, _type=post_type):
            data = self.extract_post_info(post['post'])
            if data is None:
                continue
            self.posts[data['old_id']] = data

    def _extract_attached_story(self, story: dict) -> dict:
        """Cleans shared/attached-post data from fb and returns it.

        Args:
            story (dict): raw "attached_story" node; may be None or empty.

        Returns:
            dict: Clean Post Info ({} when there is no attached story).
        """
        if not story:
            return {}
        context_story = story.get("comet_sections", {}).get("context_layout", {}).get("story", {})
        content_story = context_story.get("comet_sections", {}).get("content", {}).get("story", {})
        # BUG FIX: the original did `content_story = content_story` (a no-op),
        # crashing below whenever the API returned an explicit null here.
        if content_story is None:
            content_story = {}
        if content_story.get('message') is None:
            original_text = None
            translation_text = None
        else:
            original_text = content_story.get('message', {}).get('text')
            message_container = content_story.get('comet_sections', {}).get('message_container', {})
            translation_text = None if message_container is None else message_container.get('story', {}).get('translation', {}).get('message', {}).get('text')

        # Actor (author) info, with a fallback to the feedback owner.
        actor = context_story.get("comet_sections", {}).get("title", {}).get("story", {}).get("actors", [{}])[0]
        author_name = actor.get("name")
        if author_name is None:
            author_name = story.get("feedback", {}).get("owning_profile", {}).get("name")
        # Creation timestamp lives in the first metadata entry, when present.
        metadata = context_story.get("comet_sections", {}).get("metadata", [])
        timestamp = metadata[0].get("story", {}).get("creation_time") if metadata else None
        return {
            "old_id": story.get("id"),
            "id": story.get("feedback", {}).get("owning_profile", {}).get("id"),
            "author": author_name,
            "author_url": actor.get("url"),
            "original_text": original_text,
            "translation": translation_text,
            "created_time": timestamp,
            "post_link": story.get("permalink_url"),
        }

    def _extract_engagement_metrics(self, comet_sections: dict) -> dict:
        """
        Extract reactions, comments, and shares from the Facebook Comet UFI block.
        """
        # Walk the deeply nested UFI path; `or {}` tolerates explicit nulls.
        feedback_root = comet_sections
        for key in ("feedback", "story", "story_ufi_container", "story",
                    "feedback_context", "feedback_target_with_context",
                    "comet_ufi_summary_and_actions_renderer", "feedback"):
            feedback_root = feedback_root.get(key) or {}

        if not feedback_root:
            return {"error": "No UFI feedback block found"}

        # Reaction breakdown (Like, Love, Haha, etc.)
        reactions_breakdown = {}
        for edge in (feedback_root.get("top_reactions") or {}).get("edges", []):
            name = edge.get("node", {}).get("localized_name")
            count = edge.get("reaction_count")
            if name and count is not None:
                reactions_breakdown[name] = count

        return {
            "total_reactions": (feedback_root.get("reaction_count") or {}).get("count"),
            "reactions_breakdown": reactions_breakdown,
            "total_comments": (feedback_root.get("comment_rendering_instance") or {}).get("comments", {}).get("total_count"),
            "total_shares": (feedback_root.get("share_count") or {}).get("count"),
        }

    def extract_post_info(self, story):  # story = GraphQL node_v2
        """Extract a flat dict of post fields from a raw GraphQL story node.

        On failure the raw *story* is dumped to debug_extract_post_error.json
        and the original exception is re-raised.
        """
        try:
            comet_sections = story.get('comet_sections', {})
            content_story = comet_sections.get('content', {}).get('story') or {}

            # 1. IDs
            post_id = story.get('post_id')
            internal_id = story.get('id')

            # 2. Text (original) and 3. translation
            if content_story.get('message') is None:
                original_text = None
                translation_text = None
            else:
                original_text = content_story.get('message', {}).get('text')
                message_container = content_story.get('comet_sections', {}).get('message_container', {})
                translation_text = None if message_container is None else message_container.get('story', {}).get('translation', {}).get('message', {}).get('text')

            # 4. Timestamp
            timestamp = (comet_sections.get('timestamp') or {}).get('story', {}).get('creation_time')
            # 5. Post link
            post_link = content_story.get('wwwURL')

            # 6. Author info
            actor = (content_story.get('actors') or [{}])[0]
            author = actor.get('name')
            author_url = actor.get('url')

            # Attachments: fall back to the attached (shared) story's media.
            attachments = []
            tmp_attachments = story.get("attachments") or []
            if tmp_attachments == [] and content_story.get("attached_story") is not None:
                tmp_attachments = content_story.get("attached_story", {}).get("attachments", [])
            for tmp_attachment in tmp_attachments:
                tmp = tmp_attachment.get('styles', {}).get('attachment', {}).get('media', {})
                if tmp is None:
                    continue
                attachments.append({
                    "type": tmp.get("__typename"),
                    "url": tmp.get("url"),
                    "photo_img": tmp.get('photo_image', {}).get('uri'),
                })

            data = {
                "old_id": post_id,
                "id": internal_id,
                "author": author,
                "author_url": author_url,
                "original_text": original_text,
                "translation": translation_text,
                "created_time": timestamp,
                "post_link": post_link,
                "attachments": attachments,
            }
            data |= self._extract_engagement_metrics(comet_sections)
            data["attached_story"] = self._extract_attached_story(story.get("attached_story", {}))
            return data
        except Exception as e:
            print(f"Error extracting post info: {e}")
            print(traceback.format_exc())
            with open("debug_extract_post_error.json", "w", encoding="utf-8") as f:
                json.dump(story, f, indent=4, ensure_ascii=False)
            raise
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import chompjs
|
|
2
|
+
from networkx import edges
|
|
3
|
+
from seleniumwire.utils import decode
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def decode_response_body(response) -> dict | None:
    """
    Safely decode a Selenium Wire response body into a Python object.

    Handles gzip / brotli / plain text via seleniumwire's `decode`, then
    parses the (possibly JS-flavoured) JSON payload with chompjs.
    """
    # Removed the unused `body = response.body` local from the original.
    raw = decode(response.body,
                 response.headers.get('Content-Encoding', 'identity'))
    return chompjs.parse_js_object(raw.decode('utf-8'))
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_response(driver, url: str = "", _type="comments"):
    """Yields decoded responses of the desired request kinds.

    Args:
        driver (WebDriver): selenium-wire WebDriver whose captured requests are scanned.
        url (str, optional): reserved for filtering by URL (currently unused).
        _type (str | list, optional): kind(s) of content required, from
            ["post", "posts", "comments", "replies"]. Defaults to "comments".

    Yields:
        dict: {'post': ...}, {'comments': ...} or {'replies': ...} payloads.

    Raises:
        Exception: when *_type* contains a value outside the allowed set.
    """
    # BUG FIX: the original allow-list was ['post', 'posts', 'comment'], which
    # rejected the default "comments" AND the list ['comments', 'replies']
    # that CommentsParser passes. Accept a single kind or a list of kinds.
    allowed_types = {'post', 'posts', 'comments', 'replies'}
    requested = {_type} if isinstance(_type, str) else set(_type)
    if requested - allowed_types:
        raise Exception(f"Invalid _type value {_type}. Allowed types are {', '.join(sorted(allowed_types))}")

    for request in driver.requests:
        # BUG FIX: the original used `and`, which skipped almost nothing; a
        # request must be skipped when it has no response OR is not GraphQL.
        if not request.response or "facebook.com/api/graphql" not in request.url:
            continue
        try:
            response_json = decode_response_body(request.response)
            if response_json is None:
                continue
        except Exception as e:
            print("⚠️ Failed parsing JSON:", e)
            continue

        if 'post' in requested:
            story = response_json.get('data', {}).get('node_v2', {})
            if story:
                print("Post data found in the response JSON.")
                yield {'post': story}
            else:
                print("⚠️ post data not found in the response JSON.")

        if 'posts' in requested:
            edges = response_json.get("data", {}).get("node", {}).get("timeline_list_feed_units", {}).get("edges", [])
            for edge in edges:
                print("Post edges found in the response JSON.")
                yield {'post': edge.get('node', {})}

        if requested & {'comments', 'replies'}:
            comments = (
                response_json.get('data', {})
                .get('node', {})
                .get('comment_rendering_instance_for_feed_location', {})
                .get('comments', {})
                .get('edges', [])
            )
            replies = (
                response_json.get("data", {})
                .get("node", {})
                .get("replies_connection", {})
                .get("edges", [])
            )

            if not replies and not comments:
                print("⚠️ Comments data not found in the response JSON.")

            if comments:
                print("Comments data found in the response JSON.")
                yield {"comments": comments}
            if replies:
                print("Replies data found in the response JSON.")
                yield {"replies": replies}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
from drivers import Init
|
|
2
|
+
from auth import Authenticator
|
|
3
|
+
from scrapers.post import PostScraper
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class FacebookScraper:
|
|
7
|
+
def __init__(self, email: str="", password: str="", profile_path: str="", output_path: str="posts.json", proxy: dict={}):
|
|
8
|
+
""" Initialize FacebookScraper Object and initialize the driver object.
|
|
9
|
+
|
|
10
|
+
Args:
|
|
11
|
+
email (str): Facebook Email
|
|
12
|
+
password (str): Account credentials
|
|
13
|
+
profile_path (str): Path to Store Chrome Profile, helps save auth session. By Default it saves a profile in current directory
|
|
14
|
+
|
|
15
|
+
proxy (dict): The proxy should be in this format:
|
|
16
|
+
{
|
|
17
|
+
"http": "http://username:password@host:port",
|
|
18
|
+
"https": "https://username:password@host:port",
|
|
19
|
+
"no_proxy": "localhost,127.0.0.1"
|
|
20
|
+
}
|
|
21
|
+
output_path (str): path to the json file that will contain the post data
|
|
22
|
+
"""
|
|
23
|
+
self.output_path = output_path
|
|
24
|
+
self.driver = Init().get_driver(profile_path, proxy)
|
|
25
|
+
if email and password:
|
|
26
|
+
Authenticator(self.driver, email, password).authenticate()
|
|
27
|
+
|
|
28
|
+
def authenticate(self, email, password):
|
|
29
|
+
Authenticator(self.driver, email, password).authenticate()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_profile(self, profile_url: str):
|
|
33
|
+
"""Gets Profile data using profile url
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
profile_url (str): Facebook Profile URL
|
|
37
|
+
"""
|
|
38
|
+
pass
|
|
39
|
+
|
|
40
|
+
def get_post(self, post_url: str) -> None:
|
|
41
|
+
"""Get Post by Post URL
|
|
42
|
+
|
|
43
|
+
Args:
|
|
44
|
+
post_url (str): url to facebook post
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
def get_posts(self, profile_url: str, count: int=0):
|
|
48
|
+
"""Scrape all posts of a profile
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
profile_url (str): Facebook Profile URL
|
|
52
|
+
count (int): Post count needed to be extracted, will get all posts if value is 0. Defaults to 0
|
|
53
|
+
"""
|
|
54
|
+
scraper = PostScraper(self.driver, profile_url, self.output_path, count)
|
|
55
|
+
scraper.parse_feed()
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import argparse
|
|
4
|
+
import traceback ### Don't remove this: we need it to fetch complete tracebacks on exceptions.
|
|
5
|
+
from peewee import SqliteDatabase, Model, BigIntegerField, CharField, TextField, SmallIntegerField, BooleanField, IntegrityError, fn
|
|
6
|
+
from time import sleep
|
|
7
|
+
from initializer import Init
|
|
8
|
+
from seleniumwire import webdriver
|
|
9
|
+
from selenium.webdriver.common.by import By
|
|
10
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
11
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
12
|
+
from time import sleep
|
|
13
|
+
from selenium.webdriver.common.by import By
|
|
14
|
+
from request_parsers.posts import PostParser
|
|
15
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
16
|
+
from selenium.webdriver.common.action_chains import ActionChains
|
|
17
|
+
from selenium.webdriver.support import expected_conditions as EC
|
|
18
|
+
from selenium.common.exceptions import (
|
|
19
|
+
WebDriverException, TimeoutException,
|
|
20
|
+
StaleElementReferenceException, ElementClickInterceptedException
|
|
21
|
+
)
|
|
22
|
+
from file_ops import save_json, load_json
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PostScraper:
    """Scrolls a Facebook profile feed and persists scraped post data to JSON."""

    # ---------------------------------------- Initializer ---------------------------------------- #
    def __init__(self, driver, profile_url, output_path, count) -> None:
        """Open the profile page and prepare the accumulated-post store.

        Args:
            driver: An (optionally authenticated) selenium-wire webdriver.
            profile_url (str): URL of the Facebook profile whose feed is scraped.
            output_path (str): Path of the JSON file post data is written to.
            count (int): Number of posts to extract; 0 means "all".
        """
        self.count = count
        self.driver = driver
        self.profile_url = profile_url
        self.output_path = output_path

        # Resume from previously saved data so re-runs merge rather than overwrite.
        # NOTE(review): load_json() is called without a path while save_json()
        # receives one — confirm load_json defaults to the same location.
        # Fixed: this was stored as `self.post_data` but read/updated as
        # `self.posts_data` elsewhere, raising AttributeError on first save.
        self.posts_data = load_json()

        self.open_profile()

    def open_profile(self):
        """Navigate to the profile URL and wait for the feed to render."""
        self.driver.get(self.profile_url)
        self.wait_for_feed_load()

    def wait_for_feed_load(self):
        """Best-effort wait (up to 10s) for the feed container to appear.

        A timeout is tolerated: the page may still be usable without the
        'Other posts' marker, so scraping proceeds regardless.
        """
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((
                By.XPATH, "//div[@role='feed' and contains(., 'Other posts')]")))
        except TimeoutException:
            # Feed marker never showed up; continue and let parse_feed decide.
            pass

    # --------------------------------------- Post Interactors --------------------------------------- #
    def parse_feed(self):
        """Scroll to the bottom of the feed, saving post data after each scroll.

        Stops when a scroll no longer changes the scroll position, i.e. the
        end of the feed has been reached.
        """
        while True:
            # Ensure at least one post container is present before scrolling.
            WebDriverWait(self.driver, 20).until(EC.presence_of_all_elements_located((
                By.XPATH, "//div[@class='x1a2a7pz']")))
            previous_top = self.driver.execute_script('return document.documentElement.scrollTop')
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            print('Scrolling down')
            sleep(3)  # give lazy-loaded posts time to render
            self.save_posts_data()
            if previous_top == self.driver.execute_script('return document.documentElement.scrollTop'):
                print('End of page')
                break

    # ------------------------------------- Parse and Save data ------------------------------------- #
    def get_post_data(self):
        """Decode the intercepted post requests into a dict of post data."""
        parser = PostParser(self.driver)
        parser.fetch_post_api('posts')
        return parser.fetch_post()

    def save_posts_data(self):
        """Merge newly parsed posts into the accumulated data and write it out.

        Extracts date, name and text from posts and merges with previously
        saved data when it exists.
        """
        data = self.get_post_data()
        if not data:  # was `data == {}`: also guard against None
            return

        self.posts_data |= data
        save_json(self.output_path, self.posts_data)
        # was len(self.post_data): attribute never existed under that name here
        print(f"Saved {len(self.posts_data)} post")
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: fbcrawl
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: A Facebook profile and post crawler built on Selenium
|
|
5
|
+
Author-email: Farhan Ahmed <jattfarhan10@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Securely-Innovations/FbScraper
|
|
8
|
+
Project-URL: Issues, https://github.com/Securely-Innovations/FbScraper/issues
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/fbcrawl/__init__.py
|
|
5
|
+
src/fbcrawl/auth.py
|
|
6
|
+
src/fbcrawl/drivers.py
|
|
7
|
+
src/fbcrawl/file_ops.py
|
|
8
|
+
src/fbcrawl/scraper.py
|
|
9
|
+
src/fbcrawl.egg-info/PKG-INFO
|
|
10
|
+
src/fbcrawl.egg-info/SOURCES.txt
|
|
11
|
+
src/fbcrawl.egg-info/dependency_links.txt
|
|
12
|
+
src/fbcrawl.egg-info/top_level.txt
|
|
13
|
+
src/fbcrawl/request_parsers/__init__.py
|
|
14
|
+
src/fbcrawl/request_parsers/comments.py
|
|
15
|
+
src/fbcrawl/request_parsers/posts.py
|
|
16
|
+
src/fbcrawl/request_parsers/requests_decoder.py
|
|
17
|
+
src/fbcrawl/scrapers/post.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
fbcrawl
|