fbcrawl 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fbcrawl-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Securely Innovations
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
fbcrawl-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: fbcrawl
3
+ Version: 0.0.1
4
+ Summary: A small example package
5
+ Author-email: Farhan Ahmed <jattfarhan10@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Securely-Innovations/FbScraper
8
+ Project-URL: Issues, https://github.com/Securely-Innovations/FbScraper/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Dynamic: license-file
File without changes
@@ -0,0 +1,23 @@
1
+ [build-system]
2
+ requires = ["setuptools >= 77.0.3"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "fbcrawl"
7
+ version = "0.0.1"
8
+ authors = [
9
+ { name="Farhan Ahmed", email="jattfarhan10@gmail.com" },
10
+ ]
11
+ description = "A small example package"
12
+ readme = "README.md"
13
+ requires-python = ">=3.10"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "Operating System :: OS Independent",
17
+ ]
18
+ license = "MIT"
19
+ license-files = ["LICEN[CS]E*"]
20
+
21
+ [project.urls]
22
+ Homepage = "https://github.com/Securely-Innovations/FbScraper"
23
+ Issues = "https://github.com/Securely-Innovations/FbScraper/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,65 @@
1
+ import os
2
+ from seleniumwire import webdriver
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.common.exceptions import TimeoutException
5
+ from selenium.webdriver.support.ui import WebDriverWait
6
+ from selenium.webdriver.support import expected_conditions as EC
7
+
8
class Authenticator:
    """Handles Facebook login for a Selenium(-wire) driver session.

    Detects an already-authenticated session (e.g. restored from a persisted
    Chrome profile) and only performs the email/password login flow when
    necessary.
    """

    def __init__(self, driver, email: str, password: str) -> None:
        """
        Args:
            driver: Selenium(-wire) WebDriver already pointed at facebook.com.
            email: Facebook account email.
            password: Facebook account password.
        """
        self.email = email
        self.driver = driver
        self.password = password

        # UI landmarks. The "Create a post" button is only rendered for
        # logged-in users, so it doubles as a login probe.
        self.xpaths = {
            "create_post_btn": "//div[@aria-label='Create a post']"
        }

    def authenticate(self) -> None:
        """Log in only if the current session is not already authenticated."""
        if self.is_logged_in():
            print("[+] Already logged in (session restored)")
            return
        print("[*] Session not authenticated, logging in...")
        self.perform_login()

    def is_logged_in(self, timeout: int = 5) -> bool:
        """
        Detect whether Facebook session is already authenticated.

        Args:
            timeout: Seconds to wait for the logged-in UI to appear.

        Returns:
            bool: True if the logged-in UI appears within ``timeout`` seconds.
        """
        try:
            WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, self.xpaths['create_post_btn']))
            )
            return True
        except TimeoutException:
            return False

    def perform_login(self) -> None:
        """
        Login using email/password if logged out.

        Raises:
            RuntimeError: If the login form or the logged-in UI never appears
                (credentials rejected, or Facebook changed its markup).
        """
        try:
            email_field = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "email"))
            )
            password_field = self.driver.find_element(By.ID, "pass")

            email_field.clear()
            email_field.send_keys(self.email)
            password_field.clear()
            password_field.send_keys(self.password)

            login_button = self.driver.find_element(By.NAME, "login")
            login_button.click()
            # Manual checkpoint: lets a human solve 2FA/captcha before we
            # start waiting for the logged-in UI.
            input("Press Enter after completing any additional verification steps...")
            WebDriverWait(self.driver, 15).until(
                EC.presence_of_element_located((By.XPATH, self.xpaths['create_post_btn']))
            )

            print("[+] Logged in successfully")

        except TimeoutException as err:
            # Chain the original timeout so the real cause stays visible.
            raise RuntimeError("[-] Login failed or Facebook UI changed") from err
@@ -0,0 +1,43 @@
1
+ import os
2
+ from seleniumwire import webdriver
3
+ from selenium.webdriver.common.by import By
4
+ from selenium.common.exceptions import TimeoutException
5
+ from selenium.webdriver.support.ui import WebDriverWait
6
+ from selenium.webdriver.support import expected_conditions as EC
7
+
8
+
9
class Init:
    def get_driver(self, profile_path: str = "", proxy: dict | None = None) -> webdriver.Chrome:
        """Initializes a Selenium-wire driver instance.

        Args:
            profile_path (str): Path to store the Chrome profile, which helps
                persist the auth session. Defaults to ``./chrome_profile``.
            proxy (dict | None): a proxy dict with the following structure::

                {
                    "http": "http://username:password@host:port",
                    "https": "https://username:password@host:port",
                    "no_proxy": "localhost,127.0.0.1"
                }

        Returns:
            webdriver.Chrome: Selenium-wire driver object.
        """
        # Default is None (not a shared mutable {} literal) so call sites
        # cannot accidentally share/mutate one dict across calls.
        if not profile_path:
            profile_path = os.path.join(os.getcwd(), "chrome_profile")

        os.makedirs(profile_path, exist_ok=True)

        options = webdriver.ChromeOptions()
        options.add_argument(f"--user-data-dir={profile_path}")
        options.add_argument("--profile-directory=Default")
        options.add_argument("--disable-notifications")
        # options.add_argument("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/144.0.0.0 Safari/537.36")

        if proxy:
            # BUG FIX: the proxy config must be passed via seleniumwire_options;
            # previously the ChromeOptions object was passed here by mistake,
            # so the proxy was silently ignored (or rejected).
            seleniumwire_options = {
                'proxy': proxy
            }
            driver = webdriver.Chrome(options=options, seleniumwire_options=seleniumwire_options)
        else:
            driver = webdriver.Chrome(options=options)
        return driver
@@ -0,0 +1,12 @@
1
+ import os
2
+ import json
3
+
4
def load_json(json_path='posts.json'):
    """Load a JSON file, returning {} when the file does not exist.

    Uses EAFP (open and catch FileNotFoundError) instead of an ``exists()``
    pre-check, so there is no race window between checking and opening.

    Args:
        json_path (str): Path to the JSON file. Defaults to 'posts.json'.

    Returns:
        The parsed JSON content, or {} when the file is missing.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
9
+
10
def save_json(file_name, data):
    """Serialize *data* to *file_name* as pretty-printed UTF-8 JSON."""
    with open(file_name, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)
File without changes
@@ -0,0 +1,203 @@
1
+
2
+ import json
3
+ from request_parsers.requests_decoder import get_response
4
+
5
+
6
class CommentsParser:
    """Collects and normalizes Facebook GraphQL comments and replies.

    Instantiating the parser immediately consumes the driver's captured
    GraphQL responses (via fetch_comments_api) and fills two flat lookups:

      - ``self.comments``: {comment_id: extracted comment dict}
      - ``self.replies``:  {reply_id: extracted reply dict}

    ``combine_replies_with_comments`` can later nest replies under their
    parents (supports multi-level nesting).
    """

    def __init__(self, driver):
        # driver: selenium-wire WebDriver whose .requests hold the captured
        # GraphQL traffic to decode.
        self.replies = {}
        self.comments = {}
        self.driver = driver
        self.fetch_comments_api()

    def fetch_comments(self):
        """Return the (comments, replies) dicts accumulated so far."""
        # Earlier version also nested replies and resolved the owning post id:
        # self.combine_replies_with_comments()
        # post_id = None
        # for comment_data in self.comments.values():
        #     post_id = comment_data.get('post_id')
        #     if post_id:
        #         break
        # if post_id is None:
        #     return {}
        return self.comments, self.replies

    def fetch_comments_api(self):
        """Drain captured responses into self.comments / self.replies.

        NOTE(review): passes a *list* as ``_type`` to get_response — verify the
        decoder's validation accepts a list (it may expect a single string).
        """
        for comments in get_response(self.driver, _type=['comments', 'replies']):
            if comments.get('comments'):
                total = f"{len(comments['comments'])} Comments"
                self.comments |= self.parse_comments(comments['comments'])
            else:  # Comment Replies
                total = f"{len(comments['replies'])} Replies"
                self.replies |= self.parse_replies(comments['replies'])
            print(f"Fetched {total}")

    def parse_comments(self, edges):
        """Extract every comment edge into a flat {id: data} dict."""
        comments = {}
        for edge in edges:
            comment = self.extract_comment_info(edge)
            comments |= comment
        return comments

    def extract_comment_info(self, comment_edge: dict) -> dict:
        """
        Extracts important fields from a Facebook GraphQL comment edge.

        Returns:
            dict: {comment_id: extracted-fields}. Nested reply edges found in
            the comment's feedback are parsed and merged into ``self.replies``
            as a side effect.

        Raises:
            ValueError: when the post id or comment id cannot be located; the
                offending edge is dumped to a debug_*.json file first.
        """
        node = comment_edge.get("node", {})
        feedback = node.get("feedback", {})
        # feedback may be explicitly null in the payload; normalize to {}.
        feedback = {} if feedback is None else feedback
        author = node.get("author", {})
        body = node.get("body", {})
        parent_feedback = node.get("parent_feedback", {})
        # Fallback: some payloads nest parent_feedback under the author-badge
        # renderer instead of directly on the node.
        if parent_feedback is None or parent_feedback == {}:
            parent_feedback = node.get("comet_comment_author_name_and_badges_renderer", {}).get('comment', {}).get("parent_feedback", {})

        attachments = []
        tmp_attachments = node.get("attachments", [])
        for tmp_attachment in tmp_attachments:
            tmp = tmp_attachment.get('style_type_renderer', {}).get('attachment', {})
            if tmp is None:
                continue
            media = tmp.get('media')
            # NOTE(review): assumes tmp['photo_image'] is a dict when present;
            # a null photo_image would make .get('uri') raise — verify.
            attachment = {
                "type": media if media is None else media.get("__typename"),
                "url": tmp.get("url"),
                "photo_img": tmp.get('photo_image', {}).get('uri'),

            }
            attachments.append(attachment)
        # Reaction breakdown (reaction type id -> count)
        reactions = {}
        for r in feedback.get("top_reactions", {}).get("edges", []):
            reaction_id = r["node"].get("id")
            reactions[reaction_id] = r.get("reaction_count", 0)
        post_id = parent_feedback.get("share_fbid")
        if post_id is None:
            # Dump the raw edge for offline inspection before failing.
            with open("debug_missing_post_id.json", "w", encoding="utf-8") as f:
                json.dump(comment_edge, f, indent=4)
            raise ValueError("Post ID not found in parent feedback.")
        comment_id = node.get("id")
        if comment_id is None:
            with open("debug_missing_comment_id.json", "w", encoding="utf-8") as f:
                json.dump(comment_edge, f, indent=4)
            raise ValueError("Comment ID not found in comment node.")
        extracted = {
            # Comment identifiers
            "id": comment_id,

            # Author info
            "author_id": author.get("id"),
            "author_name": author.get("name"),
            "author_profile_url": author.get("url"),
            "author_gender": author.get("gender"),

            # Content
            "text": None if body is None else body.get("text"),
            "language": node.get("translatability_for_viewer", {}).get("source_dialect"),

            # Timing
            "created_time": node.get("created_time"),

            # Engagement
            "reaction_total": feedback.get("reactors", {}).get("count"),
            "reaction_breakdown": reactions,
            "reply_count": feedback.get("replies_fields", {}).get("total_count"),

            # Relationships
            "post_id": post_id,
            "post_author_id": parent_feedback.get("owning_profile", {}).get("id"),

            # URLs
            "comment_url": feedback.get("url"),
            "attachements": attachments,
            # Replies (filled later by combine_replies_with_comments)
            "replies": {}
        }
        # Inline reply edges are parsed immediately into the flat replies map.
        reply_edges = feedback.get("replies_connection", {}).get("edges", [])
        if reply_edges:
            replies = self.parse_replies(reply_edges)
            self.replies |= replies

        return {comment_id: extracted}

    def extract_reply_info(self, edge: dict) -> dict:
        """Extract one reply edge into {reply_id: fields}.

        Raises:
            ValueError: when the parent comment id or reply id is missing;
                the raw edge is dumped to a debug_*.json file first.
        """
        node = edge.get("node", {})
        author = node.get("author", {})
        feedback = node.get("feedback", {})
        body = node.get("body", {})
        # Reaction breakdown (reaction type id -> count)
        reactions = {}
        for r in feedback.get("top_reactions", {}).get("edges", []):
            reaction_id = r["node"].get("id")
            reactions[reaction_id] = r.get("reaction_count", 0)

        parent_comment_id = node.get("comment_direct_parent", {}).get("id")
        if parent_comment_id is None:
            with open("debug_missing_parent_comment_id.json", "w", encoding="utf-8") as f:
                json.dump(edge, f, indent=4)
            raise ValueError("Parent comment ID not found in reply.")
        reply_id = node.get("id")
        if reply_id is None:
            with open("debug_missing_reply_id.json", "w", encoding="utf-8") as f:
                json.dump(edge, f, indent=4, ensure_ascii=True)
            raise ValueError("Reply ID not found in reply node.")
        extracted = {
            "id": reply_id,
            "depth": node.get("depth"),

            "author_id": author.get("id"),
            "author_name": author.get("name"),
            "author_profile": author.get("url"),
            "author_gender": author.get("gender"),

            "text": None if body is None else body.get("text"),
            "language": node.get("translatability_for_viewer", {}).get("source_dialect"),

            "created_time": node.get("created_time"),

            "parent_comment_id": parent_comment_id,

            "reply_count": feedback.get("replies_fields", {}).get("count"),
            "reaction_count": feedback.get("reactors", {}).get("count"),
            "reaction_breakdown": reactions,

            "comment_url": feedback.get("url"),
            "replies": {}
        }
        # Recurse into sub-replies, flattening them into self.replies.
        reply_edges = feedback.get("replies_connection", {}).get("edges", [])
        if reply_edges:
            replies = self.parse_replies(reply_edges)
            self.replies |= replies

        return {reply_id: extracted}

    def parse_replies(self, edges: dict) -> dict:
        """Extract every reply edge into a flat {id: data} dict."""
        replies = {}
        for edge in edges:
            reply = self.extract_reply_info(edge)
            replies |= reply
        return replies

    def combine_replies_with_comments(self):
        """
        Combining replies to their respective parent comments and replies.
        Supports multi-level nesting (comment -> reply -> sub-reply).
        """
        # 1. Create a unified lookup of everything that can be a parent.
        # This allows us to find a parent whether it is a top-level comment or a reply.
        all_nodes = {**self.comments, **self.replies}
        print(f"Total comments: {len(self.comments)}, Total replies: {len(self.replies)}")
        # 2. Iterate through all replies to find their parents.
        for reply_id, reply_data in self.replies.items():
            parent_id = reply_data.get('parent_comment_id')

            if parent_id and parent_id in all_nodes:
                # Add the reply to the 'replies' dictionary of its parent.
                # Using the parent_id from all_nodes ensures we can nest deeply.
                all_nodes[parent_id]['replies'][reply_id] = reply_data
            else:
                # This case handles scenarios where a reply's parent might not
                # have been captured in the current API fetch cycle.
                print(f"Warning: Parent ID {parent_id} for reply {reply_id} not found.")
        # self.comments is updated in-place: the dicts in all_nodes are the same
        # objects stored in self.comments/self.replies, so top-level comments now
        # contain their direct replies, which contain their sub-replies.
@@ -0,0 +1,203 @@
1
+
2
+ import json
3
+ import traceback
4
+ from request_parsers.requests_decoder import get_response
5
+
6
+
7
class PostParser:
    """Extracts cleaned post records from captured Facebook GraphQL responses."""

    def __init__(self, driver):
        """
        Args:
            driver: Selenium-wire WebDriver whose captured requests hold the
                GraphQL responses to decode.
        """
        # Accumulated posts keyed by their public post id ('old_id').
        self.posts = {}
        self.driver = driver

    def fetch_post(self) -> dict:
        """Return all posts parsed so far, keyed by post id."""
        return self.posts

    def fetch_post_apis(self):
        """Fetch both the single-post and the feed-post response shapes."""
        self.fetch_post_api('post')
        self.fetch_post_api('posts')

    def fetch_post_api(self, post_type: str = 'post') -> None:
        """Fetch data from the post API.

        Args:
            post_type (str, optional): There are two types of post data
                ('post', 'posts'). Defaults to 'post'.
        """
        for post in get_response(self.driver, _type=post_type):
            data = self.extract_post_info(post['post'])
            if data is None:
                continue
            self.posts[data['old_id']] = data

    def _extract_attached_story(self, story: dict) -> dict:
        """Clean an attached (shared) story payload from fb and return it.

        Args:
            story (dict): The 'attached_story' node of a post payload.

        Returns:
            dict: Clean post info, or {} when there is no attached story.
        """
        if story is None or story == {}:
            return {}
        # Navigate to the main story object.
        context_story = story.get("comet_sections", {}).get("context_layout", {}).get("story", {})
        # 2. Text (Original)
        content_story = context_story.get("comet_sections", {}).get("content", {}).get("story", {})
        # BUG FIX: the original had the no-op `content_story = content_story`
        # here, so a null content story crashed on .get('message') below.
        if content_story is None:
            content_story = {}
        if content_story.get('message') is None:
            original_text = None
            translation_text = None
        else:
            original_text = content_story.get('message', {}).get('text')
            # 3. Translation (present only on translated posts)
            message_container = content_story.get('comet_sections', {}).get('message_container', {})
            translation_text = None if message_container is None else message_container.get('story', {}).get('translation', {}).get('message', {}).get('text')

        # Extract actor info, falling back to the feedback owner when absent.
        actor = context_story.get("comet_sections", {}).get("title", {}).get("story", {}).get("actors", [{}])[0]
        author_name = actor.get("name")
        author_name = author_name if author_name is not None else story.get("feedback", {}).get("owning_profile", {}).get("name")
        # Extract metadata (the creation timestamp lives in the first entry).
        metadata = context_story.get("comet_sections", {}).get("metadata", [])
        timestamp = None
        if metadata:
            timestamp = metadata[0].get("story", {}).get("creation_time")
        data = {
            "old_id": story.get("id"),
            "id": story.get("feedback", {}).get("owning_profile", {}).get("id"),
            "author": author_name,
            "author_url": actor.get("url"),
            "original_text": original_text,
            "translation": translation_text,
            "created_time": timestamp,
            "post_link": story.get("permalink_url"),
        }

        return data

    def _extract_engagement_metrics(self, comet_sections: dict) -> dict:
        """
        Extract reactions, comments, and shares from a Facebook Comet UFI block.

        Returns:
            dict: totals and per-reaction breakdown, or
            {"error": ...} when the UFI block is absent.
        """
        feedback_root = (
            comet_sections
            .get("feedback", {})
            .get("story", {})
            .get("story_ufi_container", {})
            .get("story", {})
            .get("feedback_context", {})
            .get("feedback_target_with_context", {})
            .get("comet_ufi_summary_and_actions_renderer", {})
            .get("feedback", {})
        )

        if not feedback_root:
            return {"error": "No UFI feedback block found"}

        # Total reactions
        reactions_total = (
            feedback_root
            .get("reaction_count", {})
            .get("count")
        )

        # Reaction breakdown (Like, Love, Haha, etc.)
        reactions_breakdown = {}
        for edge in feedback_root.get("top_reactions", {}).get("edges", []):
            node = edge.get("node", {})
            name = node.get("localized_name")
            count = edge.get("reaction_count")
            if name and count is not None:
                reactions_breakdown[name] = count

        # Total comments
        comments_total = (
            feedback_root
            .get("comment_rendering_instance", {})
            .get("comments", {})
            .get("total_count")
        )

        # Total shares
        shares_total = (
            feedback_root
            .get("share_count", {})
            .get("count")
        )

        return {
            "total_reactions": reactions_total,
            "reactions_breakdown": reactions_breakdown,
            "total_comments": comments_total,
            "total_shares": shares_total,
        }

    def extract_post_info(self, story):  # story=node_v2
        """Extract one cleaned post record from a GraphQL story node.

        Args:
            story (dict): The 'node_v2' (or feed edge node) payload.

        Returns:
            dict: Cleaned post fields, merged with engagement metrics and any
            attached (shared) story.

        Raises:
            Exception: re-raises any extraction failure after dumping the raw
                story to debug_extract_post_error.json for inspection.
        """
        try:
            # Access the main story node.
            comet_sections = story.get('comet_sections', {})
            content_story = comet_sections.get('content', {}).get('story', {})

            # 1. IDs
            post_id = story.get('post_id')
            internal_id = story.get('id')

            # 2. Text (Original)
            if content_story.get('message') is None:
                original_text = None
                translation_text = None
            else:
                original_text = content_story.get('message', {}).get('text')
                # 3. Translation
                message_container = content_story.get('comet_sections', {}).get('message_container', {})
                translation_text = None if message_container is None else message_container.get('story', {}).get('translation', {}).get('message', {}).get('text')

            # 4. Timestamp — creation time lives in the metadata/timestamp section.
            timestamp = comet_sections.get('timestamp', {}).get('story', {}).get('creation_time')
            # 5. Post link
            post_link = content_story.get('wwwURL')

            # 6. Author info
            author = content_story.get('actors', [{}])[0].get('name')
            author_url = content_story.get('actors', [{}])[0].get('url')

            # Attachments: prefer the story's own, fall back to the attached story's.
            attachments = []
            tmp_attachments = story.get("attachments", [])
            if tmp_attachments == [] and content_story is not None and content_story.get("attached_story", {}) is not None:
                tmp_attachments = content_story.get("attached_story", {}).get("attachments", [])
            for tmp_attachment in tmp_attachments:
                tmp = tmp_attachment.get('styles', {}).get('attachment', {}).get('media', {})
                if tmp is None:
                    continue
                attachment = {
                    "type": tmp.get("__typename"),
                    "url": tmp.get("url"),
                    "photo_img": tmp.get('photo_image', {}).get('uri'),
                }
                attachments.append(attachment)
            data = {
                "old_id": post_id,
                "id": internal_id,
                "author": author,
                "author_url": author_url,
                "original_text": original_text,
                "translation": translation_text,
                "created_time": timestamp,
                "post_link": post_link,
                "attachments": attachments,
            }
            data |= self._extract_engagement_metrics(comet_sections)
            data["attached_story"] = self._extract_attached_story(story.get("attached_story", {}))
            return data
        except Exception as e:
            print(f"Error extracting post info: {e}")
            print(traceback.format_exc())
            # Keep the raw payload for offline debugging before failing.
            with open("debug_extract_post_error.json", "w", encoding="utf-8") as f:
                json.dump(story, f, indent=4, ensure_ascii=False)
            raise e
@@ -0,0 +1,81 @@
1
+ import chompjs
2
+ from networkx import edges
3
+ from seleniumwire.utils import decode
4
+
5
+
6
+ def decode_response_body(response) -> dict | None:
7
+ """
8
+ Safely decode Selenium Wire response body into JSON.
9
+ Handles gzip / brotli / plain text.
10
+ """
11
+
12
+ body = response.body
13
+ decoded_json = decode(response.body,
14
+ response.headers.get('Content-Encoding', 'identity')).decode(
15
+ 'utf-8')
16
+
17
+ chomped_json = chompjs.parse_js_object(decoded_json)
18
+ return chomped_json
19
+
20
+
21
+ def get_response(driver, url: str="", _type: str="comments"):
22
+ """Returns decoded response of desired requests
23
+
24
+ Args:
25
+ driver (WebDriver): Selenium-wire WebDriver to get requests
26
+ url (str, optional): Urls to which response is required.
27
+ type (str, optional): Type of content required i.e ["comments", "post", "posts"]. Defaults to "comments".
28
+ """
29
+ allowed_types = ['post', 'posts', 'comment']
30
+ if _type not in allowed_types:
31
+ raise Exception(f"Invalid _type value {_type}. Allowed types are {', '.join(allowed_types)}")
32
+ # requests = driver.requests.copy()
33
+ for request in driver.requests:
34
+ if not request.response and "facebook.com/api/graphql" not in request.url:
35
+ continue
36
+ try:
37
+ response_json = decode_response_body(request.response)
38
+ if response_json is None:
39
+ continue
40
+ except Exception as e:
41
+ print("⚠️ Failed parsing JSON:", e)
42
+ continue
43
+
44
+ if _type == 'post':
45
+ story = response_json.get('data', {}).get('node_v2', {})
46
+ if story:
47
+ print("Post data found in the response JSON.")
48
+ yield {'post': story}
49
+ else:
50
+ print("⚠️ post data not found in the response JSON.")
51
+
52
+ elif _type == 'posts':
53
+ edges = response_json.get("data", {}).get("node", {}).get("timeline_list_feed_units", {}).get("edges", [])
54
+ for edge in edges:
55
+ print("Post edges found in the response JSON.")
56
+ yield {'post': edge.get('node', {})}
57
+ elif _type == 'comments':
58
+ comments = (
59
+ response_json.get('data', {})
60
+ .get('node', {})
61
+ .get('comment_rendering_instance_for_feed_location', {})
62
+ .get('comments', {})
63
+ .get('edges', [])
64
+ )
65
+ replies = (
66
+ response_json.get("data", {})
67
+ .get("node", {})
68
+ .get("replies_connection", {})
69
+ .get("edges", [])
70
+ )
71
+
72
+ if not replies and not comments:
73
+ print("⚠️ Comments data not found in the response JSON.")
74
+
75
+ if comments:
76
+ print("Comments data found in the response JSON.")
77
+ yield {"comments": comments}
78
+ if replies:
79
+ print("Replies data found in the response JSON.")
80
+ yield {"replies": replies}
81
+ # del driver.requests[:len(requests)]
@@ -0,0 +1,55 @@
1
+ from drivers import Init
2
+ from auth import Authenticator
3
+ from scrapers.post import PostScraper
4
+
5
+
6
class FacebookScraper:
    """High-level entry point: wires up the driver, authentication, and scrapers."""

    def __init__(self, email: str = "", password: str = "",
                 profile_path: str = "", output_path: str = "posts.json",
                 proxy: dict | None = None):
        """Initialize FacebookScraper and the underlying driver object.

        Args:
            email (str): Facebook email; login is only attempted when both
                email and password are provided.
            password (str): Account credentials.
            profile_path (str): Path to store the Chrome profile, which helps
                persist the auth session. Defaults to a profile in the current
                directory.
            proxy (dict | None): The proxy should be in this format::

                {
                    "http": "http://username:password@host:port",
                    "https": "https://username:password@host:port",
                    "no_proxy": "localhost,127.0.0.1"
                }

            output_path (str): Path to the JSON file that will contain the
                post data.
        """
        # proxy defaults to None rather than a shared mutable {} literal;
        # `proxy or {}` preserves the old call contract of get_driver.
        self.output_path = output_path
        self.driver = Init().get_driver(profile_path, proxy or {})
        if email and password:
            Authenticator(self.driver, email, password).authenticate()

    def authenticate(self, email: str, password: str) -> None:
        """Log in (or restore a session) with the given credentials."""
        Authenticator(self.driver, email, password).authenticate()

    def get_profile(self, profile_url: str):
        """Gets profile data using a profile URL.

        Args:
            profile_url (str): Facebook profile URL.
        """
        # TODO: not implemented yet.
        pass

    def get_post(self, post_url: str) -> None:
        """Get a post by its URL.

        Args:
            post_url (str): URL to a Facebook post.
        """
        # TODO: not implemented yet.

    def get_posts(self, profile_url: str, count: int = 0):
        """Scrape all posts of a profile.

        Args:
            profile_url (str): Facebook profile URL.
            count (int): Post count needed to be extracted; will get all posts
                if the value is 0. Defaults to 0.
        """
        scraper = PostScraper(self.driver, profile_url, self.output_path, count)
        scraper.parse_feed()
@@ -0,0 +1,78 @@
1
+ import os
2
+ import json
3
+ import argparse
4
+ import traceback ### Don't remove this; we need it to fetch complete tracebacks on exceptions.
5
+ from peewee import SqliteDatabase, Model, BigIntegerField, CharField, TextField, SmallIntegerField, BooleanField, IntegrityError, fn
6
+ from time import sleep
7
+ from initializer import Init
8
+ from seleniumwire import webdriver
9
+ from selenium.webdriver.common.by import By
10
+ from selenium.webdriver.support.ui import WebDriverWait
11
+ from selenium.webdriver.support import expected_conditions as EC
12
+ from time import sleep
13
+ from selenium.webdriver.common.by import By
14
+ from request_parsers.posts import PostParser
15
+ from selenium.webdriver.support.ui import WebDriverWait
16
+ from selenium.webdriver.common.action_chains import ActionChains
17
+ from selenium.webdriver.support import expected_conditions as EC
18
+ from selenium.common.exceptions import (
19
+ WebDriverException, TimeoutException,
20
+ StaleElementReferenceException, ElementClickInterceptedException
21
+ )
22
+ from file_ops import save_json, load_json
23
+
24
+
25
class PostScraper:
    """Scrolls a profile feed and persists parsed posts to a JSON file."""

    # ---------------------------------------- Initializer ---------------------------------------- #
    def __init__(self, driver, profile_url, output_path, count) -> None:
        """
        Args:
            driver: Selenium-wire WebDriver.
            profile_url (str): Facebook profile URL whose feed is scraped.
            output_path (str): Path of the JSON file posts are saved to.
            count (int): Requested number of posts (0 = all).
                NOTE(review): currently stored but never enforced — verify.
        """
        self.count = count
        self.driver = driver
        self.profile_url = profile_url
        self.output_path = output_path

        # BUG FIX: resume from the configured output file; the old code called
        # load_json() with no argument and always read the default 'posts.json'.
        self.post_data = load_json(self.output_path)

        self.open_profile()

    def open_profile(self):
        """Navigate to the profile page and wait for the feed to render."""
        self.driver.get(self.profile_url)
        self.wait_for_feed_load()

    def wait_for_feed_load(self):
        """Best-effort wait for the feed container; never raises."""
        try:
            WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((
                By.XPATH, "//div[@role='feed' and contains(., 'Other posts')]")))
        except TimeoutException:
            # Feed marker not found in time — proceed anyway (best effort).
            # (Narrowed from a bare `except:` that swallowed everything.)
            pass

    # ------------------------------------- Post Interactors -------------------------------------- #
    def parse_feed(self):
        """Scroll to the bottom of the feed, saving posts after each scroll."""
        while True:
            WebDriverWait(self.driver, 20).until(EC.presence_of_all_elements_located((
                By.XPATH, "//div[@class='x1a2a7pz']")))
            slider = self.driver.execute_script('return document.documentElement.scrollTop')
            self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            print('Scrolling down')
            sleep(3)
            self.save_posts_data()
            # No scroll progress after scrolling means we've hit the end.
            if slider == self.driver.execute_script('return document.documentElement.scrollTop'):
                print('End of page')
                break

    # ----------------------------------- Parse and Save data ------------------------------------- #
    def get_post_data(self):
        """Decode captured GraphQL responses into a {post_id: data} dict."""
        parser = PostParser(self.driver)
        parser.fetch_post_api('posts')
        return parser.fetch_post()

    def save_posts_data(self):
        """Extracting date, name and text from posts. Comparing with old data if it exists."""
        data = self.get_post_data()
        if data == {}:
            return

        # BUG FIX: accumulate into self.post_data — the old code wrote to an
        # undefined attribute `self.posts_data` and crashed with AttributeError.
        self.post_data |= data
        save_json(self.output_path, self.post_data)
        print(f"Saved {len(self.post_data)} post")
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: fbcrawl
3
+ Version: 0.0.1
4
+ Summary: A small example package
5
+ Author-email: Farhan Ahmed <jattfarhan10@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/Securely-Innovations/FbScraper
8
+ Project-URL: Issues, https://github.com/Securely-Innovations/FbScraper/issues
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Dynamic: license-file
@@ -0,0 +1,17 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/fbcrawl/__init__.py
5
+ src/fbcrawl/auth.py
6
+ src/fbcrawl/drivers.py
7
+ src/fbcrawl/file_ops.py
8
+ src/fbcrawl/scraper.py
9
+ src/fbcrawl.egg-info/PKG-INFO
10
+ src/fbcrawl.egg-info/SOURCES.txt
11
+ src/fbcrawl.egg-info/dependency_links.txt
12
+ src/fbcrawl.egg-info/top_level.txt
13
+ src/fbcrawl/request_parsers/__init__.py
14
+ src/fbcrawl/request_parsers/comments.py
15
+ src/fbcrawl/request_parsers/posts.py
16
+ src/fbcrawl/request_parsers/requests_decoder.py
17
+ src/fbcrawl/scrapers/post.py
@@ -0,0 +1 @@
1
+ fbcrawl