forki 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,360 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "typhoeus"
4
+ require "securerandom"
5
+ require "byebug"
6
+
7
+ module Forki
8
+ # rubocop:disable Metrics/ClassLength
9
+ class PostScraper < Scraper
10
+ # Searches the DOM to finds the number of times a (video) post has been viewed.
11
+ # Returns nil if it can't find a DOM element with the view count
12
+
13
+ def find_number_of_views
14
+ views_pattern = /[0-9MK, ]+Views/
15
+ spans = all("span")
16
+ views_span = spans.find { |s| s.text(:all) =~ views_pattern }
17
+ extract_int_from_num_element(views_span)
18
+ end
19
+
20
+ def extract_post_data(graphql_strings)
21
+ # Bail out of the post otherwise it gets stuck
22
+ raise ContentUnavailableError unless is_post_available?
23
+
24
+ graphql_objects = get_graphql_objects(graphql_strings)
25
+ post_has_video = check_if_post_is_video(graphql_objects)
26
+ post_has_image = check_if_post_is_image(graphql_objects)
27
+
28
+ # There's a chance it may be embedded in a comment chain like this:
29
+ # https://www.facebook.com/PlandemicMovie/posts/588866298398729/
30
+ post_has_video_in_comment_stream = check_if_post_is_in_comment_stream(graphql_objects) if post_has_video == false
31
+
32
+ if post_has_video
33
+ extract_video_post_data(graphql_strings)
34
+ elsif post_has_video_in_comment_stream
35
+ extract_video_comment_post_data(graphql_objects)
36
+ elsif post_has_image
37
+ extract_image_post_data(graphql_objects)
38
+ else
39
+ raise UnhandledContentError
40
+ end
41
+ end
42
+
43
+ def get_graphql_objects(graphql_strings)
44
+ graphql_strings.map { |graphql_object| JSON.parse(graphql_object) }
45
+ end
46
+
47
+ def check_if_post_is_video(graphql_objects)
48
+ graphql_objects.any? { |graphql_object| graphql_object.key?("is_live_streaming") || graphql_object.key?("video") || check_if_post_is_reel(graphql_object) }
49
+ end
50
+
51
+ def check_if_post_is_reel(graphql_object)
52
+ return false unless graphql_object.key?("node")
53
+
54
+ begin
55
+ style_infos = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first
56
+ rescue NoMethodError # if the object doesn't match the attribute chain above, the line above will try to operate on nil
57
+ return false
58
+ end
59
+
60
+ style_infos.include?("fb_shorts_story")
61
+ end
62
+
63
+ def check_if_post_is_image(graphql_objects)
64
+ graphql_objects.any? do |graphql_object| # if any GraphQL objects contain the top-level keys above, return true
65
+ true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
66
+ true unless graphql_object.fetch("currMedia", nil).nil?
67
+ end
68
+ end
69
+
70
+ def check_if_post_is_in_comment_stream(graphql_objects)
71
+ graphql_objects.find do |graphql_object|
72
+ next unless graphql_object.key?("nodes")
73
+
74
+ begin
75
+ type = graphql_object["nodes"].first["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["__typename"]
76
+ rescue StandardError
77
+ # if there's an error just return false, since the structure is so specific checking the whole thing is a lot
78
+ next
79
+ end
80
+
81
+ return true if type == "Video"
82
+ end
83
+
84
+ false
85
+ end
86
+
87
+ def is_post_available?
88
+ begin
89
+ # This Video Isn't Available Anymore
90
+ find("span", wait: 5, text: "content isn't available", exact_text: false)
91
+ rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
92
+ begin
93
+ find("span", wait: 5, text: "This Video Isn't Available Anymore", exact_text: false)
94
+ rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
95
+ return true
96
+ end
97
+ end
98
+
99
+ false
100
+ end
101
+
102
    # Extracts data about a video post embedded in a comment stream.
    # Digs the media and feedback objects out of the first GraphQL object that
    # has a "nodes" key, downloads the video and its preview image via
    # Forki.retrieve_media, and returns a hash of post details.
    def extract_video_comment_post_data(graphql_objects)
      graphql_nodes = nil
      # `find` is used only for iteration here; the `break` exits as soon as the
      # first object with a "nodes" key has been captured
      graphql_objects.find do |graphql_object|
        next unless graphql_object.key?("nodes")
        graphql_nodes = graphql_object["nodes"]

        break
      end

      media = graphql_nodes.first["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
      inital_feedback_object = graphql_nodes.first["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
      feedback_object = inital_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]

      post_details = {
        id: media["id"],
        num_comments: feedback_object["comment_count"]["total_count"],
        num_shares: feedback_object["share_count"]["count"],
        num_views: feedback_object["video_view_count"],
        reshare_warning: feedback_object["should_show_reshare_warning"],
        video_preview_image_url: media["preferred_thumbnail"]["image"]["uri"],
        video_url: media["playable_url_quality_hd"] || media["playable_url"], # prefer the HD rendition when present
        text: graphql_nodes.first["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"],
        created_at: media["publish_time"],
        profile_link: graphql_nodes.first["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"].first["url"],
        has_video: true
      }

      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = inital_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["i18n_reaction_count"]
      post_details
    end
134
+
135
+ # Unfortunately, there's a taxonomy of video post types, all of which require different parsing methods
136
+ # Specifically, there are normal video posts, video posts from the watch page, and live video posts from the watch page
137
+ # The general strategy for extracting information from each type, though, is to find which of the 30-odd GraphQL strings are relevant
138
+ # After finding those GraphQL strings, we parse them into hashes and extract the information we need
139
+ def extract_video_post_data(graphql_strings)
140
+ unless all("h1").find { |h1| h1.text.strip == "Watch" }.nil?
141
+ return extract_video_post_data_from_watch_page(graphql_strings) # If this is a "watch page" video
142
+ end
143
+
144
+ graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
145
+ story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
146
+ story_node_object = story_node_object || graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video
147
+
148
+ return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
149
+
150
+ if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
151
+ video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
152
+ creation_date = video_object["publish_time"]
153
+ # creation_date = video_object["video"]["publish_time"]
154
+ elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
155
+ # For "Reels" we need a separate way to parse this
156
+ video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
157
+ creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["creation_time"]
158
+ else
159
+ raise "Unable to parse video object"
160
+ end
161
+
162
+ feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
163
+ reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
164
+ share_count_object = feedback_object.fetch("share_count", {})
165
+
166
+ if story_node_object["comet_sections"]["content"]["story"]["comet_sections"].key? "message"
167
+ text = story_node_object["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"]
168
+ else
169
+ text = ""
170
+ end
171
+
172
+ feedback_object["comment_list_renderer"]["feedback"]["comment_count"]["total_count"]
173
+ num_comments = feedback_object.has_key?("comment_list_renderer") ? feedback_object["comment_list_renderer"]["feedback"]["comment_count"]["total_count"] : feedback_object["comment_count"]["total_count"]
174
+
175
+ post_details = {
176
+ id: video_object["id"],
177
+ num_comments: num_comments,
178
+ num_shares: share_count_object.fetch("count", nil),
179
+ num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
180
+ reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
181
+ video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
182
+ video_url: video_object["playable_url_quality_hd"] || video_object["playable_url"],
183
+ text: text,
184
+ created_at: creation_date,
185
+ profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
186
+ has_video: true
187
+ }
188
+ post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
189
+ post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
190
+ post_details[:reactions] = reaction_counts
191
+ post_details
192
+ end
193
+
194
    # Fallback parser for video posts whose GraphQL payload contains no
    # "node"/"nodes" story object. Pulls the video and feedback data out of the
    # "tahoe_sidepane_renderer" object instead, downloads the media, and returns
    # a hash of post details.
    def extract_video_post_data_alternative(graphql_object_array)
      sidepane_object = graphql_object_array.find { |graphql_object| graphql_object.key?("tahoe_sidepane_renderer") }
      video_object = graphql_object_array.find { |graphql_object| graphql_object.keys == ["video"] }
      feedback_object = sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]
      reaction_counts = extract_reaction_counts(sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
      share_count_object = feedback_object.fetch("share_count", {})

      post_details = {
        id: video_object["id"],
        num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
        num_shares: share_count_object.fetch("count", nil),
        num_views: feedback_object["video_view_count"],
        reshare_warning: feedback_object["should_show_reshare_warning"],
        video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
        video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"], # prefer the HD rendition when present
        text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
        created_at: video_object["video"]["publish_time"],
        profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
        has_video: true
      }

      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = reaction_counts
      post_details
    end
220
+
221
+ # Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above
222
+ def extract_image_post_data(graphql_object_array)
223
+ graphql_object_array.find { |graphql_object| graphql_object.key?("viewer_actor") && graphql_object.key?("display_comments") }
224
+ curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
225
+ creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }
226
+
227
+ feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
228
+ share_count_object = feedback_object.fetch("share_count", {})
229
+
230
+ poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]
231
+
232
+ reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
233
+ post_details = {
234
+ id: curr_media_object["currMedia"]["id"],
235
+ num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
236
+ num_shares: share_count_object.fetch("count", nil),
237
+ reshare_warning: feedback_object["should_show_reshare_warning"],
238
+ image_url: curr_media_object["currMedia"]["image"]["uri"],
239
+ text: (creation_story_object["message"] || {}).fetch("text", nil),
240
+ profile_link: poster["url"],
241
+ created_at: curr_media_object["currMedia"]["created_time"],
242
+ has_video: false
243
+ }
244
+ post_details[:image_file] = Forki.retrieve_media(post_details[:image_url])
245
+ post_details[:reactions] = reaction_counts
246
+ post_details
247
+ end
248
+
249
    # Extract data from a non-live video post on the watch page
    # Delegates to the live-video parser when the current URL contains "live".
    def extract_video_post_data_from_watch_page(graphql_strings)
      return extract_live_video_post_data_from_watch_page(graphql_strings) if current_url.include?("live")
      video_object = graphql_strings.map { |g| JSON.parse(g) }.find { |x| x.key?("video") }
      creation_story_object = JSON.parse(graphql_strings.find { |graphql_string| (graphql_string.include?("creation_story")) && \
        (graphql_string.include?("live_status")) })
      # The embedded URL is backslash-escaped; strip the escapes out
      video_permalink = creation_story_object["creation_story"]["shareable"]["url"].delete("\\")
      media_object = video_object["video"]["story"]["attachments"][0]["media"]
      reaction_counts = extract_reaction_counts(creation_story_object["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])

      post_details = {
        id: video_object["id"],
        num_comments: creation_story_object["feedback"]["total_comment_count"],
        num_shares: nil, # Not present for watch feed videos?
        num_views: creation_story_object["feedback"]["video_view_count_renderer"]["feedback"]["video_view_count"],
        reshare_warning: creation_story_object["feedback"]["should_show_reshare_warning"],
        video_preview_image_url: video_object["video"]["story"]["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
        video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"), # prefer HD; also backslash-escaped
        text: (creation_story_object["creation_story"]["message"] || {})["text"],
        created_at: video_object["video"]["story"]["attachments"][0]["media"]["publish_time"],
        profile_link: video_permalink[..video_permalink.index("/videos")], # everything up to and including "/videos"
        has_video: true
      }

      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = reaction_counts
      post_details
    end
278
+
279
    # Extract data from live video post on the watch page
    # Live videos never expose a view count in GraphQL (as far as we can tell),
    # so the count is scraped from the DOM instead.
    def extract_live_video_post_data_from_watch_page(graphql_strings)
      creation_story_object = JSON.parse(graphql_strings.find { |graphql| (graphql.include? "comment_count") && \
        (graphql.include? "creation_story") })["video"]["creation_story"]
      media_object = JSON.parse(graphql_strings.find { |graphql| graphql.include? "playable_url" })["video"]["creation_story"]["attachments"][0]["media"]
      # The embedded URL is backslash-escaped; strip the escapes out
      video_permalink = creation_story_object["shareable"]["url"].delete("\\")
      reaction_counts = extract_reaction_counts(creation_story_object["feedback_context"]["feedback_target_with_context"]["cannot_see_top_custom_reactions"]["top_reactions"])

      post_details = {
        id: creation_story_object["shareable"]["id"],
        num_comments: creation_story_object["feedback_context"]["feedback_target_with_context"]["total_comment_count"],
        num_shares: nil,
        num_views: find_number_of_views, # as far as I can tell, this is never present for live videos
        reshare_warning: creation_story_object["feedback_context"]["feedback_target_with_context"]["should_show_reshare_warning"],
        video_preview_image_url: creation_story_object["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
        video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"), # prefer HD; also backslash-escaped
        text: creation_story_object["attachments"][0]["media"]["savable_description"]["text"],
        created_at: creation_story_object["attachments"][0]["media"]["publish_time"],
        profile_link: video_permalink[..video_permalink.index("/videos")], # everything up to and including "/videos"
        has_video: true
      }

      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = reaction_counts
      post_details
    end
306
+
307
+ # Returns a hash containing counts of each reaction to a post
308
+ # Takes the edges list and creates a dictionary for each element that looks like: {:num_likes: 1234}
309
+ # Then merges the dictionaries with the inject call
310
+ def extract_reaction_counts(reactions_object)
311
+ reactions_object["edges"].map do |reaction|
312
+ {
313
+ "num_#{reaction["node"]["localized_name"].downcase}s".to_sym => reaction["reaction_count"]
314
+ }
315
+ end.inject { |emoji_counts, count| emoji_counts.merge(count) }
316
+ end
317
+
318
    # Screenshots the current page into Forki's temp storage location and
    # returns the result of save_screenshot.
    def take_screenshot
      # First check whether post being scraped has a fact check overlay. If it does clear it.
      begin
        # NOTE(review): the || only attempts the "See Video" click when the
        # "See Photo" click returns nil/false; if "See Photo" is absent the
        # raised ElementNotFound skips both — confirm that's intended.
        find('div[aria-label=" See Photo "]').click() || find('div[aria-label=" See Video "]').click()
      rescue Capybara::ElementNotFound
        # Do nothing if element not found
      end

      save_screenshot("#{Forki.temp_storage_location}/facebook_screenshot_#{SecureRandom.uuid}.png")
    end
328
+
329
+ # Uses GraphQL data and DOM elements to collect information about the current post
330
+ def parse(url)
331
+ validate_and_load_page(url)
332
+ graphql_strings = find_graphql_data_strings(page.html)
333
+ post_data = extract_post_data(graphql_strings)
334
+ post_data[:url] = url
335
+ user_url = post_data[:profile_link]
336
+
337
+ 5.times do
338
+ begin
339
+ post_data[:screenshot_file] = take_screenshot
340
+ break
341
+ rescue Net::ReadTimeout; end
342
+
343
+ sleep(5)
344
+ end
345
+
346
+ # page.quit # Close browser between page navigations to prevent cache folder access issues
347
+
348
+ post_data[:user] = User.lookup(user_url).first
349
+ page.quit
350
+
351
+ post_data
352
+ rescue Net::ReadTimeout
353
+ # Eat it?
354
+ rescue StandardError => e
355
+ raise e
356
+ ensure
357
+ page.quit
358
+ end
359
+ end
360
+ end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require_relative "user_scraper"
4
+ require "capybara/dsl"
5
+ require "dotenv/load"
6
+ require "oj"
7
+ require "selenium-webdriver"
8
+ require "open-uri"
9
+
10
# Build the Chrome options used by the shared :selenium_forki driver.
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# Fixed: this flag previously began with a Unicode en-dash ("–-disable-blink..."),
# so Chrome never received it. It is now plain ASCII hyphens.
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
options.add_argument("--remote-debugging-port=9222")
# Fresh user-data dir per run to avoid profile lock / cache collisions
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_forki do |app|
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.default_max_wait_time = 60
Capybara.threadsafe = true
Capybara.reuse_server = true
31
+
32
+ module Forki
33
+ class Scraper # rubocop:disable Metrics/ClassLength
34
+ include Capybara::DSL
35
+
36
    # Points Capybara at the :selenium_forki driver registered above and applies
    # the gem's configured log level.
    def initialize
      Capybara.default_driver = :selenium_forki
      Forki.set_logger_level
      # reset_selenium
    end
41
+
42
    # Yeah, just use the tmp/ directory that's created during setup
    # Downloads the image referenced by an <img> element's src attribute and
    # writes it to temp/emoji.png (hard-coded path, relative to the CWD).
    def download_image(img_elem)
      img_data = URI.open(img_elem["src"]).read
      File.binwrite("temp/emoji.png", img_data)
    end
47
+
48
+ # Returns all GraphQL data objects embedded within a string
49
+ # Finds substrings that look like '"data": {...}' and converts them to hashes
50
+ def find_graphql_data_strings(objs = [], html_str)
51
+ data_marker = '"data":{'
52
+ data_start_index = html_str.index(data_marker)
53
+ return objs if data_start_index.nil? # No more data blocks in the page source
54
+
55
+ data_closure_index = find_graphql_data_closure_index(html_str, data_start_index)
56
+ return objs if data_closure_index.nil?
57
+
58
+ graphql_data_str = html_str[data_start_index...data_closure_index].delete_prefix('"data":')
59
+ objs + [graphql_data_str] + find_graphql_data_strings(html_str[data_closure_index..])
60
+ end
61
+
62
+ def find_graphql_data_closure_index(html_str, start_index)
63
+ closure_index = start_index + 8 # length of data marker. Begin search right after open brace
64
+ raise "Malformed graphql data object: no closing bracket found" if closure_index > html_str.length
65
+
66
+ brace_stack = 1
67
+ loop do # search for brace characters in substring instead of iterating through each char
68
+ if html_str[closure_index] == "{"
69
+ brace_stack += 1
70
+ elsif html_str[closure_index] == "}"
71
+ brace_stack -= 1
72
+ end
73
+
74
+ closure_index += 1
75
+ break if brace_stack.zero?
76
+ end
77
+
78
+ closure_index
79
+ end
80
+
81
+ private
82
+
83
+ ##########
84
+ # Set the session to use a new user folder in the options!
85
+ # #####################
86
+ def reset_selenium
87
+ options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
88
+ options.add_argument("--start-maximized")
89
+ options.add_argument("--no-sandbox")
90
+ options.add_argument("--disable-dev-shm-usage")
91
+ options.add_argument("–-disable-blink-features=AutomationControlled")
92
+ options.add_argument("--disable-extensions")
93
+ options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
94
+ options.add_preference "password_manager_enabled", false
95
+ options.add_argument("--disable-dev-shm-usage")
96
+ options.add_argument("--remote-debugging-port=9222")
97
+ options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")
98
+
99
+ Capybara.register_driver :selenium_forki do |app|
100
+ client = Selenium::WebDriver::Remote::Http::Default.new
101
+ client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
102
+ Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
103
+ end
104
+
105
+ Capybara.current_driver = :selenium_forki
106
+ end
107
+
108
    # Logs in to Facebook (if not already logged in)
    # Reads credentials from FACEBOOK_EMAIL / FACEBOOK_PASSWORD; raises
    # MissingCredentialsError when either is absent and BlockedCredentialsError
    # when Facebook rejects the login with an #error_box.
    def login(url = nil)
      raise MissingCredentialsError if ENV["FACEBOOK_EMAIL"].nil? || ENV["FACEBOOK_PASSWORD"].nil?

      url ||= "https://www.facebook.com"
      visit(url) # Visit the url passed in or the facebook homepage if nothing is

      # Look for "login_form" box, which throws an error if not found. So we catch it and run the rest of the tests
      begin
        login_form = first(id: "login_form", wait: 5)
      rescue Capybara::ElementNotFound
        # No form and the title doesn't look like the login page: already logged in
        return unless page.title.downcase.include?("facebook - log in")
      end

      # Since we're not logged in, let's do that quick
      visit("https://www.facebook.com") if login_form.nil?

      login_form.fill_in("email", with: ENV["FACEBOOK_EMAIL"])
      login_form.fill_in("pass", with: ENV["FACEBOOK_PASSWORD"])

      # This is a pain because some pages just `click_button` would work, but some won't
      login_buttons = login_form.all("div", text: "Log In", wait: 5)

      if login_buttons.empty?
        login_form.click_button("Log In")
      else
        # Click only the div whose text is exactly "Log In"
        login_buttons.each do |button|
          if button.text == "Log In"
            button.click
            break
          end
        end
      end

      begin
        # Facebook renders #error_box when credentials are rejected or blocked
        raise Forki::BlockedCredentialsError if find_by_id("error_box", wait: 3)
      rescue Capybara::ElementNotFound; end

      # Now we wait awhile, hopefully to slow down scraping
      sleep(rand * 10.3)
    end
149
+
150
    # Ensures that a valid Facebook url has been provided, and that it points to an available post
    # If either of those two conditions are false, raises an exception
    def validate_and_load_page(url)
      Capybara.app_host = "https://www.facebook.com"
      facebook_url = "https://www.facebook.com"
      # visit "https://www.facebook.com" unless current_url.start_with?(facebook_url)
      # Log in first (login itself visits the url), then validate the host
      login(url)
      raise Forki::InvalidUrlError unless url.start_with?(facebook_url)
      # Only navigate if login left us somewhere other than the target url
      visit url unless current_url.start_with?(url)
    end
160
+
161
+ # Extracts an integer out of a string describing a number
162
+ # e.g. "4K Comments" returns 4000
163
+ # e.g. "131 Shares" returns 131
164
+ def extract_int_from_num_element(element)
165
+ return unless element
166
+
167
+ if element.class != String # if an html element was passed in
168
+ element = element.text(:all)
169
+ end
170
+
171
+ num_pattern = /[0-9KM ,.]+/
172
+ interaction_num_text = num_pattern.match(element)[0]
173
+
174
+ if interaction_num_text.include?(".") # e.g. "2.2K"
175
+ interaction_num_text.to_i + interaction_num_text[-2].to_i * 100
176
+ elsif interaction_num_text.include?("K") # e.g. "13K"
177
+ interaction_num_text.to_i * 1000
178
+ elsif interaction_num_text.include?("M") # e.g. "13M"
179
+ interaction_num_text.to_i * 1_000_000
180
+ else # e.g. "15,443"
181
+ interaction_num_text.delete!(",")
182
+ interaction_num_text.delete(" ").to_i
183
+ end
184
+ end
185
+ end
186
+ end
187
+
188
+ require_relative "post_scraper"
189
+ require_relative "user_scraper"
@@ -0,0 +1,94 @@
1
+ require "typhoeus"
2
+
3
+ module Forki
4
+ class UserScraper < Scraper
5
+ # Finds and returns the number of people who like the current page
6
+ def find_number_of_likes
7
+ likes_pattern = /[0-9,.KM ] people like this/
8
+ number_of_likes_elem = all("span").filter { | span| likes_pattern.match? span.text }.first
9
+ extract_int_from_num_element(number_of_likes_elem)
10
+ end
11
+
12
+ # Finds and returns the number of people who follow the current page
13
+ def find_number_of_followers(profile_details_string)
14
+ followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]) people/
15
+ alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
16
+ number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
17
+ return nil if number_of_followers_match.nil?
18
+ extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
19
+ end
20
+
21
+ def find_number_followers_for_normal_profile(profile_followers_node)
22
+ followers_string = profile_followers_node["node"]["timeline_context_item"]["renderer"]["context_item"]["title"]["text"]
23
+ followers_pattern = /[0-9,]+/
24
+ number_of_followers_match = followers_pattern.match(followers_string).to_s
25
+ extract_int_from_num_element(number_of_followers_match)
26
+ end
27
+
28
    # Returns a hash of details about a Facebook user profile
    # Parses the profile_header GraphQL payload; when the follower count isn't
    # in the header string, falls back to digging it out of the profile tile
    # sections.
    def extract_profile_details(graphql_strings)
      profile_header_str = graphql_strings.find { |gql| gql.include? "profile_header_renderer" }
      profile_intro_str = graphql_strings.find { |g| g.include? "profile_intro_card" }
      profile_header_obj = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]
      profile_intro_obj = profile_intro_str ? JSON.parse(profile_intro_str) : nil

      number_of_followers = find_number_of_followers(profile_header_str)

      # Check if the user shows followers count
      if number_of_followers.nil?
        profile_title_section = graphql_strings.find { |gql| gql.include? "profile_tile_section_type" }

        json = JSON.parse(profile_title_section)
        # Select the tile item whose context type marks it as the follower count
        followers_node = json["user"]["profile_tile_sections"]["edges"].first["node"]["profile_tile_views"]["nodes"][1]["view_style_renderer"]["view"]["profile_tile_items"]["nodes"].select do |node|
          node["node"]["timeline_context_item"]["timeline_context_list_item_type"] == "INTRO_CARD_FOLLOWERS"
        end
        if followers_node.empty?
          number_of_followers = nil
        else
          number_of_followers = find_number_followers_for_normal_profile(followers_node.first)
        end
      end

      {
        id: profile_header_obj["user"]["id"],
        number_of_followers: number_of_followers,
        name: profile_header_obj["user"]["name"],
        verified: profile_header_obj["user"]["is_verified"],
        profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
        profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
      }
    end
61
+
62
    # Returns a hash of details about a Facebook page
    # Combines two GraphQL objects: the "about" card (id, blurb, counts) and the
    # viewer page object (verification status, profile photo).
    def extract_page_details(graphql_strings)
      page_cards_string = graphql_strings.find { |graphql_string| (graphql_string.include? "comet_page_cards") && \
        (graphql_string.include? "follower_count")}
      page_cards_list = JSON.parse(page_cards_string)["page"]["comet_page_cards"]
      page_about_card = page_cards_list.find { |card| card["__typename"] == "CometPageAboutCardWithoutMapRenderer" }
      viewer_page_object = JSON.parse(graphql_strings.find { |graphql_string| (graphql_string.include? "profile_photo") && \
        graphql_string.include?("is_verified") })
      {
        id: page_about_card["page"]["id"],
        profile: page_about_card["page"]["page_about_fields"]["blurb"],
        number_of_followers: page_about_card["page"]["follower_count"],
        name: page_about_card["page"]["name"],
        verified: viewer_page_object["page"]["is_verified"],
        profile_image_url: viewer_page_object["page"]["profile_picture"]["uri"],
        number_of_likes: page_about_card["page"]["page_likers"]["global_likers_count"],
      }
    end
80
+
81
+ # Uses GraphQL data and DOM elements to collect information about the current user page
82
+ def parse(url)
83
+ validate_and_load_page(url)
84
+ graphql_strings = find_graphql_data_strings(page.html)
85
+ is_page = graphql_strings.map { |s| JSON.parse(s) }.any? { |o| o.key?("page") }
86
+ user_details = is_page ? extract_page_details(graphql_strings) : extract_profile_details(graphql_strings)
87
+
88
+ user_details[:profile_image_file] = Forki.retrieve_media(user_details[:profile_image_url])
89
+ user_details[:profile_link] = url
90
+
91
+ user_details
92
+ end
93
+ end
94
+ end