forki 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,360 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "typhoeus"
4
+ require "securerandom"
5
+ require "byebug"
6
+
7
+ module Forki
8
+ # rubocop:disable Metrics/ClassLength
9
+ class PostScraper < Scraper
10
# Searches the DOM to find the number of times a (video) post has been viewed.
# Returns nil if no DOM element containing a view count can be located.
def find_number_of_views
  views_pattern = /[0-9MK, ]+Views/
  # Scan every <span> for text that looks like a view counter
  views_span = all("span").find { |span| span.text(:all) =~ views_pattern }
  extract_int_from_num_element(views_span)
end
19
+
20
# Dispatches to the appropriate extractor based on what kind of post the
# GraphQL payload describes (video, video-in-comment-stream, or image).
# Raises ContentUnavailableError for deleted/blocked posts and
# UnhandledContentError when no extractor matches.
def extract_post_data(graphql_strings)
  # Bail out of the post otherwise scraping gets stuck
  raise ContentUnavailableError unless is_post_available?

  parsed_objects = get_graphql_objects(graphql_strings)
  has_video = check_if_post_is_video(parsed_objects)
  has_image = check_if_post_is_image(parsed_objects)

  # There's a chance it may be embedded in a comment chain like this:
  # https://www.facebook.com/PlandemicMovie/posts/588866298398729/
  has_video_in_comment_stream = has_video == false && check_if_post_is_in_comment_stream(parsed_objects)

  return extract_video_post_data(graphql_strings) if has_video
  return extract_video_comment_post_data(parsed_objects) if has_video_in_comment_stream
  return extract_image_post_data(parsed_objects) if has_image

  raise UnhandledContentError
end
42
+
43
# Parses every raw GraphQL JSON string into a Ruby object.
def get_graphql_objects(graphql_strings)
  graphql_strings.map { |raw_json| JSON.parse(raw_json) }
end
46
+
47
# Returns true when any GraphQL object carries a video marker key
# ("is_live_streaming" or "video") or describes a Reel.
def check_if_post_is_video(graphql_objects)
  graphql_objects.any? do |obj|
    obj.key?("is_live_streaming") || obj.key?("video") || check_if_post_is_reel(obj)
  end
end
50
+
51
# Returns true when the GraphQL object describes a Facebook Reel
# (i.e. its first attachment's style_infos mentions "fb_shorts_story").
def check_if_post_is_reel(graphql_object)
  return false unless graphql_object.key?("node")

  style_infos =
    begin
      graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
        .first["styles"]["attachment"]["style_infos"].first
    rescue NoMethodError
      # The object doesn't match the attribute chain above — some link was nil
      return false
    end

  style_infos.include?("fb_shorts_story")
end
62
+
63
# Returns true when any GraphQL object carries a non-nil "image" or "currMedia"
# value, i.e. the post is an image post.
#
# FIX: the original block evaluated the "image" check but discarded its value
# (only the last expression of a block is its result), so objects keyed by
# "image" alone were never detected. Both keys are now honored.
def check_if_post_is_image(graphql_objects)
  graphql_objects.any? do |graphql_object|
    !graphql_object.fetch("image", nil).nil? || !graphql_object.fetch("currMedia", nil).nil?
  end
end
69
+
70
# Returns true when the post's video is embedded in a comment stream:
# the first node's first attachment must have media of __typename "Video".
def check_if_post_is_in_comment_stream(graphql_objects)
  graphql_objects.each do |obj|
    next unless obj.key?("nodes")

    media_type =
      begin
        obj["nodes"].first["comet_sections"]["content"]["story"]["attachments"]
          .first["styles"]["attachment"]["media"]["__typename"]
      rescue StandardError
        # The structure is so specific that checking every link is not worth it;
        # any failure along the chain just means "not this object"
        next
      end

    return true if media_type == "Video"
  end

  false
end
86
+
87
# Returns true when the post can still be viewed, false when Facebook shows an
# unavailability banner. Probes the DOM for two known banner texts; each
# Capybara `find` waits up to 5 seconds, so a *healthy* post pays up to ~10s
# here before we conclude it is available.
def is_post_available?
  begin
    # Banner variant 1: "...content isn't available..."
    find("span", wait: 5, text: "content isn't available", exact_text: false)
  rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
    begin
      # Banner variant 2: "This Video Isn't Available Anymore"
      find("span", wait: 5, text: "This Video Isn't Available Anymore", exact_text: false)
    rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
      # Neither banner found — the post appears to be up
      return true
    end
  end

  # One of the banners was found, so the post is gone
  false
end
101
+
102
# Extracts data for a video post that is embedded in a comment stream
# (e.g. https://www.facebook.com/PlandemicMovie/posts/588866298398729/).
# Takes parsed GraphQL hashes and returns a hash of post attributes, including
# downloaded media files via Forki.retrieve_media.
def extract_video_comment_post_data(graphql_objects)
  graphql_nodes = nil
  # Grab the "nodes" list from the first GraphQL object that has one
  graphql_objects.find do |graphql_object|
    next unless graphql_object.key?("nodes")
    graphql_nodes = graphql_object["nodes"]

    break
  end

  # Navigate Facebook's deeply nested GraphQL schema to the media and feedback objects
  media = graphql_nodes.first["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
  inital_feedback_object = graphql_nodes.first["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
  feedback_object = inital_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]

  post_details = {
    id: media["id"],
    num_comments: feedback_object["comment_count"]["total_count"],
    num_shares: feedback_object["share_count"]["count"],
    num_views: feedback_object["video_view_count"],
    reshare_warning: feedback_object["should_show_reshare_warning"],
    video_preview_image_url: media["preferred_thumbnail"]["image"]["uri"],
    # Prefer the HD rendition when Facebook provides one
    video_url: media["playable_url_quality_hd"] || media["playable_url"],
    text: graphql_nodes.first["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"],
    created_at: media["publish_time"],
    profile_link: graphql_nodes.first["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"].first["url"],
    has_video: true
  }

  # Download the preview image and the video itself
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # NOTE(review): reactions here use "i18n_reaction_count" (a localized count),
  # unlike the other extractors which build a per-emoji hash — confirm intended
  post_details[:reactions] = inital_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["i18n_reaction_count"]
  post_details
end
134
+
135
# Unfortunately, there's a taxonomy of video post types, all of which require different parsing methods.
# Specifically, there are normal video posts, video posts from the watch page, and live video posts from the watch page.
# The general strategy for extracting information from each type, though, is to find which of the 30-odd
# GraphQL strings are relevant, parse them into hashes, and extract the information we need.
def extract_video_post_data(graphql_strings)
  # "Watch page" videos use a completely different layout
  return extract_video_post_data_from_watch_page(graphql_strings) if all("h1").any? { |h1| h1.text.strip == "Watch" }

  graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
  story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
  story_node_object ||= graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video

  return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?

  attachment = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]
  if attachment.key?("media")
    video_object = attachment["media"]
    creation_date = video_object["publish_time"]
  elsif attachment.key?("style_infos")
    # "Reels" carry their video under style_infos and need a separate path
    shorts_story = attachment["style_infos"].first["fb_shorts_story"]
    video_object = shorts_story["short_form_video_context"]["playback_video"]
    creation_date = shorts_story["creation_time"]
  else
    raise "Unable to parse video object"
  end

  feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
  reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
  share_count_object = feedback_object.fetch("share_count", {})

  if story_node_object["comet_sections"]["content"]["story"]["comet_sections"].key? "message"
    text = story_node_object["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"]
  else
    text = ""
  end

  # FIX: removed a stray, unguarded lookup of feedback_object["comment_list_renderer"][...]
  # whose result was discarded — it raised NoMethodError whenever that key was absent,
  # defeating the has_key? guard below.
  num_comments = feedback_object.has_key?("comment_list_renderer") ? feedback_object["comment_list_renderer"]["feedback"]["comment_count"]["total_count"] : feedback_object["comment_count"]["total_count"]

  post_details = {
    id: video_object["id"],
    num_comments: num_comments,
    num_shares: share_count_object.fetch("count", nil),
    num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
    reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
    video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
    video_url: video_object["playable_url_quality_hd"] || video_object["playable_url"],
    text: text,
    created_at: creation_date,
    profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
    has_video: true
  }
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
193
+
194
# Fallback video extractor used when no "node"/"nodes" story object exists:
# pulls everything from the "tahoe_sidepane_renderer" object instead.
def extract_video_post_data_alternative(graphql_object_array)
  sidepane_object = graphql_object_array.find { |graphql_object| graphql_object.key?("tahoe_sidepane_renderer") }
  # The video object is the GraphQL hash whose ONLY key is "video"
  video_object = graphql_object_array.find { |graphql_object| graphql_object.keys == ["video"] }
  feedback_object = sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]
  reaction_counts = extract_reaction_counts(sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
  share_count_object = feedback_object.fetch("share_count", {})

  post_details = {
    # NOTE(review): video_object's only key is "video", so video_object["id"]
    # looks like it is always nil — should this be video_object["video"]["id"]? Confirm.
    id: video_object["id"],
    num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
    num_shares: share_count_object.fetch("count", nil),
    num_views: feedback_object["video_view_count"],
    reshare_warning: feedback_object["should_show_reshare_warning"],
    video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
    video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
    text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
    created_at: video_object["video"]["publish_time"],
    profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
    has_video: true
  }

  # Download media referenced by the post
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
220
+
221
# Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above.
# Returns a hash of post attributes, including the downloaded image file.
def extract_image_post_data(graphql_object_array)
  # FIX: removed a stray `graphql_object_array.find { ... viewer_actor ... }`
  # whose result was discarded — it had no effect.
  curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
  creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }

  feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
  share_count_object = feedback_object.fetch("share_count", {})

  poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]

  reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
  post_details = {
    id: curr_media_object["currMedia"]["id"],
    num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
    num_shares: share_count_object.fetch("count", nil),
    reshare_warning: feedback_object["should_show_reshare_warning"],
    image_url: curr_media_object["currMedia"]["image"]["uri"],
    text: (creation_story_object["message"] || {}).fetch("text", nil),
    profile_link: poster["url"],
    created_at: curr_media_object["currMedia"]["created_time"],
    has_video: false
  }
  post_details[:image_file] = Forki.retrieve_media(post_details[:image_url])
  post_details[:reactions] = reaction_counts
  post_details
end
248
+
249
# Extract data from a non-live video post on the watch page.
# Live videos are delegated to extract_live_video_post_data_from_watch_page.
def extract_video_post_data_from_watch_page(graphql_strings)
  return extract_live_video_post_data_from_watch_page(graphql_strings) if current_url.include?("live")
  # The "video" object and the "creation_story" object live in different GraphQL strings
  video_object = graphql_strings.map { |g| JSON.parse(g) }.find { |x| x.key?("video") }
  creation_story_object = JSON.parse(graphql_strings.find { |graphql_string| (graphql_string.include?("creation_story")) && \
    (graphql_string.include?("live_status")) })
  # Facebook escapes URLs with backslashes inside the embedded JSON; strip them
  video_permalink = creation_story_object["creation_story"]["shareable"]["url"].delete("\\")
  media_object = video_object["video"]["story"]["attachments"][0]["media"]
  reaction_counts = extract_reaction_counts(creation_story_object["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])

  post_details = {
    id: video_object["id"],
    num_comments: creation_story_object["feedback"]["total_comment_count"],
    num_shares: nil, # Not present for watch feed videos?
    num_views: creation_story_object["feedback"]["video_view_count_renderer"]["feedback"]["video_view_count"],
    reshare_warning: creation_story_object["feedback"]["should_show_reshare_warning"],
    video_preview_image_url: video_object["video"]["story"]["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
    video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"),
    text: (creation_story_object["creation_story"]["message"] || {})["text"],
    created_at: video_object["video"]["story"]["attachments"][0]["media"]["publish_time"],
    # Keep everything up to and including "/videos" as the profile link
    profile_link: video_permalink[..video_permalink.index("/videos")],
    has_video: true
  }

  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
278
+
279
# Extract data from a live video post on the watch page.
def extract_live_video_post_data_from_watch_page(graphql_strings)
  # The creation story and the playable media come from two different GraphQL strings
  creation_story_object = JSON.parse(graphql_strings.find { |graphql| (graphql.include? "comment_count") && \
    (graphql.include? "creation_story") })["video"]["creation_story"]
  media_object = JSON.parse(graphql_strings.find { |graphql| graphql.include? "playable_url" })["video"]["creation_story"]["attachments"][0]["media"]
  # Strip the backslash-escaping Facebook applies to URLs in embedded JSON
  video_permalink = creation_story_object["shareable"]["url"].delete("\\")
  reaction_counts = extract_reaction_counts(creation_story_object["feedback_context"]["feedback_target_with_context"]["cannot_see_top_custom_reactions"]["top_reactions"])

  post_details = {
    id: creation_story_object["shareable"]["id"],
    num_comments: creation_story_object["feedback_context"]["feedback_target_with_context"]["total_comment_count"],
    num_shares: nil,
    num_views: find_number_of_views, # as far as I can tell, this is never present for live videos
    reshare_warning: creation_story_object["feedback_context"]["feedback_target_with_context"]["should_show_reshare_warning"],
    video_preview_image_url: creation_story_object["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
    video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"),
    text: creation_story_object["attachments"][0]["media"]["savable_description"]["text"],
    created_at: creation_story_object["attachments"][0]["media"]["publish_time"],
    # Keep everything up to and including "/videos" as the profile link
    profile_link: video_permalink[..video_permalink.index("/videos")],
    has_video: true
  }

  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
306
+
307
# Returns a hash containing counts of each reaction to a post.
# Each edge becomes a single-entry hash like {num_likes: 1234}; the hashes are
# then merged together. Returns nil when the edges list is empty (inject on []).
def extract_reaction_counts(reactions_object)
  singletons = reactions_object["edges"].map do |edge|
    key = "num_#{edge["node"]["localized_name"].downcase}s".to_sym
    { key => edge["reaction_count"] }
  end
  singletons.inject { |emoji_counts, single| emoji_counts.merge(single) }
end
317
+
318
# Captures a screenshot of the current post and returns the saved file path.
def take_screenshot
  # First check whether the post has a fact-check overlay; if it does, clear it
  begin
    find('div[aria-label=" See Photo "]').click() || find('div[aria-label=" See Video "]').click()
  rescue Capybara::ElementNotFound
    # No overlay to dismiss — nothing to do
  end

  screenshot_path = "#{Forki.temp_storage_location}/facebook_screenshot_#{SecureRandom.uuid}.png"
  save_screenshot(screenshot_path)
end
328
+
329
# Uses GraphQL data and DOM elements to collect information about the current post.
# Returns the post-details hash, or nil if a Net::ReadTimeout escapes the retry loop.
def parse(url)
  validate_and_load_page(url)
  graphql_strings = find_graphql_data_strings(page.html)
  post_data = extract_post_data(graphql_strings)
  post_data[:url] = url
  user_url = post_data[:profile_link]

  # Screenshots occasionally time out; retry up to 5 times with a pause between attempts
  5.times do
    begin
      post_data[:screenshot_file] = take_screenshot
      break
    rescue Net::ReadTimeout; end

    sleep(5)
  end

  # page.quit # Close browser between page navigations to prevent cache folder access issues

  post_data[:user] = User.lookup(user_url).first
  page.quit

  post_data
rescue Net::ReadTimeout
  # Eat it?
  # NOTE(review): swallowing this makes parse return nil — confirm callers handle a nil result
rescue StandardError => e
  # NOTE(review): rescue-then-raise is a no-op passthrough; presumably kept as a debugging hook
  raise e
ensure
  # NOTE(review): on the happy path this is a second quit after the one above — confirm it is idempotent
  page.quit
end
359
+ end
360
+ end
@@ -0,0 +1,189 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require_relative "user_scraper"
4
+ require "capybara/dsl"
5
+ require "dotenv/load"
6
+ require "oj"
7
+ require "selenium-webdriver"
8
+ require "open-uri"
9
+
10
# Chrome options tuned to reduce automation fingerprinting while scraping.
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# FIX: this flag previously started with a Unicode en dash ("–-"), so Chrome
# never received --disable-blink-features=AutomationControlled
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
# FIX: removed a duplicated --disable-dev-shm-usage (already added above)
options.add_argument("--remote-debugging-port=9222")
# Fresh profile directory per run so sessions don't collide
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_forki do |app|
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.default_max_wait_time = 60
Capybara.threadsafe = true
Capybara.reuse_server = true
31
+
32
+ module Forki
33
+ class Scraper # rubocop:disable Metrics/ClassLength
34
+ include Capybara::DSL
35
+
36
# Configures Capybara to use the registered :selenium_forki driver and applies
# the gem's logger level. Order matters: driver selection is global state.
def initialize
  Capybara.default_driver = :selenium_forki
  Forki.set_logger_level
  # reset_selenium
end
41
+
42
# Downloads the image referenced by an <img> element's src attribute and writes
# it to the scratch directory.
# NOTE(review): the original comment said to use the tmp/ directory created
# during setup, but the code writes "temp/emoji.png" — confirm which exists.
def download_image(img_elem)
  # FIX: use the block form so the HTTP stream is closed; the original leaked
  # the IO handle returned by URI.open
  img_data = URI.open(img_elem["src"], &:read)
  File.binwrite("temp/emoji.png", img_data)
end
47
+
48
# Returns all GraphQL data objects embedded within a string.
# Finds substrings that look like '"data":{...}' and collects the JSON text of
# each (prefix stripped), recursing on the remainder of the page source.
# NOTE: the optional accumulator comes *before* the required html_str — legal
# Ruby, but call sites pass only the string.
def find_graphql_data_strings(objs = [], html_str)
  data_marker = '"data":{'
  data_start_index = html_str.index(data_marker)
  return objs if data_start_index.nil? # No more data blocks in the page source

  # Index one past the matching closing brace of this data object
  data_closure_index = find_graphql_data_closure_index(html_str, data_start_index)
  return objs if data_closure_index.nil?

  # Keep only the {...} part, dropping the '"data":' prefix
  graphql_data_str = html_str[data_start_index...data_closure_index].delete_prefix('"data":')
  # Recurse on the rest of the document for further data blocks
  objs + [graphql_data_str] + find_graphql_data_strings(html_str[data_closure_index..])
end
61
+
62
# Scans forward from the '"data":{' marker at start_index and returns the index
# one past the brace that closes the data object, tracking nested braces.
# Raises when the object is truncated/malformed.
def find_graphql_data_closure_index(html_str, start_index)
  closure_index = start_index + 8 # length of data marker. Begin search right after open brace
  raise "Malformed graphql data object: no closing bracket found" if closure_index > html_str.length

  brace_stack = 1 # the marker's own open brace
  loop do
    # FIX: guard against running off the end of the string — the original looped
    # forever when the data object was truncated (nil never matched "{" or "}")
    raise "Malformed graphql data object: no closing bracket found" if closure_index >= html_str.length

    if html_str[closure_index] == "{"
      brace_stack += 1
    elsif html_str[closure_index] == "}"
      brace_stack -= 1
    end

    closure_index += 1
    break if brace_stack.zero?
  end

  closure_index
end
80
+
81
+ private
82
+
83
##########
# Re-registers the :selenium_forki driver with a brand-new user data folder in
# the options, then switches the current driver to it.
# #####################
def reset_selenium
  options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
  options.add_argument("--start-maximized")
  options.add_argument("--no-sandbox")
  options.add_argument("--disable-dev-shm-usage")
  # FIX: this flag previously started with a Unicode en dash ("–-"), so Chrome
  # never received --disable-blink-features=AutomationControlled
  options.add_argument("--disable-blink-features=AutomationControlled")
  options.add_argument("--disable-extensions")
  options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
  options.add_preference "password_manager_enabled", false
  # FIX: removed a duplicated --disable-dev-shm-usage (already added above)
  options.add_argument("--remote-debugging-port=9222")
  # Fresh profile directory so the new session doesn't collide with the old one
  options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

  Capybara.register_driver :selenium_forki do |app|
    client = Selenium::WebDriver::Remote::Http::Default.new
    client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
    Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
  end

  Capybara.current_driver = :selenium_forki
end
107
+
108
# Logs in to Facebook (if not already logged in) using credentials from the
# FACEBOOK_EMAIL / FACEBOOK_PASSWORD environment variables.
# Raises MissingCredentialsError when they are unset, and BlockedCredentialsError
# when Facebook shows its error box after submitting.
def login(url = nil)
  raise MissingCredentialsError if ENV["FACEBOOK_EMAIL"].nil? || ENV["FACEBOOK_PASSWORD"].nil?

  url ||= "https://www.facebook.com"
  visit(url) # Visit the url passed in or the facebook homepage if nothing is

  # Look for "login_form" box, which throws an error if not found. So we catch it and run the rest of the tests
  begin
    login_form = first(id: "login_form", wait: 5)
  rescue Capybara::ElementNotFound
    # No login form and the title isn't the login page — we're already logged in
    return unless page.title.downcase.include?("facebook - log in")
  end

  # Since we're not logged in, let's do that quick
  # NOTE(review): when the rescue path above falls through, login_form is still
  # nil after this visit, so the fill_in calls below would raise — confirm this
  # branch is reachable in practice
  visit("https://www.facebook.com") if login_form.nil?

  login_form.fill_in("email", with: ENV["FACEBOOK_EMAIL"])
  login_form.fill_in("pass", with: ENV["FACEBOOK_PASSWORD"])

  # This is a pain because some pages just `click_button` would work, but some won't
  login_buttons = login_form.all("div", text: "Log In", wait: 5)

  if login_buttons.empty?
    login_form.click_button("Log In")
  else
    # Click the first div whose visible text is exactly "Log In"
    login_buttons.each do |button|
      if button.text == "Log In"
        button.click
        break
      end
    end
  end

  # Facebook shows an #error_box element when the account is blocked/challenged
  begin
    raise Forki::BlockedCredentialsError if find_by_id("error_box", wait: 3)
  rescue Capybara::ElementNotFound; end

  # Now we wait awhile, hopefully to slow down scraping
  sleep(rand * 10.3)
end
149
+
150
# Ensures that a valid Facebook url has been provided, and that it points to an available post.
# If either of those two conditions are false, raises an exception.
# NOTE(review): login(url) visits the url *before* the InvalidUrlError check
# below runs, so a non-Facebook url is loaded once before being rejected — confirm intended.
def validate_and_load_page(url)
  Capybara.app_host = "https://www.facebook.com"
  facebook_url = "https://www.facebook.com"
  # visit "https://www.facebook.com" unless current_url.start_with?(facebook_url)
  login(url)
  raise Forki::InvalidUrlError unless url.start_with?(facebook_url)
  # Avoid a redundant navigation if login already landed us on the target url
  visit url unless current_url.start_with?(url)
end
160
+
161
+ # Extracts an integer out of a string describing a number
162
+ # e.g. "4K Comments" returns 4000
163
+ # e.g. "131 Shares" returns 131
164
+ def extract_int_from_num_element(element)
165
+ return unless element
166
+
167
+ if element.class != String # if an html element was passed in
168
+ element = element.text(:all)
169
+ end
170
+
171
+ num_pattern = /[0-9KM ,.]+/
172
+ interaction_num_text = num_pattern.match(element)[0]
173
+
174
+ if interaction_num_text.include?(".") # e.g. "2.2K"
175
+ interaction_num_text.to_i + interaction_num_text[-2].to_i * 100
176
+ elsif interaction_num_text.include?("K") # e.g. "13K"
177
+ interaction_num_text.to_i * 1000
178
+ elsif interaction_num_text.include?("M") # e.g. "13M"
179
+ interaction_num_text.to_i * 1_000_000
180
+ else # e.g. "15,443"
181
+ interaction_num_text.delete!(",")
182
+ interaction_num_text.delete(" ").to_i
183
+ end
184
+ end
185
+ end
186
+ end
187
+
188
+ require_relative "post_scraper"
189
+ require_relative "user_scraper"
@@ -0,0 +1,94 @@
1
+ require "typhoeus"
2
+
3
+ module Forki
4
+ class UserScraper < Scraper
5
# Finds and returns the number of people who like the current page.
def find_number_of_likes
  likes_pattern = /[0-9,.KM ] people like this/
  # Pick the first <span> whose text matches the likes blurb
  likes_span = all("span").find { |span| likes_pattern.match? span.text }
  extract_int_from_num_element(likes_span)
end
11
+
12
# Finds and returns the number of people who follow the current page.
# Returns nil when neither followers pattern matches the details string.
def find_number_of_followers(profile_details_string)
  # FIX: the capture group needs the "+" quantifier — without it this pattern
  # could only ever match a single-character follower count (e.g. "4" but not "4,500")
  followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]+) people/
  alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
  number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
  return nil if number_of_followers_match.nil?

  extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
end
20
+
21
# Extracts the follower count from a profile's timeline context item node
# (the title text carries the number, e.g. "1,234 followers").
def find_number_followers_for_normal_profile(profile_followers_node)
  title_text = profile_followers_node["node"]["timeline_context_item"]["renderer"]["context_item"]["title"]["text"]
  digits = /[0-9,]+/.match(title_text).to_s
  extract_int_from_num_element(digits)
end
27
+
28
# Returns a hash of details about a Facebook user profile, assembled from the
# page's embedded GraphQL strings.
def extract_profile_details(graphql_strings)
  profile_header_str = graphql_strings.find { |gql| gql.include? "profile_header_renderer" }
  profile_intro_str = graphql_strings.find { |g| g.include? "profile_intro_card" }
  profile_header_obj = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]
  # The intro card (bio) is optional — not every profile has one
  profile_intro_obj = profile_intro_str ? JSON.parse(profile_intro_str) : nil

  number_of_followers = find_number_of_followers(profile_header_str)

  # Check if the user shows followers count
  if number_of_followers.nil?
    # Fall back to the profile tile section, which lists intro-card items
    profile_title_section = graphql_strings.find { |gql| gql.include? "profile_tile_section_type" }

    json = JSON.parse(profile_title_section)
    followers_node = json["user"]["profile_tile_sections"]["edges"].first["node"]["profile_tile_views"]["nodes"][1]["view_style_renderer"]["view"]["profile_tile_items"]["nodes"].select do |node|
      node["node"]["timeline_context_item"]["timeline_context_list_item_type"] == "INTRO_CARD_FOLLOWERS"
    end
    if followers_node.empty?
      number_of_followers = nil
    else
      number_of_followers = find_number_followers_for_normal_profile(followers_node.first)
    end
  end

  {
    id: profile_header_obj["user"]["id"],
    number_of_followers: number_of_followers,
    name: profile_header_obj["user"]["name"],
    verified: profile_header_obj["user"]["is_verified"],
    # Empty string (not nil) when the profile has no intro card
    profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
    profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
  }
end
61
+
62
# Returns a hash of details about a Facebook page, assembled from the embedded
# GraphQL strings (the about card plus the viewer/profile-photo object).
def extract_page_details(graphql_strings)
  cards_string = graphql_strings.find do |str|
    str.include?("comet_page_cards") && str.include?("follower_count")
  end
  cards = JSON.parse(cards_string)["page"]["comet_page_cards"]
  about_card = cards.find { |card| card["__typename"] == "CometPageAboutCardWithoutMapRenderer" }

  viewer_string = graphql_strings.find do |str|
    str.include?("profile_photo") && str.include?("is_verified")
  end
  viewer_page_object = JSON.parse(viewer_string)

  {
    id: about_card["page"]["id"],
    profile: about_card["page"]["page_about_fields"]["blurb"],
    number_of_followers: about_card["page"]["follower_count"],
    name: about_card["page"]["name"],
    verified: viewer_page_object["page"]["is_verified"],
    profile_image_url: viewer_page_object["page"]["profile_picture"]["uri"],
    number_of_likes: about_card["page"]["page_likers"]["global_likers_count"],
  }
end
80
+
81
# Uses GraphQL data and DOM elements to collect information about the current
# user page, handling both pages and personal profiles.
def parse(url)
  validate_and_load_page(url)
  gql_strings = find_graphql_data_strings(page.html)

  # Pages and personal profiles expose different GraphQL shapes
  parsed = gql_strings.map { |str| JSON.parse(str) }
  user_details =
    if parsed.any? { |obj| obj.key?("page") }
      extract_page_details(gql_strings)
    else
      extract_profile_details(gql_strings)
    end

  user_details[:profile_image_file] = Forki.retrieve_media(user_details[:profile_image_url])
  user_details[:profile_link] = url

  user_details
end
94
+ end