forki 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +18 -0
- data/.gitignore +17 -0
- data/.rubocop.yml +71 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +21 -0
- data/Gemfile.lock +163 -0
- data/LICENSE.txt +21 -0
- data/README.md +87 -0
- data/Rakefile +16 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/forki.gemspec +42 -0
- data/lib/forki/post.rb +61 -0
- data/lib/forki/scrapers/post_scraper.rb +360 -0
- data/lib/forki/scrapers/scraper.rb +189 -0
- data/lib/forki/scrapers/user_scraper.rb +94 -0
- data/lib/forki/user.rb +45 -0
- data/lib/forki/version.rb +5 -0
- data/lib/forki.rb +98 -0
- data/lib/generators/forki.rb +3 -0
- data/lib/generators/forki_generator.rb +6 -0
- data/lib/helpers/configuration.rb +28 -0
- data/reactions/.DS_Store +0 -0
- data/reactions/angry.png +0 -0
- data/reactions/care.png +0 -0
- data/reactions/haha.png +0 -0
- data/reactions/like.png +0 -0
- data/reactions/love.png +0 -0
- data/reactions/pride.png +0 -0
- data/reactions/sad.png +0 -0
- data/reactions/wow.png +0 -0
- metadata +146 -0
@@ -0,0 +1,360 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "typhoeus"
|
4
|
+
require "securerandom"
|
5
|
+
require "byebug"
|
6
|
+
|
7
|
+
module Forki
|
8
|
+
# rubocop:disable Metrics/ClassLength
|
9
|
+
class PostScraper < Scraper
|
10
|
+
# Searches the DOM for the number of times a (video) post has been viewed.
# Returns nil (via extract_int_from_num_element) when no span carries a view count.
def find_number_of_views
  views_pattern = /[0-9MK, ]+Views/
  views_span = all("span").find { |span| span.text(:all) =~ views_pattern }
  extract_int_from_num_element(views_span)
end
|
19
|
+
|
20
|
+
# Routes a post to the correct extractor based on whether the GraphQL payload
# describes a video, a video embedded in a comment stream, or an image.
# Raises ContentUnavailableError for unavailable posts and
# UnhandledContentError when no extractor matches.
def extract_post_data(graphql_strings)
  # Abort early on unavailable posts so the scraper doesn't get stuck
  raise ContentUnavailableError unless is_post_available?

  graphql_objects = get_graphql_objects(graphql_strings)
  has_video = check_if_post_is_video(graphql_objects)
  has_image = check_if_post_is_image(graphql_objects)

  # A video may be embedded in a comment chain like this:
  # https://www.facebook.com/PlandemicMovie/posts/588866298398729/
  has_video_in_comment_stream = has_video == false ? check_if_post_is_in_comment_stream(graphql_objects) : nil

  return extract_video_post_data(graphql_strings) if has_video
  return extract_video_comment_post_data(graphql_objects) if has_video_in_comment_stream
  return extract_image_post_data(graphql_objects) if has_image

  raise UnhandledContentError
end
|
42
|
+
|
43
|
+
# Parses each raw GraphQL JSON string into a Ruby object.
def get_graphql_objects(graphql_strings)
  graphql_strings.map { |raw_json| JSON.parse(raw_json) }
end
|
46
|
+
|
47
|
+
# True when any GraphQL object carries a video marker: an "is_live_streaming"
# key, a "video" key, or a Reel-shaped payload.
def check_if_post_is_video(graphql_objects)
  graphql_objects.any? do |graphql_object|
    graphql_object.key?("is_live_streaming") ||
      graphql_object.key?("video") ||
      check_if_post_is_reel(graphql_object)
  end
end
|
50
|
+
|
51
|
+
# True when the GraphQL object's "node" payload is shaped like a Facebook
# Reel (its first attachment has style_infos containing "fb_shorts_story").
def check_if_post_is_reel(graphql_object)
  return false unless graphql_object.key?("node")

  begin
    attachment = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
                 .first["styles"]["attachment"]
    style_infos = attachment["style_infos"].first
  rescue NoMethodError # a missing link in the chain above leaves nil mid-chain
    return false
  end

  style_infos.include?("fb_shorts_story")
end
|
62
|
+
|
63
|
+
# True when any GraphQL object carries a non-nil "image" or "currMedia"
# top-level key (the markers present for image posts).
#
# BUG FIX: the original block evaluated `true unless ...image...` and then
# `true unless ...currMedia...` back to back; only the last expression was the
# block's return value, so a post exposing only an "image" key was never
# detected. Both checks now contribute to the block's result.
def check_if_post_is_image(graphql_objects)
  graphql_objects.any? do |graphql_object|
    !graphql_object.fetch("image", nil).nil? || !graphql_object.fetch("currMedia", nil).nil?
  end
end
|
69
|
+
|
70
|
+
# True when a "nodes" GraphQL payload exists whose first attachment's media
# has __typename "Video" — i.e. a video embedded in a comment stream.
def check_if_post_is_in_comment_stream(graphql_objects)
  graphql_objects.each do |graphql_object|
    next unless graphql_object.key?("nodes")

    begin
      media_type = graphql_object["nodes"].first["comet_sections"]["content"]["story"]["attachments"]
                   .first["styles"]["attachment"]["media"]["__typename"]
    rescue StandardError
      # The structure is so specific that checking each link would be a lot;
      # any failure along the chain just means "not this object"
      next
    end

    return true if media_type == "Video"
  end

  false
end
|
86
|
+
|
87
|
+
# Checks the DOM for Facebook's "unavailable content" banners.
# Returns false when either banner text is found, true otherwise.
# (Name kept with the is_ prefix for caller compatibility.)
def is_post_available?
  unavailable_markers = ["content isn't available", "This Video Isn't Available Anymore"]
  unavailable_markers.each do |marker|
    begin
      find("span", wait: 5, text: marker, exact_text: false)
      return false # banner found: the post is gone
    rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
      next
    end
  end

  true
end
|
101
|
+
|
102
|
+
# Extracts details for a video post embedded in a comment stream
# (e.g. https://www.facebook.com/PlandemicMovie/posts/588866298398729/).
# Digs the media and feedback objects out of the first "nodes" GraphQL
# payload, downloads the video and its preview image via Forki.retrieve_media,
# and returns a hash of post attributes.
def extract_video_comment_post_data(graphql_objects)
  graphql_nodes = nil
  # Grab the "nodes" list from the first GraphQL object that has one.
  # NOTE(review): if no object has "nodes", graphql_nodes stays nil and the
  # chains below raise NoMethodError — confirm callers guard against this.
  graphql_objects.find do |graphql_object|
    next unless graphql_object.key?("nodes")
    graphql_nodes = graphql_object["nodes"]

    break
  end

  media = graphql_nodes.first["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
  inital_feedback_object = graphql_nodes.first["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
  feedback_object = inital_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]

  post_details = {
    id: media["id"],
    num_comments: feedback_object["comment_count"]["total_count"],
    num_shares: feedback_object["share_count"]["count"],
    num_views: feedback_object["video_view_count"],
    reshare_warning: feedback_object["should_show_reshare_warning"],
    video_preview_image_url: media["preferred_thumbnail"]["image"]["uri"],
    # Prefer the HD rendition when Facebook provides one
    video_url: media["playable_url_quality_hd"] || media["playable_url"],
    text: graphql_nodes.first["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"],
    created_at: media["publish_time"],
    profile_link: graphql_nodes.first["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"].first["url"],
    has_video: true
  }

  # Download media locally and attach reaction counts before returning
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = inital_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["i18n_reaction_count"]
  post_details
end
|
134
|
+
|
135
|
+
# Unfortunately, there's a taxonomy of video post types, all of which require different parsing methods.
# Specifically: normal video posts, video posts from the watch page, and live video posts from the watch page.
# The general strategy for each type is the same: find which of the 30-odd GraphQL strings are relevant,
# parse them into hashes, and extract the information we need. Returns a post-details hash.
def extract_video_post_data(graphql_strings)
  # "Watch page" videos carry a "Watch" h1 header and use a different payload shape
  unless all("h1").find { |h1| h1.text.strip == "Watch" }.nil?
    return extract_video_post_data_from_watch_page(graphql_strings)
  end

  graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
  story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
  story_node_object ||= graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video

  return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?

  # Hoist the attachment lookup every branch below repeats
  attachment = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]
  if attachment.key?("media")
    video_object = attachment["media"]
    creation_date = video_object["publish_time"]
  elsif attachment.key?("style_infos")
    # "Reels" package the video under style_infos instead of media
    video_object = attachment["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
    creation_date = attachment["style_infos"].first["fb_shorts_story"]["creation_time"]
  else
    raise "Unable to parse video object"
  end

  feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
  reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
  share_count_object = feedback_object.fetch("share_count", {})

  comet_sections = story_node_object["comet_sections"]["content"]["story"]["comet_sections"]
  text = comet_sections.key?("message") ? comet_sections["message"]["story"]["message"]["text"] : ""

  # BUG FIX: the original evaluated the comment_list_renderer chain
  # unconditionally (discarding the result) right before this conditional,
  # which raised NoMethodError whenever "comment_list_renderer" was absent
  # and made the fallback branch unreachable.
  num_comments = if feedback_object.key?("comment_list_renderer")
    feedback_object["comment_list_renderer"]["feedback"]["comment_count"]["total_count"]
  else
    feedback_object["comment_count"]["total_count"]
  end

  post_details = {
    id: video_object["id"],
    num_comments: num_comments,
    num_shares: share_count_object.fetch("count", nil),
    num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
    reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
    video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
    # Prefer the HD rendition when available
    video_url: video_object["playable_url_quality_hd"] || video_object["playable_url"],
    text: text,
    created_at: creation_date,
    profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
    has_video: true
  }
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
|
193
|
+
|
194
|
+
# Fallback video extractor used when no "node"/"nodes" story payload exists.
# Pulls details from the "tahoe_sidepane_renderer" and bare "video" GraphQL
# objects instead, and returns the same post-details hash shape as
# extract_video_post_data.
def extract_video_post_data_alternative(graphql_object_array)
  sidepane_object = graphql_object_array.find { |graphql_object| graphql_object.key?("tahoe_sidepane_renderer") }
  # NOTE(review): `keys == ["video"]` requires "video" to be the ONLY key —
  # confirm payloads never carry extra sibling keys here.
  video_object = graphql_object_array.find { |graphql_object| graphql_object.keys == ["video"] }
  feedback_object = sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]
  reaction_counts = extract_reaction_counts(sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
  share_count_object = feedback_object.fetch("share_count", {})

  post_details = {
    id: video_object["id"],
    num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
    num_shares: share_count_object.fetch("count", nil),
    num_views: feedback_object["video_view_count"],
    reshare_warning: feedback_object["should_show_reshare_warning"],
    video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
    # Prefer the HD rendition when available
    video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
    text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
    created_at: video_object["video"]["publish_time"],
    profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
    has_video: true
  }

  # Download media locally and attach reaction counts before returning
  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
|
220
|
+
|
221
|
+
# Extracts data from an image post by parsing GraphQL strings as seen in the
# video post scraper above. Downloads the image via Forki.retrieve_media and
# returns a post-details hash (has_video: false).
#
# CLEANUP: removed a dead `graphql_object_array.find { ... viewer_actor ... }`
# expression whose result was never assigned or used.
def extract_image_post_data(graphql_object_array)
  curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
  creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }

  feedback_object = graphql_object_array.find { |graphql_object| graphql_object.key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
  share_count_object = feedback_object.fetch("share_count", {})

  poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]

  reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
  post_details = {
    id: curr_media_object["currMedia"]["id"],
    num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
    num_shares: share_count_object.fetch("count", nil),
    reshare_warning: feedback_object["should_show_reshare_warning"],
    image_url: curr_media_object["currMedia"]["image"]["uri"],
    # Posts without a caption have no "message" object
    text: (creation_story_object["message"] || {}).fetch("text", nil),
    profile_link: poster["url"],
    created_at: curr_media_object["currMedia"]["created_time"],
    has_video: false
  }
  post_details[:image_file] = Forki.retrieve_media(post_details[:image_url])
  post_details[:reactions] = reaction_counts
  post_details
end
|
248
|
+
|
249
|
+
# Extract data from a non-live video post on the watch page.
# Delegates to the live-video extractor when the current URL contains "live".
def extract_video_post_data_from_watch_page(graphql_strings)
  return extract_live_video_post_data_from_watch_page(graphql_strings) if current_url.include?("live")
  video_object = graphql_strings.map { |g| JSON.parse(g) }.find { |x| x.key?("video") }
  # The creation-story payload is identified by substring sniffing rather
  # than parsing every candidate string
  creation_story_object = JSON.parse(graphql_strings.find { |graphql_string| (graphql_string.include?("creation_story")) && \
                                                            (graphql_string.include?("live_status")) })
  # Strip escaping backslashes out of the embedded URL
  video_permalink = creation_story_object["creation_story"]["shareable"]["url"].delete("\\")
  media_object = video_object["video"]["story"]["attachments"][0]["media"]
  reaction_counts = extract_reaction_counts(creation_story_object["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])

  post_details = {
    id: video_object["id"],
    num_comments: creation_story_object["feedback"]["total_comment_count"],
    num_shares: nil, # Not present for watch feed videos?
    num_views: creation_story_object["feedback"]["video_view_count_renderer"]["feedback"]["video_view_count"],
    reshare_warning: creation_story_object["feedback"]["should_show_reshare_warning"],
    video_preview_image_url: video_object["video"]["story"]["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
    # Prefer the HD rendition; strip escaping backslashes
    video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"),
    text: (creation_story_object["creation_story"]["message"] || {})["text"],
    created_at: video_object["video"]["story"]["attachments"][0]["media"]["publish_time"],
    # Everything up to "/videos" in the permalink is the poster's profile URL
    profile_link: video_permalink[..video_permalink.index("/videos")],
    has_video: true
  }

  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
|
278
|
+
|
279
|
+
# Extract data from a live video post on the watch page.
# Live payloads nest feedback under feedback_context rather than a plain
# "feedback" object, hence the separate extractor.
def extract_live_video_post_data_from_watch_page(graphql_strings)
  creation_story_object = JSON.parse(graphql_strings.find { |graphql| (graphql.include? "comment_count") && \
                                                           (graphql.include? "creation_story") })["video"]["creation_story"]
  media_object = JSON.parse(graphql_strings.find { |graphql| graphql.include? "playable_url" })["video"]["creation_story"]["attachments"][0]["media"]
  # Strip escaping backslashes out of the embedded URL
  video_permalink = creation_story_object["shareable"]["url"].delete("\\")
  reaction_counts = extract_reaction_counts(creation_story_object["feedback_context"]["feedback_target_with_context"]["cannot_see_top_custom_reactions"]["top_reactions"])

  post_details = {
    id: creation_story_object["shareable"]["id"],
    num_comments: creation_story_object["feedback_context"]["feedback_target_with_context"]["total_comment_count"],
    num_shares: nil,
    num_views: find_number_of_views, # as far as I can tell, this is never present in the GraphQL data for live videos
    reshare_warning: creation_story_object["feedback_context"]["feedback_target_with_context"]["should_show_reshare_warning"],
    video_preview_image_url: creation_story_object["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
    # Prefer the HD rendition; strip escaping backslashes
    video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"),
    text: creation_story_object["attachments"][0]["media"]["savable_description"]["text"],
    created_at: creation_story_object["attachments"][0]["media"]["publish_time"],
    # Everything up to "/videos" in the permalink is the poster's profile URL
    profile_link: video_permalink[..video_permalink.index("/videos")],
    has_video: true
  }

  post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  post_details[:reactions] = reaction_counts
  post_details
end
|
306
|
+
|
307
|
+
# Returns a hash containing counts of each reaction to a post.
# Each edge becomes a single-entry hash like { num_likes: 1234 }; the
# per-reaction hashes are then merged into one. Returns nil for empty edges
# (inject over an empty collection yields nil, matching the original).
def extract_reaction_counts(reactions_object)
  per_reaction_counts = reactions_object["edges"].map do |edge|
    { :"num_#{edge["node"]["localized_name"].downcase}s" => edge["reaction_count"] }
  end
  per_reaction_counts.inject(:merge)
end
|
317
|
+
|
318
|
+
# Clears any fact-check overlay covering the post, then saves a screenshot
# into Forki's temp storage and returns the saved file path.
def take_screenshot
  # A fact-checked post hides its media behind a "See Photo"/"See Video"
  # button. BUG FIX: the original chained the two finds with `||`, but
  # Capybara's `find` raises ElementNotFound instead of returning nil, so the
  # "See Video" branch was unreachable. Try each selector independently.
  begin
    find('div[aria-label=" See Photo "]').click
  rescue Capybara::ElementNotFound
    begin
      find('div[aria-label=" See Video "]').click
    rescue Capybara::ElementNotFound
      # No overlay present — nothing to clear
    end
  end

  save_screenshot("#{Forki.temp_storage_location}/facebook_screenshot_#{SecureRandom.uuid}.png")
end
|
328
|
+
|
329
|
+
# Uses GraphQL data and DOM elements to collect information about the current post.
# Loads the page, extracts post attributes, retries the screenshot up to five
# times on read timeouts, looks up the posting user, and returns the details
# hash. Returns nil if a Net::ReadTimeout escapes the retry loop (see rescue
# below). The browser session is always closed via the ensure clause.
def parse(url)
  validate_and_load_page(url)
  graphql_strings = find_graphql_data_strings(page.html)
  post_data = extract_post_data(graphql_strings)
  post_data[:url] = url
  user_url = post_data[:profile_link]

  # Screenshots can hit Net::ReadTimeout; retry up to 5 times with a pause
  5.times do
    begin
      post_data[:screenshot_file] = take_screenshot
      break
    rescue Net::ReadTimeout; end

    sleep(5)
  end

  # page.quit # Close browser between page navigations to prevent cache folder access issues

  post_data[:user] = User.lookup(user_url).first
  # NOTE(review): page.quit here is followed by another page.quit in ensure —
  # confirm quitting twice is intentional/harmless for this driver
  page.quit

  post_data
rescue Net::ReadTimeout
  # Eat it? (NOTE(review): swallowing the timeout makes parse return nil)
rescue StandardError => e
  raise e
ensure
  page.quit
end
|
360
|
+
end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# require_relative "user_scraper"
|
4
|
+
require "capybara/dsl"
|
5
|
+
require "dotenv/load"
|
6
|
+
require "oj"
|
7
|
+
require "selenium-webdriver"
|
8
|
+
require "open-uri"
|
9
|
+
|
10
|
+
# Chrome options shared by every scraping session. Automation fingerprints
# are reduced (no "enable-automation" switch, blink automation features off,
# fixed desktop user agent) and each session gets a fresh user-data dir.
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
# BUG FIX: this flag previously began with an en dash ("–-"), so Chrome
# silently ignored it and AutomationControlled was never disabled.
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
# (duplicate --disable-dev-shm-usage removed)
options.add_argument("--remote-debugging-port=9222")
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_forki do |app|
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.default_max_wait_time = 60
Capybara.threadsafe = true
Capybara.reuse_server = true
|
31
|
+
|
32
|
+
module Forki
|
33
|
+
class Scraper # rubocop:disable Metrics/ClassLength
|
34
|
+
include Capybara::DSL
|
35
|
+
|
36
|
+
# Points Capybara at the custom :selenium_forki driver registered above and
# applies the gem's configured log level.
def initialize
  Capybara.default_driver = :selenium_forki
  Forki.set_logger_level
  # reset_selenium
end
|
41
|
+
|
42
|
+
# Downloads the image referenced by the element's "src" attribute and writes
# the raw bytes to temp/emoji.png (the tmp/ directory created during setup).
def download_image(img_elem)
  File.binwrite("temp/emoji.png", URI.open(img_elem["src"]).read)
end
|
47
|
+
|
48
|
+
# Returns all GraphQL data payloads embedded within a page-source string.
# Finds substrings that look like '"data":{...}', strips the '"data":'
# prefix, and recurses on the remainder of the string.
def find_graphql_data_strings(objs = [], html_str)
  marker = '"data":{'
  marker_index = html_str.index(marker)
  return objs if marker_index.nil? # No more data blocks in the page source

  closing_index = find_graphql_data_closure_index(html_str, marker_index)
  return objs if closing_index.nil?

  payload = html_str[marker_index...closing_index].delete_prefix('"data":')
  objs + [payload] + find_graphql_data_strings(html_str[closing_index..])
end
|
61
|
+
|
62
|
+
# Returns the index just past the brace that closes the '"data":{' object
# starting at start_index. Raises when no balancing brace exists.
#
# BUG FIX: the original only checked the bounds once before the loop; when the
# braces never balanced, html_str[closure_index] returned nil forever and the
# loop never terminated. The bounds check now runs on every iteration.
def find_graphql_data_closure_index(html_str, start_index)
  closure_index = start_index + 8 # length of the '"data":{' marker; begin right after the open brace
  brace_depth = 1

  while brace_depth.positive?
    raise "Malformed graphql data object: no closing bracket found" if closure_index >= html_str.length

    case html_str[closure_index]
    when "{" then brace_depth += 1
    when "}" then brace_depth -= 1
    end
    closure_index += 1
  end

  closure_index
end
|
80
|
+
|
81
|
+
private
|
82
|
+
|
83
|
+
# Re-registers the :selenium_forki driver with a fresh user-data folder so a
# new browser session does not collide with a previous session's cache.
def reset_selenium
  options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
  options.add_argument("--start-maximized")
  options.add_argument("--no-sandbox")
  options.add_argument("--disable-dev-shm-usage")
  # BUG FIX: this flag previously began with an en dash ("–-"), so Chrome
  # silently ignored it and AutomationControlled was never disabled.
  options.add_argument("--disable-blink-features=AutomationControlled")
  options.add_argument("--disable-extensions")
  options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
  options.add_preference "password_manager_enabled", false
  # (duplicate --disable-dev-shm-usage removed)
  options.add_argument("--remote-debugging-port=9222")
  options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

  Capybara.register_driver :selenium_forki do |app|
    client = Selenium::WebDriver::Remote::Http::Default.new
    client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
    Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
  end

  Capybara.current_driver = :selenium_forki
end
|
107
|
+
|
108
|
+
# Logs in to Facebook (if not already logged in) using the FACEBOOK_EMAIL and
# FACEBOOK_PASSWORD environment variables. Raises MissingCredentialsError
# when either is unset, and Forki::BlockedCredentialsError when Facebook
# shows its error box after submitting.
def login(url = nil)
  raise MissingCredentialsError if ENV["FACEBOOK_EMAIL"].nil? || ENV["FACEBOOK_PASSWORD"].nil?

  url ||= "https://www.facebook.com"
  visit(url) # Visit the url passed in or the facebook homepage if nothing is

  # Look for "login_form" box, which throws an error if not found. So we catch it and run the rest of the tests
  begin
    login_form = first(id: "login_form", wait: 5)
  rescue Capybara::ElementNotFound
    # No form and not on the log-in page: we're already authenticated
    return unless page.title.downcase.include?("facebook - log in")
  end

  # Since we're not logged in, let's do that quick
  # NOTE(review): login_form is still nil after this visit — the fill_in
  # calls below would raise NoMethodError in that branch; confirm intended.
  visit("https://www.facebook.com") if login_form.nil?

  login_form.fill_in("email", with: ENV["FACEBOOK_EMAIL"])
  login_form.fill_in("pass", with: ENV["FACEBOOK_PASSWORD"])

  # This is a pain because some pages just `click_button` would work, but some won't
  login_buttons = login_form.all("div", text: "Log In", wait: 5)

  if login_buttons.empty?
    login_form.click_button("Log In")
  else
    # Click only the div whose exact text is "Log In"
    login_buttons.each do |button|
      if button.text == "Log In"
        button.click
        break
      end
    end
  end

  # Facebook renders an "error_box" element when credentials are blocked
  begin
    raise Forki::BlockedCredentialsError if find_by_id("error_box", wait: 3)
  rescue Capybara::ElementNotFound; end

  # Now we wait awhile, hopefully to slow down scraping
  sleep(rand * 10.3)
end
|
149
|
+
|
150
|
+
# Ensures that a valid Facebook url has been provided, and that it points to an available post.
# Logs in first (which also visits the url), raises Forki::InvalidUrlError for
# non-Facebook urls, then navigates to the url if the login flow left the
# browser elsewhere.
def validate_and_load_page(url)
  Capybara.app_host = "https://www.facebook.com"
  facebook_url = "https://www.facebook.com"
  # visit "https://www.facebook.com" unless current_url.start_with?(facebook_url)
  login(url)
  raise Forki::InvalidUrlError unless url.start_with?(facebook_url)
  visit url unless current_url.start_with?(url)
end
|
160
|
+
|
161
|
+
# Extracts an integer out of a string (or DOM element) describing a number.
# e.g. "4K Comments" returns 4000, "131 Shares" returns 131,
# "2.2K" returns 2200, "13M" returns 13_000_000, "15,443" returns 15_443.
# Returns nil when given nil/false.
#
# BUG FIX: the decimal branch previously computed "2.2K" as
# `to_i + text[-2].to_i * 100` → 202, never multiplying the whole part by
# 1000 (and mishandling decimal "M" values entirely). K/M suffixes are now
# handled uniformly with float math.
def extract_int_from_num_element(element)
  return unless element

  # A Capybara element may be passed instead of a string
  element = element.text(:all) if element.class != String

  num_text = /[0-9KM ,.]+/.match(element)[0]

  if num_text.include?("K") # e.g. "4K" or "2.2K"
    (num_text.to_f * 1_000).to_i
  elsif num_text.include?("M") # e.g. "13M" or "1.3M"
    (num_text.to_f * 1_000_000).to_i
  else # e.g. "15,443" or "131 "
    num_text.delete(" ,").to_i
  end
end
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
require_relative "post_scraper"
|
189
|
+
require_relative "user_scraper"
|
@@ -0,0 +1,94 @@
|
|
1
|
+
require "typhoeus"
|
2
|
+
|
3
|
+
module Forki
|
4
|
+
class UserScraper < Scraper
|
5
|
+
# Finds and returns the number of people who like the current page.
# Returns nil (via extract_int_from_num_element) when no matching span exists.
#
# FIX: added the missing `+` quantifier to the character class, matching the
# intent (and the sibling follower patterns); the original class matched only
# a single character before " people like this".
def find_number_of_likes
  likes_pattern = /[0-9,.KM ]+ people like this/
  number_of_likes_elem = all("span").filter { |span| likes_pattern.match? span.text }.first
  extract_int_from_num_element(number_of_likes_elem)
end
|
11
|
+
|
12
|
+
# Finds and returns the number of people who follow the current page, parsed
# from the profile-details GraphQL string. Returns nil when neither pattern
# matches.
#
# BUG FIX: the first pattern's named capture lacked a `+` quantifier, so
# "Followed by 1,234 people" captured only the single character "4" and the
# follower count was wildly wrong. The alternate pattern already had the
# quantifier; the primary now matches it.
def find_number_of_followers(profile_details_string)
  followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]+) people/
  alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
  number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
  return nil if number_of_followers_match.nil?

  extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
end
|
20
|
+
|
21
|
+
# Pulls the follower count out of a profile-tile "followers" node
# (used for normal profiles that expose followers via the intro card).
def find_number_followers_for_normal_profile(profile_followers_node)
  context_item = profile_followers_node["node"]["timeline_context_item"]["renderer"]["context_item"]
  followers_text = context_item["title"]["text"]
  count_text = /[0-9,]+/.match(followers_text).to_s
  extract_int_from_num_element(count_text)
end
|
27
|
+
|
28
|
+
# Returns a hash of details about a Facebook user profile: id, follower
# count, name, verified flag, bio text, and profile image url. Follower
# count falls back to the profile-tile "INTRO_CARD_FOLLOWERS" node when the
# header string doesn't expose it.
def extract_profile_details(graphql_strings)
  # Locate the relevant payloads by substring sniffing before parsing
  profile_header_str = graphql_strings.find { |gql| gql.include? "profile_header_renderer" }
  profile_intro_str = graphql_strings.find { |g| g.include? "profile_intro_card" }
  profile_header_obj = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]
  profile_intro_obj = profile_intro_str ? JSON.parse(profile_intro_str) : nil

  number_of_followers = find_number_of_followers(profile_header_str)

  # Check if the user shows followers count
  if number_of_followers.nil?
    profile_title_section = graphql_strings.find { |gql| gql.include? "profile_tile_section_type" }

    json = JSON.parse(profile_title_section)
    # NOTE(review): the hard-coded nodes[1] index assumes the followers tile
    # is always the second profile_tile_view — confirm against live payloads.
    followers_node = json["user"]["profile_tile_sections"]["edges"].first["node"]["profile_tile_views"]["nodes"][1]["view_style_renderer"]["view"]["profile_tile_items"]["nodes"].select do |node|
      node["node"]["timeline_context_item"]["timeline_context_list_item_type"] == "INTRO_CARD_FOLLOWERS"
    end
    if followers_node.empty?
      number_of_followers = nil # no followers tile: count stays unknown
    else
      number_of_followers = find_number_followers_for_normal_profile(followers_node.first)
    end
  end

  {
    id: profile_header_obj["user"]["id"],
    number_of_followers: number_of_followers,
    name: profile_header_obj["user"]["name"],
    verified: profile_header_obj["user"]["is_verified"],
    # Profiles without an intro card get an empty bio
    profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
    profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
  }
end
|
61
|
+
|
62
|
+
# Returns a hash of details about a Facebook page (as opposed to a user
# profile): id, blurb, follower count, name, verified flag, profile image
# url, and like count.
def extract_page_details(graphql_strings)
  # Locate the relevant payloads by substring sniffing before parsing
  page_cards_string = graphql_strings.find { |graphql_string| (graphql_string.include? "comet_page_cards") && \
                                                              (graphql_string.include? "follower_count")}
  page_cards_list = JSON.parse(page_cards_string)["page"]["comet_page_cards"]
  page_about_card = page_cards_list.find { |card| card["__typename"] == "CometPageAboutCardWithoutMapRenderer" }
  viewer_page_object = JSON.parse(graphql_strings.find { |graphql_string| (graphql_string.include? "profile_photo") && \
                                                         graphql_string.include?("is_verified") })
  {
    id: page_about_card["page"]["id"],
    profile: page_about_card["page"]["page_about_fields"]["blurb"],
    number_of_followers: page_about_card["page"]["follower_count"],
    name: page_about_card["page"]["name"],
    verified: viewer_page_object["page"]["is_verified"],
    profile_image_url: viewer_page_object["page"]["profile_picture"]["uri"],
    number_of_likes: page_about_card["page"]["page_likers"]["global_likers_count"],
  }
end
|
80
|
+
|
81
|
+
# Uses GraphQL data and DOM elements to collect information about the current
# user page. Pages and profiles carry different payload shapes, so the
# presence of a top-level "page" key selects the extractor.
def parse(url)
  validate_and_load_page(url)
  graphql_strings = find_graphql_data_strings(page.html)

  page_profile = graphql_strings.map { |raw| JSON.parse(raw) }.any? { |object| object.key?("page") }
  user_details = page_profile ? extract_page_details(graphql_strings) : extract_profile_details(graphql_strings)

  user_details[:profile_image_file] = Forki.retrieve_media(user_details[:profile_image_url])
  user_details[:profile_link] = url

  user_details
end
|
93
|
+
end
|
94
|
+
end
|