forki 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.github/workflows/main.yml +18 -0
- data/.gitignore +17 -0
- data/.rubocop.yml +71 -0
- data/.ruby-version +1 -0
- data/CHANGELOG.md +5 -0
- data/CODE_OF_CONDUCT.md +84 -0
- data/Gemfile +21 -0
- data/Gemfile.lock +163 -0
- data/LICENSE.txt +21 -0
- data/README.md +87 -0
- data/Rakefile +16 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/forki.gemspec +42 -0
- data/lib/forki/post.rb +61 -0
- data/lib/forki/scrapers/post_scraper.rb +360 -0
- data/lib/forki/scrapers/scraper.rb +189 -0
- data/lib/forki/scrapers/user_scraper.rb +94 -0
- data/lib/forki/user.rb +45 -0
- data/lib/forki/version.rb +5 -0
- data/lib/forki.rb +98 -0
- data/lib/generators/forki.rb +3 -0
- data/lib/generators/forki_generator.rb +6 -0
- data/lib/helpers/configuration.rb +28 -0
- data/reactions/.DS_Store +0 -0
- data/reactions/angry.png +0 -0
- data/reactions/care.png +0 -0
- data/reactions/haha.png +0 -0
- data/reactions/like.png +0 -0
- data/reactions/love.png +0 -0
- data/reactions/pride.png +0 -0
- data/reactions/sad.png +0 -0
- data/reactions/wow.png +0 -0
- metadata +146 -0
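
For orientation, a hypothetical usage sketch based on the scraper code in this diff (not part of the package). It assumes FACEBOOK_EMAIL and FACEBOOK_PASSWORD are set in the environment, a working Chrome/Selenium setup, and the placeholder URL points at a public post:

# Hypothetical usage sketch; the URL is a placeholder.
require "forki"

post_data = Forki::PostScraper.new.parse("https://www.facebook.com/<page>/posts/<id>")
post_data[:has_video]  # => true or false
post_data[:reactions]  # => e.g. { num_likes: ..., num_loves: ... }
post_data[:user]       # => profile/page details looked up via User.lookup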
data/lib/forki/scrapers/post_scraper.rb
@@ -0,0 +1,360 @@
# frozen_string_literal: true

require "typhoeus"
require "securerandom"
require "byebug"

module Forki
  # rubocop:disable Metrics/ClassLength
  class PostScraper < Scraper
    # Searches the DOM to find the number of times a (video) post has been viewed.
    # Returns nil if it can't find a DOM element with the view count

    def find_number_of_views
      views_pattern = /[0-9MK, ]+Views/
      spans = all("span")
      views_span = spans.find { |s| s.text(:all) =~ views_pattern }
      extract_int_from_num_element(views_span)
    end

    def extract_post_data(graphql_strings)
      # Bail out of the post otherwise it gets stuck
      raise ContentUnavailableError unless is_post_available?

      graphql_objects = get_graphql_objects(graphql_strings)
      post_has_video = check_if_post_is_video(graphql_objects)
      post_has_image = check_if_post_is_image(graphql_objects)

      # There's a chance it may be embedded in a comment chain like this:
      # https://www.facebook.com/PlandemicMovie/posts/588866298398729/
      post_has_video_in_comment_stream = check_if_post_is_in_comment_stream(graphql_objects) if post_has_video == false

      if post_has_video
        extract_video_post_data(graphql_strings)
      elsif post_has_video_in_comment_stream
        extract_video_comment_post_data(graphql_objects)
      elsif post_has_image
        extract_image_post_data(graphql_objects)
      else
        raise UnhandledContentError
      end
    end

    def get_graphql_objects(graphql_strings)
      graphql_strings.map { |graphql_object| JSON.parse(graphql_object) }
    end

    def check_if_post_is_video(graphql_objects)
      graphql_objects.any? { |graphql_object| graphql_object.key?("is_live_streaming") || graphql_object.key?("video") || check_if_post_is_reel(graphql_object) }
    end

    def check_if_post_is_reel(graphql_object)
      return false unless graphql_object.key?("node")

      begin
        style_infos = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first
      rescue NoMethodError # if the object doesn't match the attribute chain above, the line above will try to operate on nil
        return false
      end

      style_infos.include?("fb_shorts_story")
    end

    def check_if_post_is_image(graphql_objects)
      graphql_objects.any? do |graphql_object| # if any GraphQL objects contain the top-level keys above, return true
        true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
        true unless graphql_object.fetch("currMedia", nil).nil?
      end
    end

    def check_if_post_is_in_comment_stream(graphql_objects)
      graphql_objects.find do |graphql_object|
        next unless graphql_object.key?("nodes")

        begin
          type = graphql_object["nodes"].first["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["__typename"]
        rescue StandardError
          # if there's an error just return false, since the structure is so specific checking the whole thing is a lot
          next
        end

        return true if type == "Video"
      end

      false
    end

    def is_post_available?
      begin
        # This Video Isn't Available Anymore
        find("span", wait: 5, text: "content isn't available", exact_text: false)
      rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
        begin
          find("span", wait: 5, text: "This Video Isn't Available Anymore", exact_text: false)
        rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
          return true
        end
      end

      false
    end

    def extract_video_comment_post_data(graphql_objects)
      graphql_nodes = nil
      graphql_objects.find do |graphql_object|
        next unless graphql_object.key?("nodes")
        graphql_nodes = graphql_object["nodes"]

        break
      end

      media = graphql_nodes.first["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
      initial_feedback_object = graphql_nodes.first["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
      feedback_object = initial_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]

      post_details = {
        id: media["id"],
        num_comments: feedback_object["comment_count"]["total_count"],
        num_shares: feedback_object["share_count"]["count"],
        num_views: feedback_object["video_view_count"],
        reshare_warning: feedback_object["should_show_reshare_warning"],
        video_preview_image_url: media["preferred_thumbnail"]["image"]["uri"],
        video_url: media["playable_url_quality_hd"] || media["playable_url"],
        text: graphql_nodes.first["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"],
        created_at: media["publish_time"],
        profile_link: graphql_nodes.first["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"].first["url"],
        has_video: true
      }

      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = initial_feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["i18n_reaction_count"]
      post_details
    end

    # Unfortunately, there's a taxonomy of video post types, all of which require different parsing methods
    # Specifically, there are normal video posts, video posts from the watch page, and live video posts from the watch page
    # The general strategy for extracting information from each type, though, is to find which of the 30-odd GraphQL strings are relevant
    # After finding those GraphQL strings, we parse them into hashes and extract the information we need
    def extract_video_post_data(graphql_strings)
      unless all("h1").find { |h1| h1.text.strip == "Watch" }.nil?
        return extract_video_post_data_from_watch_page(graphql_strings) # If this is a "watch page" video
      end

      graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
      story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
      story_node_object = story_node_object || graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video

      return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?

      if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
        video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
        creation_date = video_object["publish_time"]
        # creation_date = video_object["video"]["publish_time"]
      elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
        # For "Reels" we need a separate way to parse this
        video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
        creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["creation_time"]
      else
        raise "Unable to parse video object"
      end

      feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
      reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
      share_count_object = feedback_object.fetch("share_count", {})

      if story_node_object["comet_sections"]["content"]["story"]["comet_sections"].key? "message"
        text = story_node_object["comet_sections"]["content"]["story"]["comet_sections"]["message"]["story"]["message"]["text"]
      else
        text = ""
      end

      num_comments = feedback_object.has_key?("comment_list_renderer") ? feedback_object["comment_list_renderer"]["feedback"]["comment_count"]["total_count"] : feedback_object["comment_count"]["total_count"]

      post_details = {
        id: video_object["id"],
        num_comments: num_comments,
        num_shares: share_count_object.fetch("count", nil),
        num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
        reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
        video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
        video_url: video_object["playable_url_quality_hd"] || video_object["playable_url"],
        text: text,
        created_at: creation_date,
        profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
        has_video: true
      }
      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = reaction_counts
      post_details
    end

    def extract_video_post_data_alternative(graphql_object_array)
      sidepane_object = graphql_object_array.find { |graphql_object| graphql_object.key?("tahoe_sidepane_renderer") }
      video_object = graphql_object_array.find { |graphql_object| graphql_object.keys == ["video"] }
      feedback_object = sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]
      reaction_counts = extract_reaction_counts(sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
      share_count_object = feedback_object.fetch("share_count", {})

      post_details = {
        id: video_object["id"],
        num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
        num_shares: share_count_object.fetch("count", nil),
        num_views: feedback_object["video_view_count"],
        reshare_warning: feedback_object["should_show_reshare_warning"],
        video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
        video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
        text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
        created_at: video_object["video"]["publish_time"],
        profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
        has_video: true
      }

      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = reaction_counts
      post_details
    end

    # Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above
    def extract_image_post_data(graphql_object_array)
      graphql_object_array.find { |graphql_object| graphql_object.key?("viewer_actor") && graphql_object.key?("display_comments") }
      curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
      creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }

      feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
      share_count_object = feedback_object.fetch("share_count", {})

      poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]

      reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
      post_details = {
        id: curr_media_object["currMedia"]["id"],
        num_comments: feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
        num_shares: share_count_object.fetch("count", nil),
        reshare_warning: feedback_object["should_show_reshare_warning"],
        image_url: curr_media_object["currMedia"]["image"]["uri"],
        text: (creation_story_object["message"] || {}).fetch("text", nil),
        profile_link: poster["url"],
        created_at: curr_media_object["currMedia"]["created_time"],
        has_video: false
      }
      post_details[:image_file] = Forki.retrieve_media(post_details[:image_url])
      post_details[:reactions] = reaction_counts
      post_details
    end

    # Extract data from a non-live video post on the watch page
    def extract_video_post_data_from_watch_page(graphql_strings)
      return extract_live_video_post_data_from_watch_page(graphql_strings) if current_url.include?("live")
      video_object = graphql_strings.map { |g| JSON.parse(g) }.find { |x| x.key?("video") }
      creation_story_object = JSON.parse(graphql_strings.find { |graphql_string| (graphql_string.include?("creation_story")) && \
                                                                                 (graphql_string.include?("live_status")) })
      video_permalink = creation_story_object["creation_story"]["shareable"]["url"].delete("\\")
      media_object = video_object["video"]["story"]["attachments"][0]["media"]
      reaction_counts = extract_reaction_counts(creation_story_object["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])

      post_details = {
        id: video_object["id"],
        num_comments: creation_story_object["feedback"]["total_comment_count"],
        num_shares: nil, # Not present for watch feed videos?
        num_views: creation_story_object["feedback"]["video_view_count_renderer"]["feedback"]["video_view_count"],
        reshare_warning: creation_story_object["feedback"]["should_show_reshare_warning"],
        video_preview_image_url: video_object["video"]["story"]["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
        video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"),
        text: (creation_story_object["creation_story"]["message"] || {})["text"],
        created_at: video_object["video"]["story"]["attachments"][0]["media"]["publish_time"],
        profile_link: video_permalink[..video_permalink.index("/videos")],
        has_video: true
      }

      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = reaction_counts
      post_details
    end

    # Extract data from live video post on the watch page
    def extract_live_video_post_data_from_watch_page(graphql_strings)
      creation_story_object = JSON.parse(graphql_strings.find { |graphql| (graphql.include? "comment_count") && \
                                                                          (graphql.include? "creation_story") })["video"]["creation_story"]
      media_object = JSON.parse(graphql_strings.find { |graphql| graphql.include? "playable_url" })["video"]["creation_story"]["attachments"][0]["media"]
      video_permalink = creation_story_object["shareable"]["url"].delete("\\")
      reaction_counts = extract_reaction_counts(creation_story_object["feedback_context"]["feedback_target_with_context"]["cannot_see_top_custom_reactions"]["top_reactions"])

      post_details = {
        id: creation_story_object["shareable"]["id"],
        num_comments: creation_story_object["feedback_context"]["feedback_target_with_context"]["total_comment_count"],
        num_shares: nil,
        num_views: find_number_of_views, # as far as I can tell, this is never present for live videos
        reshare_warning: creation_story_object["feedback_context"]["feedback_target_with_context"]["should_show_reshare_warning"],
        video_preview_image_url: creation_story_object["attachments"][0]["media"]["preferred_thumbnail"]["image"]["uri"],
        video_url: (media_object.fetch("playable_url_quality_hd", nil) || media_object.fetch("playable_url", nil)).delete("\\"),
        text: creation_story_object["attachments"][0]["media"]["savable_description"]["text"],
        created_at: creation_story_object["attachments"][0]["media"]["publish_time"],
        profile_link: video_permalink[..video_permalink.index("/videos")],
        has_video: true
      }

      post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
      post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
      post_details[:reactions] = reaction_counts
      post_details
    end

    # Returns a hash containing counts of each reaction to a post
    # Takes the edges list and creates a dictionary for each element that looks like: { num_likes: 1234 }
    # Then merges the dictionaries with the inject call
    def extract_reaction_counts(reactions_object)
      reactions_object["edges"].map do |reaction|
        {
          "num_#{reaction["node"]["localized_name"].downcase}s".to_sym => reaction["reaction_count"]
        }
      end.inject { |emoji_counts, count| emoji_counts.merge(count) }
    end

    def take_screenshot
      # First check whether post being scraped has a fact check overlay. If it does clear it.
      begin
        find('div[aria-label=" See Photo "]').click() || find('div[aria-label=" See Video "]').click()
      rescue Capybara::ElementNotFound
        # Do nothing if element not found
      end

      save_screenshot("#{Forki.temp_storage_location}/facebook_screenshot_#{SecureRandom.uuid}.png")
    end

    # Uses GraphQL data and DOM elements to collect information about the current post
    def parse(url)
      validate_and_load_page(url)
      graphql_strings = find_graphql_data_strings(page.html)
      post_data = extract_post_data(graphql_strings)
      post_data[:url] = url
      user_url = post_data[:profile_link]

      5.times do
        begin
          post_data[:screenshot_file] = take_screenshot
          break
        rescue Net::ReadTimeout; end

        sleep(5)
      end

      # page.quit # Close browser between page navigations to prevent cache folder access issues

      post_data[:user] = User.lookup(user_url).first
      page.quit

      post_data
    rescue Net::ReadTimeout
      # Eat it?
    rescue StandardError => e
      raise e
    ensure
      page.quit
    end
  end
end
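
As a point of reference, here is a minimal, self-contained sketch of the reduction that extract_reaction_counts performs, run against a hypothetical top_reactions payload (the field names mirror the code above; the numbers are invented):

# Illustrative sketch, not part of the gem.
top_reactions = {
  "edges" => [
    { "node" => { "localized_name" => "Like" }, "reaction_count" => 1204 },
    { "node" => { "localized_name" => "Love" }, "reaction_count" => 87 },
    { "node" => { "localized_name" => "Wow" },  "reaction_count" => 3 }
  ]
}

counts = top_reactions["edges"].map do |reaction|
  { "num_#{reaction["node"]["localized_name"].downcase}s".to_sym => reaction["reaction_count"] }
end.inject { |emoji_counts, count| emoji_counts.merge(count) }

counts # => { num_likes: 1204, num_loves: 87, num_wows: 3 }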
data/lib/forki/scrapers/scraper.rb
@@ -0,0 +1,189 @@
# frozen_string_literal: true

# require_relative "user_scraper"
require "capybara/dsl"
require "dotenv/load"
require "oj"
require "selenium-webdriver"
require "open-uri"

options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
options.add_argument("--start-maximized")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--disable-extensions")
options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_preference "password_manager_enabled", false
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--remote-debugging-port=9222")
options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

Capybara.register_driver :selenium_forki do |app|
  client = Selenium::WebDriver::Remote::Http::Default.new
  client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
  Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
end

Capybara.default_max_wait_time = 60
Capybara.threadsafe = true
Capybara.reuse_server = true

module Forki
  class Scraper # rubocop:disable Metrics/ClassLength
    include Capybara::DSL

    def initialize
      Capybara.default_driver = :selenium_forki
      Forki.set_logger_level
      # reset_selenium
    end

    # Yeah, just use the tmp/ directory that's created during setup
    def download_image(img_elem)
      img_data = URI.open(img_elem["src"]).read
      File.binwrite("temp/emoji.png", img_data)
    end

    # Returns all GraphQL data objects embedded within a string
    # Finds substrings that look like '"data": {...}' and converts them to hashes
    def find_graphql_data_strings(objs = [], html_str)
      data_marker = '"data":{'
      data_start_index = html_str.index(data_marker)
      return objs if data_start_index.nil? # No more data blocks in the page source

      data_closure_index = find_graphql_data_closure_index(html_str, data_start_index)
      return objs if data_closure_index.nil?

      graphql_data_str = html_str[data_start_index...data_closure_index].delete_prefix('"data":')
      objs + [graphql_data_str] + find_graphql_data_strings(html_str[data_closure_index..])
    end

    def find_graphql_data_closure_index(html_str, start_index)
      closure_index = start_index + 8 # length of data marker. Begin search right after open brace
      raise "Malformed graphql data object: no closing bracket found" if closure_index > html_str.length

      brace_stack = 1
      loop do # search for brace characters in substring instead of iterating through each char
        if html_str[closure_index] == "{"
          brace_stack += 1
        elsif html_str[closure_index] == "}"
          brace_stack -= 1
        end

        closure_index += 1
        break if brace_stack.zero?
      end

      closure_index
    end

    private

    ##########
    # Set the session to use a new user folder in the options!
    # #####################
    def reset_selenium
      options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
      options.add_argument("--start-maximized")
      options.add_argument("--no-sandbox")
      options.add_argument("--disable-dev-shm-usage")
      options.add_argument("--disable-blink-features=AutomationControlled")
      options.add_argument("--disable-extensions")
      options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 13_3_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
      options.add_preference "password_manager_enabled", false
      options.add_argument("--disable-dev-shm-usage")
      options.add_argument("--remote-debugging-port=9222")
      options.add_argument("--user-data-dir=/tmp/tarun_forki_#{SecureRandom.uuid}")

      Capybara.register_driver :selenium_forki do |app|
        client = Selenium::WebDriver::Remote::Http::Default.new
        client.read_timeout = 60 # Don't wait 60 seconds to return Net::ReadTimeoutError. We'll retry through Hypatia after 10 seconds
        Capybara::Selenium::Driver.new(app, browser: :chrome, options: options, http_client: client)
      end

      Capybara.current_driver = :selenium_forki
    end

    # Logs in to Facebook (if not already logged in)
    def login(url = nil)
      raise MissingCredentialsError if ENV["FACEBOOK_EMAIL"].nil? || ENV["FACEBOOK_PASSWORD"].nil?

      url ||= "https://www.facebook.com"
      visit(url) # Visit the url passed in or the facebook homepage if nothing is

      # Look for the "login_form" box, which throws an error if not found, so we catch it and run the rest of the checks
      begin
        login_form = first(id: "login_form", wait: 5)
      rescue Capybara::ElementNotFound
        return unless page.title.downcase.include?("facebook - log in")
      end

      # Since we're not logged in, let's do that quick
      visit("https://www.facebook.com") if login_form.nil?

      login_form.fill_in("email", with: ENV["FACEBOOK_EMAIL"])
      login_form.fill_in("pass", with: ENV["FACEBOOK_PASSWORD"])

      # This is a pain: on some pages a plain `click_button` works, but on others it won't
      login_buttons = login_form.all("div", text: "Log In", wait: 5)

      if login_buttons.empty?
        login_form.click_button("Log In")
      else
        login_buttons.each do |button|
          if button.text == "Log In"
            button.click
            break
          end
        end
      end

      begin
        raise Forki::BlockedCredentialsError if find_by_id("error_box", wait: 3)
      rescue Capybara::ElementNotFound; end

      # Now we wait awhile, hopefully to slow down scraping
      sleep(rand * 10.3)
    end

    # Ensures that a valid Facebook url has been provided, and that it points to an available post
    # If either of those two conditions is false, raises an exception
    def validate_and_load_page(url)
      Capybara.app_host = "https://www.facebook.com"
      facebook_url = "https://www.facebook.com"
      # visit "https://www.facebook.com" unless current_url.start_with?(facebook_url)
      login(url)
      raise Forki::InvalidUrlError unless url.start_with?(facebook_url)
      visit url unless current_url.start_with?(url)
    end

    # Extracts an integer out of a string describing a number
    # e.g. "4K Comments" returns 4000
    # e.g. "131 Shares" returns 131
    def extract_int_from_num_element(element)
      return unless element

      if element.class != String # if an html element was passed in
        element = element.text(:all)
      end

      num_pattern = /[0-9KM ,.]+/
      interaction_num_text = num_pattern.match(element)[0]

      if interaction_num_text.include?(".") # e.g. "2.2K" => 2200
        interaction_num_text.to_i * 1000 + interaction_num_text[-2].to_i * 100
      elsif interaction_num_text.include?("K") # e.g. "13K"
        interaction_num_text.to_i * 1000
      elsif interaction_num_text.include?("M") # e.g. "13M"
        interaction_num_text.to_i * 1_000_000
      else # e.g. "15,443"
        interaction_num_text.delete!(",")
        interaction_num_text.delete(" ").to_i
      end
    end
  end
end

require_relative "post_scraper"
require_relative "user_scraper"
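
For clarity, here is a standalone sketch (not gem code) of the scan that find_graphql_data_strings and find_graphql_data_closure_index perform together: locate each '"data":{' marker, cut out the balanced JSON object by counting braces, and recurse on the remainder. The input string below is invented:

# Illustrative sketch mirroring the brace-matching logic above.
html = 'x{"data":{"a":{"b":1}},"y":2} z "data":{"c":3}'

def data_blocks(str, found = [])
  start = str.index('"data":{')
  return found if start.nil?

  i = start + 8 # first character after the opening brace
  depth = 1
  until depth.zero?
    depth += 1 if str[i] == "{"
    depth -= 1 if str[i] == "}"
    i += 1
  end

  found << str[start...i].delete_prefix('"data":')
  data_blocks(str[i..], found)
end

p data_blocks(html) # => ["{\"a\":{\"b\":1}}", "{\"c\":3}"]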
data/lib/forki/scrapers/user_scraper.rb
@@ -0,0 +1,94 @@
require "typhoeus"

module Forki
  class UserScraper < Scraper
    # Finds and returns the number of people who like the current page
    def find_number_of_likes
      likes_pattern = /[0-9,.KM ]+ people like this/
      number_of_likes_elem = all("span").filter { |span| likes_pattern.match? span.text }.first
      extract_int_from_num_element(number_of_likes_elem)
    end

    # Finds and returns the number of people who follow the current page
    def find_number_of_followers(profile_details_string)
      followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]+) people/
      alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
      number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
      return nil if number_of_followers_match.nil?
      extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
    end

    def find_number_followers_for_normal_profile(profile_followers_node)
      followers_string = profile_followers_node["node"]["timeline_context_item"]["renderer"]["context_item"]["title"]["text"]
      followers_pattern = /[0-9,]+/
      number_of_followers_match = followers_pattern.match(followers_string).to_s
      extract_int_from_num_element(number_of_followers_match)
    end

    # Returns a hash of details about a Facebook user profile
    def extract_profile_details(graphql_strings)
      profile_header_str = graphql_strings.find { |gql| gql.include? "profile_header_renderer" }
      profile_intro_str = graphql_strings.find { |g| g.include? "profile_intro_card" }
      profile_header_obj = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]
      profile_intro_obj = profile_intro_str ? JSON.parse(profile_intro_str) : nil

      number_of_followers = find_number_of_followers(profile_header_str)

      # Check if the user shows followers count
      if number_of_followers.nil?
        profile_title_section = graphql_strings.find { |gql| gql.include? "profile_tile_section_type" }

        json = JSON.parse(profile_title_section)
        followers_node = json["user"]["profile_tile_sections"]["edges"].first["node"]["profile_tile_views"]["nodes"][1]["view_style_renderer"]["view"]["profile_tile_items"]["nodes"].select do |node|
          node["node"]["timeline_context_item"]["timeline_context_list_item_type"] == "INTRO_CARD_FOLLOWERS"
        end
        if followers_node.empty?
          number_of_followers = nil
        else
          number_of_followers = find_number_followers_for_normal_profile(followers_node.first)
        end
      end

      {
        id: profile_header_obj["user"]["id"],
        number_of_followers: number_of_followers,
        name: profile_header_obj["user"]["name"],
        verified: profile_header_obj["user"]["is_verified"],
        profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
        profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
      }
    end

    # Returns a hash of details about a Facebook page
    def extract_page_details(graphql_strings)
      page_cards_string = graphql_strings.find { |graphql_string| (graphql_string.include? "comet_page_cards") && \
                                                                  (graphql_string.include? "follower_count") }
      page_cards_list = JSON.parse(page_cards_string)["page"]["comet_page_cards"]
      page_about_card = page_cards_list.find { |card| card["__typename"] == "CometPageAboutCardWithoutMapRenderer" }
      viewer_page_object = JSON.parse(graphql_strings.find { |graphql_string| (graphql_string.include? "profile_photo") && \
                                                                              graphql_string.include?("is_verified") })
      {
        id: page_about_card["page"]["id"],
        profile: page_about_card["page"]["page_about_fields"]["blurb"],
        number_of_followers: page_about_card["page"]["follower_count"],
        name: page_about_card["page"]["name"],
        verified: viewer_page_object["page"]["is_verified"],
        profile_image_url: viewer_page_object["page"]["profile_picture"]["uri"],
        number_of_likes: page_about_card["page"]["page_likers"]["global_likers_count"],
      }
    end

    # Uses GraphQL data and DOM elements to collect information about the current user page
    def parse(url)
      validate_and_load_page(url)
      graphql_strings = find_graphql_data_strings(page.html)
      is_page = graphql_strings.map { |s| JSON.parse(s) }.any? { |o| o.key?("page") }
      user_details = is_page ? extract_page_details(graphql_strings) : extract_profile_details(graphql_strings)

      user_details[:profile_image_file] = Forki.retrieve_media(user_details[:profile_image_url])
      user_details[:profile_link] = url

      user_details
    end
  end
end
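
As an illustration of the two follower patterns in find_number_of_followers above, here is a small check against sample strings (the strings are hypothetical, and the patterns are copied from the code):

# Illustrative sketch, not part of the gem.
followers_pattern    = /Followed by (?<num_followers>[0-9,.KM ]+) people/
alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/

["Followed by 1,234 people", "5.6K followers"].each do |text|
  match = followers_pattern.match(text) || alt_follower_pattern.match(text)
  puts "#{text.inspect} -> #{match[:num_followers].strip.inspect}"
end
# "Followed by 1,234 people" -> "1,234"
# "5.6K followers" -> "5.6K"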