forki 0.2.5 → 0.2.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +12 -10
- data/lib/forki/scrapers/post_scraper.rb +92 -25
- data/lib/forki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78986561738c2e71c7504b8d4810c790e1cefa212b97358eae17accf4b1c2131
|
4
|
+
data.tar.gz: 9f72cf4a6496e4c40f3d47d566c2ec9fd1096edf8e54257223a6b26d96f0c9b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a6fd3e3328b1c1c17d8d4bcab492588d88d7bcfc6feb20f1e85c5b75dfacc24b75c519d388353d733bf2ef0c6899f9c1804f82736d9bdb28971f21299159fed2
|
7
|
+
data.tar.gz: eb1f2608844ba87fc294d38091b6327de25e35f505ac1e873f5a56f797994de57fdf2a1e80d4781821330ff05cee15bc4f0d0fbe2f7a1b72ab7abab6806f7765
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.2.
|
4
|
+
forki (0.2.5)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -29,12 +29,13 @@ GEM
|
|
29
29
|
i18n (>= 1.6, < 2)
|
30
30
|
minitest (>= 5.1)
|
31
31
|
tzinfo (~> 2.0)
|
32
|
-
addressable (2.8.
|
32
|
+
addressable (2.8.6)
|
33
33
|
public_suffix (>= 2.0.2, < 6.0)
|
34
34
|
apparition (0.6.0)
|
35
35
|
capybara (~> 3.13, < 4)
|
36
36
|
websocket-driver (>= 0.6.5)
|
37
37
|
ast (2.4.2)
|
38
|
+
bigdecimal (3.1.5)
|
38
39
|
builder (3.2.4)
|
39
40
|
byebug (11.1.3)
|
40
41
|
capybara (3.39.2)
|
@@ -53,7 +54,7 @@ GEM
|
|
53
54
|
erubi (1.12.0)
|
54
55
|
ethon (0.16.0)
|
55
56
|
ffi (>= 1.15.0)
|
56
|
-
ffi (1.
|
57
|
+
ffi (1.16.3)
|
57
58
|
i18n (1.13.0)
|
58
59
|
concurrent-ruby (~> 1.0)
|
59
60
|
json (2.6.3)
|
@@ -62,15 +63,16 @@ GEM
|
|
62
63
|
nokogiri (>= 1.12.0)
|
63
64
|
matrix (0.4.2)
|
64
65
|
method_source (1.0.0)
|
65
|
-
mini_mime (1.1.
|
66
|
+
mini_mime (1.1.5)
|
66
67
|
minitest (5.18.0)
|
67
68
|
nokogiri (1.15.1-arm64-darwin)
|
68
69
|
racc (~> 1.4)
|
69
|
-
oj (3.
|
70
|
+
oj (3.16.3)
|
71
|
+
bigdecimal (>= 3.0)
|
70
72
|
parallel (1.23.0)
|
71
73
|
parser (3.2.2.1)
|
72
74
|
ast (~> 2.4.1)
|
73
|
-
public_suffix (5.0.
|
75
|
+
public_suffix (5.0.4)
|
74
76
|
racc (1.6.2)
|
75
77
|
rack (2.2.4)
|
76
78
|
rack-test (2.1.0)
|
@@ -90,7 +92,7 @@ GEM
|
|
90
92
|
rainbow (3.1.1)
|
91
93
|
rake (13.0.6)
|
92
94
|
regexp_parser (2.8.0)
|
93
|
-
rexml (3.2.
|
95
|
+
rexml (3.2.6)
|
94
96
|
rubocop (1.51.0)
|
95
97
|
json (~> 2.3)
|
96
98
|
parallel (~> 1.10)
|
@@ -127,17 +129,17 @@ GEM
|
|
127
129
|
rubocop-rails (~> 2.0)
|
128
130
|
ruby-progressbar (1.13.0)
|
129
131
|
rubyzip (2.3.2)
|
130
|
-
selenium-webdriver (4.
|
132
|
+
selenium-webdriver (4.16.0)
|
131
133
|
rexml (~> 3.2, >= 3.2.5)
|
132
134
|
rubyzip (>= 1.2.2, < 3.0)
|
133
135
|
websocket (~> 1.0)
|
134
136
|
thor (1.2.2)
|
135
|
-
typhoeus (1.4.
|
137
|
+
typhoeus (1.4.1)
|
136
138
|
ethon (>= 0.9.0)
|
137
139
|
tzinfo (2.0.6)
|
138
140
|
concurrent-ruby (~> 1.0)
|
139
141
|
unicode-display_width (2.4.2)
|
140
|
-
websocket (1.2.
|
142
|
+
websocket (1.2.10)
|
141
143
|
websocket-driver (0.7.6)
|
142
144
|
websocket-extensions (>= 0.1.0)
|
143
145
|
websocket-extensions (0.1.5)
|
@@ -65,14 +65,23 @@ module Forki
|
|
65
65
|
graphql_objects.any? do |graphql_object| # if any GraphQL objects contain the top-level keys above, return true
|
66
66
|
return true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
|
67
67
|
return true unless graphql_object.fetch("currMedia", nil).nil?
|
68
|
+
return true unless graphql_object.fetch("photo_image", nil).nil?
|
68
69
|
|
69
70
|
# This is a complicated form for `web.facebook.com` posts
|
70
|
-
|
71
71
|
if !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil?
|
72
72
|
if graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].count.positive?
|
73
73
|
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "all_subattachments", "nodes")&.first&.dig("media", "image", "uri").nil?
|
74
74
|
end
|
75
75
|
end
|
76
|
+
|
77
|
+
# Another weird format
|
78
|
+
begin
|
79
|
+
if !graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].empty?
|
80
|
+
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "media", "photo_image", "uri").nil?
|
81
|
+
end
|
82
|
+
rescue StandardError
|
83
|
+
|
84
|
+
end
|
76
85
|
end
|
77
86
|
end
|
78
87
|
|
@@ -157,7 +166,7 @@ module Forki
|
|
157
166
|
graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
|
158
167
|
|
159
168
|
# Once in awhile it's really easy
|
160
|
-
video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
|
169
|
+
video_objects = graphql_object_array.filter { |go| go.has_key?("video") }
|
161
170
|
|
162
171
|
if VideoSieve.can_process_with_sieve?(graphql_object_array)
|
163
172
|
# Eventually all of this complexity will be replaced with this
|
@@ -170,9 +179,15 @@ module Forki
|
|
170
179
|
return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
|
171
180
|
|
172
181
|
if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
|
173
|
-
|
174
|
-
|
175
|
-
|
182
|
+
media_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]
|
183
|
+
if media_object.has_key?("video")
|
184
|
+
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
|
185
|
+
elsif media_object.has_key?("media") && media_object["media"].has_key?("browser_native_sd_url")
|
186
|
+
video_object = media_object["media"]
|
187
|
+
end
|
188
|
+
|
189
|
+
creation_date = video_object["publish_time"] if video_object&.has_key?("publish_time")
|
190
|
+
creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["publish_time"] if creation_date.nil?
|
176
191
|
elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
|
177
192
|
# For "Reels" we need a separate way to parse this
|
178
193
|
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
|
@@ -181,13 +196,20 @@ module Forki
|
|
181
196
|
raise "Unable to parse video object" if video_objects.empty?
|
182
197
|
end
|
183
198
|
|
184
|
-
|
199
|
+
begin
|
200
|
+
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
|
201
|
+
rescue NoMethodError
|
202
|
+
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]
|
203
|
+
end
|
204
|
+
|
185
205
|
if feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"].key?("cannot_see_top_custom_reactions")
|
186
206
|
reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
187
207
|
else
|
188
208
|
reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["top_reactions"])
|
189
209
|
end
|
190
210
|
|
211
|
+
feedback_object = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
212
|
+
|
191
213
|
share_count_object = feedback_object.fetch("share_count", {})
|
192
214
|
|
193
215
|
if story_node_object["comet_sections"]["content"]["story"]["comet_sections"].key? "message"
|
@@ -202,20 +224,31 @@ module Forki
|
|
202
224
|
else
|
203
225
|
num_comments = feedback_object["comment_list_renderer"]["feedback"]["total_comment_count"]
|
204
226
|
end
|
227
|
+
|
228
|
+
view_count = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"]
|
229
|
+
reshare_warning = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
230
|
+
elsif feedback_object.has_key?("comments_count_summary_renderer")
|
231
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
232
|
+
|
233
|
+
view_count = feedback_object["video_view_count"]
|
234
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
205
235
|
else
|
206
236
|
if feedback_object["feedback"].key?("comment_count")
|
207
237
|
num_comments = feedback_object["feedback"]["comment_count"]["total_count"]
|
208
238
|
else
|
209
239
|
num_comments = feedback_object["feedback"]["total_comment_count"]
|
210
240
|
end
|
241
|
+
|
242
|
+
view_count = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"]
|
243
|
+
reshare_warning = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
211
244
|
end
|
212
245
|
|
213
246
|
post_details = {
|
214
247
|
id: video_object["id"],
|
215
248
|
num_comments: num_comments,
|
216
249
|
num_shares: share_count_object.fetch("count", nil),
|
217
|
-
num_views:
|
218
|
-
reshare_warning:
|
250
|
+
num_views: view_count,
|
251
|
+
reshare_warning: reshare_warning,
|
219
252
|
video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
220
253
|
video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
221
254
|
text: text,
|
@@ -242,9 +275,15 @@ module Forki
|
|
242
275
|
|
243
276
|
share_count_object = feedback_object.fetch("share_count", {})
|
244
277
|
|
278
|
+
if feedback_object["comments_count_summary_renderer"]["feedback"].has_key?("comment_rendering_instance")
|
279
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
280
|
+
else
|
281
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"]
|
282
|
+
end
|
283
|
+
|
245
284
|
post_details = {
|
246
285
|
id: video_object["id"],
|
247
|
-
num_comments:
|
286
|
+
num_comments: num_comments,
|
248
287
|
num_shares: share_count_object.fetch("count", nil),
|
249
288
|
num_views: feedback_object["video_view_count"],
|
250
289
|
reshare_warning: feedback_object["should_show_reshare_warning"],
|
@@ -265,31 +304,55 @@ module Forki
|
|
265
304
|
# Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above
|
266
305
|
def extract_image_post_data(graphql_object_array)
|
267
306
|
# This is a weird one-off style
|
307
|
+
|
268
308
|
graphql_object = graphql_object_array.find { |graphql_object| !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil? }
|
269
|
-
unless graphql_object.nil? || graphql_object.count
|
309
|
+
unless graphql_object.nil? || graphql_object.count.zero?
|
310
|
+
# TODO: These two branches are *super* similar, probably a lot of overlap
|
270
311
|
attachments = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
|
271
312
|
|
272
|
-
if graphql_object["node"]["comet_sections"]["feedback"]["story"]
|
273
|
-
|
313
|
+
if graphql_object["node"]["comet_sections"]["feedback"]["story"].key?("feedback_context")
|
314
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
315
|
+
elsif graphql_object["node"]["comet_sections"]["feedback"]["story"].has_key?("comet_feed_ufi_container")
|
316
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
274
317
|
else
|
275
|
-
|
318
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
319
|
+
end
|
320
|
+
|
321
|
+
|
322
|
+
if feedback_object.has_key?("cannot_see_top_custom_reactions")
|
323
|
+
reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
|
324
|
+
else
|
325
|
+
reaction_counts = extract_reaction_counts(feedback_object["top_reactions"])
|
276
326
|
end
|
277
327
|
|
278
328
|
id = graphql_object["node"]["post_id"]
|
279
|
-
num_comments =
|
280
|
-
reshare_warning =
|
281
|
-
|
329
|
+
num_comments = feedback_object["share_count"]["count"]
|
330
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
331
|
+
|
332
|
+
if attachments.first["styles"]["attachment"].key?("all_subattachments")
|
333
|
+
image_url = attachments.first["styles"]["attachment"]["all_subattachments"]["nodes"].first["media"]["image"]["uri"]
|
334
|
+
else
|
335
|
+
image_url = attachments.first["styles"]["attachment"]["media"]["photo_image"]["uri"]
|
336
|
+
end
|
337
|
+
|
282
338
|
text = graphql_object["node"]["comet_sections"]["content"]["story"]["message"]["text"]
|
283
339
|
profile_link = graphql_object["node"]["comet_sections"]["content"]["story"]["actors"].first["url"]
|
284
340
|
created_at = graphql_object["node"]["comet_sections"]["content"]["story"]["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"].first["story"]["creation_time"]
|
285
341
|
has_video = false
|
286
342
|
else
|
287
|
-
|
288
343
|
graphql_object_array.find { |graphql_object| graphql_object.key?("viewer_actor") && graphql_object.key?("display_comments") }
|
289
344
|
curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
|
290
345
|
creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }
|
291
346
|
|
292
347
|
feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
348
|
+
|
349
|
+
if feedback_object.key?("top_reactions")
|
350
|
+
feedback_object = feedback_object
|
351
|
+
else
|
352
|
+
# POSSIBLY OUT OF DATE
|
353
|
+
feedback_object = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
354
|
+
end
|
355
|
+
|
293
356
|
share_count_object = feedback_object.fetch("share_count", {})
|
294
357
|
|
295
358
|
poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]
|
@@ -301,16 +364,21 @@ module Forki
|
|
301
364
|
end
|
302
365
|
|
303
366
|
id = curr_media_object["currMedia"]["id"],
|
367
|
+
|
304
368
|
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
text = (creation_story_object["message"] || {}).fetch("text", nil),
|
309
|
-
profile_link = poster["url"],
|
310
|
-
created_at = curr_media_object["currMedia"]["created_time"],
|
311
|
-
has_video = false
|
369
|
+
if num_comments.nil? && feedback_object.has_key?("comments_count_summary_renderer")
|
370
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
371
|
+
end
|
312
372
|
|
373
|
+
num_shares = share_count_object.fetch("count", nil)
|
374
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
375
|
+
image_url = curr_media_object["currMedia"]["image"]["uri"]
|
376
|
+
text = (creation_story_object["message"] || {}).fetch("text", nil)
|
377
|
+
profile_link = poster["url"]
|
378
|
+
created_at = curr_media_object["currMedia"]["created_time"]
|
379
|
+
has_video = false
|
313
380
|
end
|
381
|
+
|
314
382
|
post_details = {
|
315
383
|
id: id,
|
316
384
|
num_comments: num_comments,
|
@@ -457,4 +525,3 @@ module Forki
|
|
457
525
|
end
|
458
526
|
|
459
527
|
require_relative "sieves/video_sieves/video_sieve"
|
460
|
-
|
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|