forki 0.2.4 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +12 -10
- data/lib/forki/scrapers/post_scraper.rb +138 -30
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve.rb +1 -1
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_reel.rb +9 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_reel_2.rb +88 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb +7 -3
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page_2.rb +70 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb +7 -1
- data/lib/forki/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78986561738c2e71c7504b8d4810c790e1cefa212b97358eae17accf4b1c2131
|
4
|
+
data.tar.gz: 9f72cf4a6496e4c40f3d47d566c2ec9fd1096edf8e54257223a6b26d96f0c9b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a6fd3e3328b1c1c17d8d4bcab492588d88d7bcfc6feb20f1e85c5b75dfacc24b75c519d388353d733bf2ef0c6899f9c1804f82736d9bdb28971f21299159fed2
|
7
|
+
data.tar.gz: eb1f2608844ba87fc294d38091b6327de25e35f505ac1e873f5a56f797994de57fdf2a1e80d4781821330ff05cee15bc4f0d0fbe2f7a1b72ab7abab6806f7765
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.2.
|
4
|
+
forki (0.2.5)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -29,12 +29,13 @@ GEM
|
|
29
29
|
i18n (>= 1.6, < 2)
|
30
30
|
minitest (>= 5.1)
|
31
31
|
tzinfo (~> 2.0)
|
32
|
-
addressable (2.8.
|
32
|
+
addressable (2.8.6)
|
33
33
|
public_suffix (>= 2.0.2, < 6.0)
|
34
34
|
apparition (0.6.0)
|
35
35
|
capybara (~> 3.13, < 4)
|
36
36
|
websocket-driver (>= 0.6.5)
|
37
37
|
ast (2.4.2)
|
38
|
+
bigdecimal (3.1.5)
|
38
39
|
builder (3.2.4)
|
39
40
|
byebug (11.1.3)
|
40
41
|
capybara (3.39.2)
|
@@ -53,7 +54,7 @@ GEM
|
|
53
54
|
erubi (1.12.0)
|
54
55
|
ethon (0.16.0)
|
55
56
|
ffi (>= 1.15.0)
|
56
|
-
ffi (1.
|
57
|
+
ffi (1.16.3)
|
57
58
|
i18n (1.13.0)
|
58
59
|
concurrent-ruby (~> 1.0)
|
59
60
|
json (2.6.3)
|
@@ -62,15 +63,16 @@ GEM
|
|
62
63
|
nokogiri (>= 1.12.0)
|
63
64
|
matrix (0.4.2)
|
64
65
|
method_source (1.0.0)
|
65
|
-
mini_mime (1.1.
|
66
|
+
mini_mime (1.1.5)
|
66
67
|
minitest (5.18.0)
|
67
68
|
nokogiri (1.15.1-arm64-darwin)
|
68
69
|
racc (~> 1.4)
|
69
|
-
oj (3.
|
70
|
+
oj (3.16.3)
|
71
|
+
bigdecimal (>= 3.0)
|
70
72
|
parallel (1.23.0)
|
71
73
|
parser (3.2.2.1)
|
72
74
|
ast (~> 2.4.1)
|
73
|
-
public_suffix (5.0.
|
75
|
+
public_suffix (5.0.4)
|
74
76
|
racc (1.6.2)
|
75
77
|
rack (2.2.4)
|
76
78
|
rack-test (2.1.0)
|
@@ -90,7 +92,7 @@ GEM
|
|
90
92
|
rainbow (3.1.1)
|
91
93
|
rake (13.0.6)
|
92
94
|
regexp_parser (2.8.0)
|
93
|
-
rexml (3.2.
|
95
|
+
rexml (3.2.6)
|
94
96
|
rubocop (1.51.0)
|
95
97
|
json (~> 2.3)
|
96
98
|
parallel (~> 1.10)
|
@@ -127,17 +129,17 @@ GEM
|
|
127
129
|
rubocop-rails (~> 2.0)
|
128
130
|
ruby-progressbar (1.13.0)
|
129
131
|
rubyzip (2.3.2)
|
130
|
-
selenium-webdriver (4.
|
132
|
+
selenium-webdriver (4.16.0)
|
131
133
|
rexml (~> 3.2, >= 3.2.5)
|
132
134
|
rubyzip (>= 1.2.2, < 3.0)
|
133
135
|
websocket (~> 1.0)
|
134
136
|
thor (1.2.2)
|
135
|
-
typhoeus (1.4.
|
137
|
+
typhoeus (1.4.1)
|
136
138
|
ethon (>= 0.9.0)
|
137
139
|
tzinfo (2.0.6)
|
138
140
|
concurrent-ruby (~> 1.0)
|
139
141
|
unicode-display_width (2.4.2)
|
140
|
-
websocket (1.2.
|
142
|
+
websocket (1.2.10)
|
141
143
|
websocket-driver (0.7.6)
|
142
144
|
websocket-extensions (>= 0.1.0)
|
143
145
|
websocket-extensions (0.1.5)
|
data/lib/forki/scrapers/post_scraper.rb
CHANGED
@@ -65,14 +65,23 @@ module Forki
|
|
65
65
|
graphql_objects.any? do |graphql_object| # if any GraphQL objects contain the top-level keys above, return true
|
66
66
|
return true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
|
67
67
|
return true unless graphql_object.fetch("currMedia", nil).nil?
|
68
|
+
return true unless graphql_object.fetch("photo_image", nil).nil?
|
68
69
|
|
69
70
|
# This is a complicated form for `web.facebook.com` posts
|
70
|
-
|
71
71
|
if !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil?
|
72
72
|
if graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].count.positive?
|
73
73
|
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "all_subattachments", "nodes")&.first&.dig("media", "image", "uri").nil?
|
74
74
|
end
|
75
75
|
end
|
76
|
+
|
77
|
+
# Another weird format
|
78
|
+
begin
|
79
|
+
if !graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].empty?
|
80
|
+
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "media", "photo_image", "uri").nil?
|
81
|
+
end
|
82
|
+
rescue StandardError
|
83
|
+
|
84
|
+
end
|
76
85
|
end
|
77
86
|
end
|
78
87
|
|
@@ -157,7 +166,7 @@ module Forki
|
|
157
166
|
graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
|
158
167
|
|
159
168
|
# Once in awhile it's really easy
|
160
|
-
video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
|
169
|
+
video_objects = graphql_object_array.filter { |go| go.has_key?("video") }
|
161
170
|
|
162
171
|
if VideoSieve.can_process_with_sieve?(graphql_object_array)
|
163
172
|
# Eventually all of this complexity will be replaced with this
|
@@ -170,9 +179,15 @@ module Forki
|
|
170
179
|
return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
|
171
180
|
|
172
181
|
if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
|
173
|
-
|
174
|
-
|
175
|
-
|
182
|
+
media_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]
|
183
|
+
if media_object.has_key?("video")
|
184
|
+
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
|
185
|
+
elsif media_object.has_key?("media") && media_object["media"].has_key?("browser_native_sd_url")
|
186
|
+
video_object = media_object["media"]
|
187
|
+
end
|
188
|
+
|
189
|
+
creation_date = video_object["publish_time"] if video_object&.has_key?("publish_time")
|
190
|
+
creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["publish_time"] if creation_date.nil?
|
176
191
|
elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
|
177
192
|
# For "Reels" we need a separate way to parse this
|
178
193
|
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
|
@@ -181,8 +196,20 @@ module Forki
|
|
181
196
|
raise "Unable to parse video object" if video_objects.empty?
|
182
197
|
end
|
183
198
|
|
184
|
-
|
185
|
-
|
199
|
+
begin
|
200
|
+
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
|
201
|
+
rescue NoMethodError
|
202
|
+
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]
|
203
|
+
end
|
204
|
+
|
205
|
+
if feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"].key?("cannot_see_top_custom_reactions")
|
206
|
+
reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
207
|
+
else
|
208
|
+
reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["top_reactions"])
|
209
|
+
end
|
210
|
+
|
211
|
+
feedback_object = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
212
|
+
|
186
213
|
share_count_object = feedback_object.fetch("share_count", {})
|
187
214
|
|
188
215
|
if story_node_object["comet_sections"]["content"]["story"]["comet_sections"].key? "message"
|
@@ -191,15 +218,37 @@ module Forki
|
|
191
218
|
text = ""
|
192
219
|
end
|
193
220
|
|
194
|
-
feedback_object
|
195
|
-
|
221
|
+
if feedback_object.has_key?("comment_list_renderer")
|
222
|
+
if feedback_object["comment_list_renderer"]["feedback"].key?("comment_count")
|
223
|
+
num_comments = feedback_object["comment_list_renderer"]["feedback"]["comment_count"]["total_count"]
|
224
|
+
else
|
225
|
+
num_comments = feedback_object["comment_list_renderer"]["feedback"]["total_comment_count"]
|
226
|
+
end
|
227
|
+
|
228
|
+
view_count = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"]
|
229
|
+
reshare_warning = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
230
|
+
elsif feedback_object.has_key?("comments_count_summary_renderer")
|
231
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
232
|
+
|
233
|
+
view_count = feedback_object["video_view_count"]
|
234
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
235
|
+
else
|
236
|
+
if feedback_object["feedback"].key?("comment_count")
|
237
|
+
num_comments = feedback_object["feedback"]["comment_count"]["total_count"]
|
238
|
+
else
|
239
|
+
num_comments = feedback_object["feedback"]["total_comment_count"]
|
240
|
+
end
|
241
|
+
|
242
|
+
view_count = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"]
|
243
|
+
reshare_warning = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
244
|
+
end
|
196
245
|
|
197
246
|
post_details = {
|
198
247
|
id: video_object["id"],
|
199
248
|
num_comments: num_comments,
|
200
249
|
num_shares: share_count_object.fetch("count", nil),
|
201
|
-
num_views:
|
202
|
-
reshare_warning:
|
250
|
+
num_views: view_count,
|
251
|
+
reshare_warning: reshare_warning,
|
203
252
|
video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
204
253
|
video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
205
254
|
text: text,
|
@@ -217,12 +266,24 @@ module Forki
|
|
217
266
|
sidepane_object = graphql_object_array.find { |graphql_object| graphql_object.key?("tahoe_sidepane_renderer") }
|
218
267
|
video_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("video") }
|
219
268
|
feedback_object = sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]
|
220
|
-
|
269
|
+
|
270
|
+
if sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"].key?("cannot_see_top_custom_reactions")
|
271
|
+
reaction_counts = extract_reaction_counts(sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
272
|
+
else # if the video has no reactions, it will have a different structure
|
273
|
+
reaction_counts = extract_reaction_counts(sidepane_object["tahoe_sidepane_renderer"]["video"]["feedback"]["top_reactions"])
|
274
|
+
end
|
275
|
+
|
221
276
|
share_count_object = feedback_object.fetch("share_count", {})
|
222
277
|
|
278
|
+
if feedback_object["comments_count_summary_renderer"]["feedback"].has_key?("comment_rendering_instance")
|
279
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
280
|
+
else
|
281
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"]
|
282
|
+
end
|
283
|
+
|
223
284
|
post_details = {
|
224
285
|
id: video_object["id"],
|
225
|
-
num_comments:
|
286
|
+
num_comments: num_comments,
|
226
287
|
num_shares: share_count_object.fetch("count", nil),
|
227
288
|
num_views: feedback_object["video_view_count"],
|
228
289
|
reshare_warning: feedback_object["should_show_reshare_warning"],
|
@@ -243,42 +304,81 @@ module Forki
|
|
243
304
|
# Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above
|
244
305
|
def extract_image_post_data(graphql_object_array)
|
245
306
|
# This is a weird one-off style
|
307
|
+
|
246
308
|
graphql_object = graphql_object_array.find { |graphql_object| !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil? }
|
247
|
-
unless graphql_object.nil? || graphql_object.count
|
309
|
+
unless graphql_object.nil? || graphql_object.count.zero?
|
310
|
+
# TODO: These two branches are *super* similar, probably a lot of overlap
|
248
311
|
attachments = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
|
249
312
|
|
250
|
-
|
313
|
+
if graphql_object["node"]["comet_sections"]["feedback"]["story"].key?("feedback_context")
|
314
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
315
|
+
elsif graphql_object["node"]["comet_sections"]["feedback"]["story"].has_key?("comet_feed_ufi_container")
|
316
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
317
|
+
else
|
318
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
319
|
+
end
|
320
|
+
|
321
|
+
|
322
|
+
if feedback_object.has_key?("cannot_see_top_custom_reactions")
|
323
|
+
reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
|
324
|
+
else
|
325
|
+
reaction_counts = extract_reaction_counts(feedback_object["top_reactions"])
|
326
|
+
end
|
327
|
+
|
251
328
|
id = graphql_object["node"]["post_id"]
|
252
|
-
num_comments =
|
253
|
-
reshare_warning =
|
254
|
-
|
329
|
+
num_comments = feedback_object["share_count"]["count"]
|
330
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
331
|
+
|
332
|
+
if attachments.first["styles"]["attachment"].key?("all_subattachments")
|
333
|
+
image_url = attachments.first["styles"]["attachment"]["all_subattachments"]["nodes"].first["media"]["image"]["uri"]
|
334
|
+
else
|
335
|
+
image_url = attachments.first["styles"]["attachment"]["media"]["photo_image"]["uri"]
|
336
|
+
end
|
337
|
+
|
255
338
|
text = graphql_object["node"]["comet_sections"]["content"]["story"]["message"]["text"]
|
256
339
|
profile_link = graphql_object["node"]["comet_sections"]["content"]["story"]["actors"].first["url"]
|
257
340
|
created_at = graphql_object["node"]["comet_sections"]["content"]["story"]["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"].first["story"]["creation_time"]
|
258
341
|
has_video = false
|
259
342
|
else
|
260
|
-
|
261
343
|
graphql_object_array.find { |graphql_object| graphql_object.key?("viewer_actor") && graphql_object.key?("display_comments") }
|
262
344
|
curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
|
263
345
|
creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }
|
264
346
|
|
265
347
|
feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
348
|
+
|
349
|
+
if feedback_object.key?("top_reactions")
|
350
|
+
feedback_object = feedback_object
|
351
|
+
else
|
352
|
+
# POSSIBLY OUT OF DATE
|
353
|
+
feedback_object = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
354
|
+
end
|
355
|
+
|
266
356
|
share_count_object = feedback_object.fetch("share_count", {})
|
267
357
|
|
268
358
|
poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]
|
269
359
|
|
270
|
-
|
360
|
+
if feedback_object.has_key?("cannot_see_top_custom_reactions")
|
361
|
+
reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
|
362
|
+
else
|
363
|
+
reaction_counts = extract_reaction_counts(feedback_object["top_reactions"])
|
364
|
+
end
|
365
|
+
|
271
366
|
id = curr_media_object["currMedia"]["id"],
|
367
|
+
|
272
368
|
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
text = (creation_story_object["message"] || {}).fetch("text", nil),
|
277
|
-
profile_link = poster["url"],
|
278
|
-
created_at = curr_media_object["currMedia"]["created_time"],
|
279
|
-
has_video = false
|
369
|
+
if num_comments.nil? && feedback_object.has_key?("comments_count_summary_renderer")
|
370
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
371
|
+
end
|
280
372
|
|
373
|
+
num_shares = share_count_object.fetch("count", nil)
|
374
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
375
|
+
image_url = curr_media_object["currMedia"]["image"]["uri"]
|
376
|
+
text = (creation_story_object["message"] || {}).fetch("text", nil)
|
377
|
+
profile_link = poster["url"]
|
378
|
+
created_at = curr_media_object["currMedia"]["created_time"]
|
379
|
+
has_video = false
|
281
380
|
end
|
381
|
+
|
282
382
|
post_details = {
|
283
383
|
id: id,
|
284
384
|
num_comments: num_comments,
|
@@ -303,7 +403,12 @@ module Forki
|
|
303
403
|
(graphql_string.include?("live_status")) })
|
304
404
|
video_permalink = creation_story_object["creation_story"]["shareable"]["url"].delete("\\")
|
305
405
|
media_object = video_object["video"]["story"]["attachments"][0]["media"]
|
306
|
-
|
406
|
+
|
407
|
+
if creation_story_object["feedback"].key?("cannot_see_top_custom_reactions")
|
408
|
+
reaction_counts = extract_reaction_counts(creation_story_object["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
409
|
+
else
|
410
|
+
reaction_counts = extract_reaction_counts(creation_story_object["feedback"]["top_reactions"])
|
411
|
+
end
|
307
412
|
|
308
413
|
post_details = {
|
309
414
|
id: video_object["id"],
|
@@ -331,7 +436,11 @@ module Forki
|
|
331
436
|
(graphql.include? "creation_story") })["video"]["creation_story"]
|
332
437
|
media_object = JSON.parse(graphql_strings.find { |graphql| graphql.include? "playable_url" })["video"]["creation_story"]["attachments"][0]["media"]
|
333
438
|
video_permalink = creation_story_object["shareable"]["url"].delete("\\")
|
334
|
-
|
439
|
+
if creation_story_object["feedback_context"]["feedback_target_with_context"].key?("cannot_see_top_custom_reactions")
|
440
|
+
reaction_counts = extract_reaction_counts(creation_story_object["feedback_context"]["feedback_target_with_context"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
441
|
+
else
|
442
|
+
reaction_counts = extract_reaction_counts(creation_story_object["feedback_context"]["feedback_target_with_context"]["top_reactions"])
|
443
|
+
end
|
335
444
|
|
336
445
|
post_details = {
|
337
446
|
id: creation_story_object["shareable"]["id"],
|
@@ -416,4 +525,3 @@ module Forki
|
|
416
525
|
end
|
417
526
|
|
418
527
|
require_relative "sieves/video_sieves/video_sieve"
|
419
|
-
|
data/lib/forki/scrapers/sieves/video_sieves/video_sieve.rb
CHANGED
@@ -14,7 +14,7 @@ class VideoSieve
|
|
14
14
|
private
|
15
15
|
|
16
16
|
def self.sieve_class_for_graphql_objects(graphql_objects)
|
17
|
-
sieves = [VideoSieveWatchTab, VideoSieveVideoPage, VideoSieveReel]
|
17
|
+
sieves = [VideoSieveWatchTab, VideoSieveVideoPage, VideoSieveVideoPage2, VideoSieveReel, VideoSieveReel2]
|
18
18
|
sieves.detect { |sieve| sieve.check(graphql_objects) }
|
19
19
|
end
|
20
20
|
end
|
data/lib/forki/scrapers/sieves/video_sieves/video_sieve_reel.rb
CHANGED
@@ -4,6 +4,15 @@ class VideoSieveReel < VideoSieve
|
|
4
4
|
video_object = self.extractor(graphql_objects)
|
5
5
|
|
6
6
|
return false unless video_object.has_key?("short_form_video_context")
|
7
|
+
|
8
|
+
# In relation to video_sieve_reel_2
|
9
|
+
comment_count = graphql_objects.filter do |go|
|
10
|
+
go = go.first if go.kind_of?(Array) && !go.empty?
|
11
|
+
!go.dig("feedback", "total_comment_count").nil?
|
12
|
+
end.first
|
13
|
+
|
14
|
+
return false unless comment_count.nil?
|
15
|
+
|
7
16
|
true
|
8
17
|
rescue StandardError
|
9
18
|
return false
|
data/lib/forki/scrapers/sieves/video_sieves/video_sieve_reel_2.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
class VideoSieveReel2 < VideoSieve
|
2
|
+
# To check if it's valid for the inputted graphql objects
|
3
|
+
def self.check(graphql_objects)
|
4
|
+
video_object = self.extractor(graphql_objects)
|
5
|
+
|
6
|
+
return false unless video_object.has_key?("short_form_video_context")
|
7
|
+
|
8
|
+
comment_count = graphql_objects.filter do |go|
|
9
|
+
go = go.first if go.kind_of?(Array) && !go.empty?
|
10
|
+
!go.dig("feedback", "total_comment_count").nil?
|
11
|
+
end.first
|
12
|
+
|
13
|
+
return false if comment_count.nil?
|
14
|
+
|
15
|
+
true
|
16
|
+
rescue StandardError
|
17
|
+
return false
|
18
|
+
end
|
19
|
+
|
20
|
+
# output the expected format of:
|
21
|
+
#
|
22
|
+
# post_details = {
|
23
|
+
# id: video_object["id"],
|
24
|
+
# num_comments: num_comments,
|
25
|
+
# num_shares: share_count_object.fetch("count", nil),
|
26
|
+
# num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
|
27
|
+
# reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
|
28
|
+
# video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
29
|
+
# video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
30
|
+
# text: text,
|
31
|
+
# created_at: creation_date,
|
32
|
+
# profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
33
|
+
# has_video: true
|
34
|
+
# }
|
35
|
+
# post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
|
36
|
+
# post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
|
37
|
+
# post_details[:reactions] = reaction_counts
|
38
|
+
|
39
|
+
def self.sieve(graphql_objects)
|
40
|
+
video_object = self.extractor(graphql_objects)
|
41
|
+
|
42
|
+
|
43
|
+
feedback_object = graphql_objects.filter do |go|
|
44
|
+
go = go.first if go.kind_of?(Array) && !go.empty?
|
45
|
+
!go.dig("feedback", "total_comment_count").nil?
|
46
|
+
end.first
|
47
|
+
|
48
|
+
reels_feedback_renderer = graphql_objects.filter do |go|
|
49
|
+
go.dig("reels_feedback_renderer")
|
50
|
+
end.first
|
51
|
+
|
52
|
+
reels_feedback_renderer["reels_feedback_renderer"]["story"]
|
53
|
+
reshare_warning = video_object["short_form_video_context"]["playback_video"].dig("warning_screen_renderer", "cix_screen", "view_model", "__typename") == "OverlayWarningScreenViewModel"
|
54
|
+
|
55
|
+
video_preview_image_url = video_object["short_form_video_context"]["playback_video"]["preferred_thumbnail"]["image"]["uri"]
|
56
|
+
video_url = video_object["short_form_video_context"]["playback_video"]["browser_native_hd_url"] || video_object["short_form_video_context"]["playback_video"]["browser_native_sd_url"]
|
57
|
+
|
58
|
+
post_details = {
|
59
|
+
id: video_object["short_form_video_context"]["video"]["id"],
|
60
|
+
num_comments: feedback_object["feedback"]["total_comment_count"],
|
61
|
+
num_shared: Forki::Scraper.extract_int_from_num_element(feedback_object["feedback"]["share_count_reduced"]),
|
62
|
+
num_views: nil,
|
63
|
+
reshare_warning: reshare_warning,
|
64
|
+
video_preview_image_url: video_preview_image_url,
|
65
|
+
video_url: video_url,
|
66
|
+
text: nil, # Reels don't have text
|
67
|
+
created_at: video_object["creation_time"],
|
68
|
+
profile_link: video_object["short_form_video_context"]["video_owner"]["url"],
|
69
|
+
has_video: true,
|
70
|
+
video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
|
71
|
+
video_file: Forki.retrieve_media(video_url),
|
72
|
+
reactions: nil # Only available on comments it seems? Look into this again sometime
|
73
|
+
}
|
74
|
+
rescue StandardError => e
|
75
|
+
debugger
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
def self.extractor(graphql_objects)
|
81
|
+
video_objects = graphql_objects.filter do |go|
|
82
|
+
go = go.first if go.kind_of?(Array) && !go.empty?
|
83
|
+
go.has_key?("video")
|
84
|
+
end
|
85
|
+
|
86
|
+
video_objects.first.dig("video", "creation_story")
|
87
|
+
end
|
88
|
+
end
|
data/lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb
CHANGED
@@ -4,9 +4,13 @@ class VideoSieveVideoPage < VideoSieve
|
|
4
4
|
story_node_object = self.extractor(graphql_objects) # This will error out
|
5
5
|
return false unless story_node_object["content"]["story"]["attachments"].first["styles"]["attachment"].has_key?("media")
|
6
6
|
|
7
|
+
feedback_object = story_node_object["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
8
|
+
# This is what differs from video_sieve_video_page_2.rb, where this key is unnested
|
9
|
+
return false unless feedback_object.has_key?("cannot_see_top_custom_reactions")
|
10
|
+
|
7
11
|
true
|
8
|
-
rescue StandardError
|
9
|
-
|
12
|
+
rescue StandardError => e
|
13
|
+
false
|
10
14
|
end
|
11
15
|
|
12
16
|
# output the expected format of:
|
@@ -39,7 +43,7 @@ class VideoSieveVideoPage < VideoSieve
|
|
39
43
|
video_url = video_object["browser_native_hd_url"]
|
40
44
|
video_url = video_object["browser_native_sd_url"] if video_url.nil?
|
41
45
|
|
42
|
-
|
46
|
+
{
|
43
47
|
id: video_object["id"],
|
44
48
|
num_comments: feedback_object["total_comment_count"],
|
45
49
|
num_shared: feedback_object["share_count"]["count"],
|
data/lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page_2.rb
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
class VideoSieveVideoPage2 < VideoSieve
|
2
|
+
# To check if it's valid for the inputted graphql objects
|
3
|
+
def self.check(graphql_objects)
|
4
|
+
story_node_object = self.extractor(graphql_objects) # This will error out
|
5
|
+
return false unless story_node_object["content"]["story"]["attachments"].first["styles"]["attachment"].has_key?("media")
|
6
|
+
|
7
|
+
feedback_object = story_node_object["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
8
|
+
# This is what differs from video_sieve_video_page.rb, where this key is nested further
|
9
|
+
return false unless feedback_object.has_key?("top_reactions")
|
10
|
+
|
11
|
+
true
|
12
|
+
rescue StandardError
|
13
|
+
false
|
14
|
+
end
|
15
|
+
|
16
|
+
# output the expected format of:
|
17
|
+
#
|
18
|
+
# post_details = {
|
19
|
+
# id: video_object["id"],
|
20
|
+
# num_comments: num_comments,
|
21
|
+
# num_shares: share_count_object.fetch("count", nil),
|
22
|
+
# num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
|
23
|
+
# reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
|
24
|
+
# video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
25
|
+
# video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
26
|
+
# text: text,
|
27
|
+
# created_at: creation_date,
|
28
|
+
# profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
29
|
+
# has_video: true
|
30
|
+
# }
|
31
|
+
# post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
|
32
|
+
# post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
|
33
|
+
# post_details[:reactions] = reaction_counts
|
34
|
+
|
35
|
+
def self.sieve(graphql_objects)
|
36
|
+
extracted_text = self.extractor(graphql_objects)
|
37
|
+
|
38
|
+
story_object = extracted_text["content"]["story"]
|
39
|
+
video_object = extracted_text["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
|
40
|
+
feedback_object = extracted_text["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
41
|
+
|
42
|
+
video_preview_image_url = video_object["preferred_thumbnail"]["image"]["uri"]
|
43
|
+
video_url = video_object["browser_native_hd_url"]
|
44
|
+
video_url = video_object["browser_native_sd_url"] if video_url.nil?
|
45
|
+
|
46
|
+
{
|
47
|
+
id: video_object["id"],
|
48
|
+
num_comments: feedback_object["total_comment_count"],
|
49
|
+
num_shared: feedback_object["share_count"]["count"],
|
50
|
+
num_views: nil,
|
51
|
+
reshare_warning: feedback_object["should_show_reshare_warning"],
|
52
|
+
video_preview_image_url: video_preview_image_url,
|
53
|
+
video_url: video_url,
|
54
|
+
text: story_object["message"]["text"],
|
55
|
+
created_at: video_object["publish_time"],
|
56
|
+
profile_link: story_object["actors"].first["url"],
|
57
|
+
has_video: true,
|
58
|
+
video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
|
59
|
+
video_file: Forki.retrieve_media(video_url),
|
60
|
+
reactions: feedback_object["top_reactions"]["edges"]
|
61
|
+
}
|
62
|
+
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
def self.extractor(graphql_objects)
|
67
|
+
story_node_object = graphql_objects.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
|
68
|
+
story_node_object["comet_sections"]
|
69
|
+
end
|
70
|
+
end
|
data/lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb
CHANGED
@@ -57,6 +57,12 @@ class VideoSieveWatchTab < VideoSieve
|
|
57
57
|
profile_link = filtered_json["attachments"].first["media"]["creation_story"]["comet_sections"]["title"]["story"]["actors"].first["url"]
|
58
58
|
end
|
59
59
|
|
60
|
+
if feedback_object.key?("cannot_see_top_custom_reactions")
|
61
|
+
reactions = feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
|
62
|
+
else
|
63
|
+
reactions = feedback_object["top_reactions"]["edges"]
|
64
|
+
end
|
65
|
+
|
60
66
|
post_details = {
|
61
67
|
id: video_object.dig("shareable", "id") || video_object["attachments"].first["media"]["id"],
|
62
68
|
num_comments: feedback_object["total_comment_count"],
|
@@ -71,7 +77,7 @@ class VideoSieveWatchTab < VideoSieve
|
|
71
77
|
has_video: true,
|
72
78
|
video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
|
73
79
|
video_file: Forki.retrieve_media(video_url),
|
74
|
-
reactions:
|
80
|
+
reactions: reactions
|
75
81
|
}
|
76
82
|
end
|
77
83
|
|
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -127,7 +127,9 @@ files:
|
|
127
127
|
- lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb
|
128
128
|
- lib/forki/scrapers/sieves/video_sieves/video_sieve.rb
|
129
129
|
- lib/forki/scrapers/sieves/video_sieves/video_sieve_reel.rb
|
130
|
+
- lib/forki/scrapers/sieves/video_sieves/video_sieve_reel_2.rb
|
130
131
|
- lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb
|
132
|
+
- lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page_2.rb
|
131
133
|
- lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb
|
132
134
|
- lib/forki/scrapers/user_scraper.rb
|
133
135
|
- lib/forki/user.rb
|