forki 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +12 -10
- data/lib/forki/scrapers/post_scraper.rb +92 -25
- data/lib/forki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 78986561738c2e71c7504b8d4810c790e1cefa212b97358eae17accf4b1c2131
|
4
|
+
data.tar.gz: 9f72cf4a6496e4c40f3d47d566c2ec9fd1096edf8e54257223a6b26d96f0c9b2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a6fd3e3328b1c1c17d8d4bcab492588d88d7bcfc6feb20f1e85c5b75dfacc24b75c519d388353d733bf2ef0c6899f9c1804f82736d9bdb28971f21299159fed2
|
7
|
+
data.tar.gz: eb1f2608844ba87fc294d38091b6327de25e35f505ac1e873f5a56f797994de57fdf2a1e80d4781821330ff05cee15bc4f0d0fbe2f7a1b72ab7abab6806f7765
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.2.
|
4
|
+
forki (0.2.5)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -29,12 +29,13 @@ GEM
|
|
29
29
|
i18n (>= 1.6, < 2)
|
30
30
|
minitest (>= 5.1)
|
31
31
|
tzinfo (~> 2.0)
|
32
|
-
addressable (2.8.
|
32
|
+
addressable (2.8.6)
|
33
33
|
public_suffix (>= 2.0.2, < 6.0)
|
34
34
|
apparition (0.6.0)
|
35
35
|
capybara (~> 3.13, < 4)
|
36
36
|
websocket-driver (>= 0.6.5)
|
37
37
|
ast (2.4.2)
|
38
|
+
bigdecimal (3.1.5)
|
38
39
|
builder (3.2.4)
|
39
40
|
byebug (11.1.3)
|
40
41
|
capybara (3.39.2)
|
@@ -53,7 +54,7 @@ GEM
|
|
53
54
|
erubi (1.12.0)
|
54
55
|
ethon (0.16.0)
|
55
56
|
ffi (>= 1.15.0)
|
56
|
-
ffi (1.
|
57
|
+
ffi (1.16.3)
|
57
58
|
i18n (1.13.0)
|
58
59
|
concurrent-ruby (~> 1.0)
|
59
60
|
json (2.6.3)
|
@@ -62,15 +63,16 @@ GEM
|
|
62
63
|
nokogiri (>= 1.12.0)
|
63
64
|
matrix (0.4.2)
|
64
65
|
method_source (1.0.0)
|
65
|
-
mini_mime (1.1.
|
66
|
+
mini_mime (1.1.5)
|
66
67
|
minitest (5.18.0)
|
67
68
|
nokogiri (1.15.1-arm64-darwin)
|
68
69
|
racc (~> 1.4)
|
69
|
-
oj (3.
|
70
|
+
oj (3.16.3)
|
71
|
+
bigdecimal (>= 3.0)
|
70
72
|
parallel (1.23.0)
|
71
73
|
parser (3.2.2.1)
|
72
74
|
ast (~> 2.4.1)
|
73
|
-
public_suffix (5.0.
|
75
|
+
public_suffix (5.0.4)
|
74
76
|
racc (1.6.2)
|
75
77
|
rack (2.2.4)
|
76
78
|
rack-test (2.1.0)
|
@@ -90,7 +92,7 @@ GEM
|
|
90
92
|
rainbow (3.1.1)
|
91
93
|
rake (13.0.6)
|
92
94
|
regexp_parser (2.8.0)
|
93
|
-
rexml (3.2.
|
95
|
+
rexml (3.2.6)
|
94
96
|
rubocop (1.51.0)
|
95
97
|
json (~> 2.3)
|
96
98
|
parallel (~> 1.10)
|
@@ -127,17 +129,17 @@ GEM
|
|
127
129
|
rubocop-rails (~> 2.0)
|
128
130
|
ruby-progressbar (1.13.0)
|
129
131
|
rubyzip (2.3.2)
|
130
|
-
selenium-webdriver (4.
|
132
|
+
selenium-webdriver (4.16.0)
|
131
133
|
rexml (~> 3.2, >= 3.2.5)
|
132
134
|
rubyzip (>= 1.2.2, < 3.0)
|
133
135
|
websocket (~> 1.0)
|
134
136
|
thor (1.2.2)
|
135
|
-
typhoeus (1.4.
|
137
|
+
typhoeus (1.4.1)
|
136
138
|
ethon (>= 0.9.0)
|
137
139
|
tzinfo (2.0.6)
|
138
140
|
concurrent-ruby (~> 1.0)
|
139
141
|
unicode-display_width (2.4.2)
|
140
|
-
websocket (1.2.
|
142
|
+
websocket (1.2.10)
|
141
143
|
websocket-driver (0.7.6)
|
142
144
|
websocket-extensions (>= 0.1.0)
|
143
145
|
websocket-extensions (0.1.5)
|
@@ -65,14 +65,23 @@ module Forki
|
|
65
65
|
graphql_objects.any? do |graphql_object| # if any GraphQL objects contain the top-level keys above, return true
|
66
66
|
return true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
|
67
67
|
return true unless graphql_object.fetch("currMedia", nil).nil?
|
68
|
+
return true unless graphql_object.fetch("photo_image", nil).nil?
|
68
69
|
|
69
70
|
# This is a complicated form for `web.facebook.com` posts
|
70
|
-
|
71
71
|
if !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil?
|
72
72
|
if graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].count.positive?
|
73
73
|
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "all_subattachments", "nodes")&.first&.dig("media", "image", "uri").nil?
|
74
74
|
end
|
75
75
|
end
|
76
|
+
|
77
|
+
# Another weird format
|
78
|
+
begin
|
79
|
+
if !graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].empty?
|
80
|
+
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "media", "photo_image", "uri").nil?
|
81
|
+
end
|
82
|
+
rescue StandardError
|
83
|
+
|
84
|
+
end
|
76
85
|
end
|
77
86
|
end
|
78
87
|
|
@@ -157,7 +166,7 @@ module Forki
|
|
157
166
|
graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
|
158
167
|
|
159
168
|
# Once in awhile it's really easy
|
160
|
-
video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
|
169
|
+
video_objects = graphql_object_array.filter { |go| go.has_key?("video") }
|
161
170
|
|
162
171
|
if VideoSieve.can_process_with_sieve?(graphql_object_array)
|
163
172
|
# Eventually all of this complexity will be replaced with this
|
@@ -170,9 +179,15 @@ module Forki
|
|
170
179
|
return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
|
171
180
|
|
172
181
|
if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
|
173
|
-
|
174
|
-
|
175
|
-
|
182
|
+
media_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]
|
183
|
+
if media_object.has_key?("video")
|
184
|
+
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
|
185
|
+
elsif media_object.has_key?("media") && media_object["media"].has_key?("browser_native_sd_url")
|
186
|
+
video_object = media_object["media"]
|
187
|
+
end
|
188
|
+
|
189
|
+
creation_date = video_object["publish_time"] if video_object&.has_key?("publish_time")
|
190
|
+
creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["publish_time"] if creation_date.nil?
|
176
191
|
elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
|
177
192
|
# For "Reels" we need a separate way to parse this
|
178
193
|
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
|
@@ -181,13 +196,20 @@ module Forki
|
|
181
196
|
raise "Unable to parse video object" if video_objects.empty?
|
182
197
|
end
|
183
198
|
|
184
|
-
|
199
|
+
begin
|
200
|
+
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
|
201
|
+
rescue NoMethodError
|
202
|
+
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]
|
203
|
+
end
|
204
|
+
|
185
205
|
if feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"].key?("cannot_see_top_custom_reactions")
|
186
206
|
reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
187
207
|
else
|
188
208
|
reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["top_reactions"])
|
189
209
|
end
|
190
210
|
|
211
|
+
feedback_object = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
212
|
+
|
191
213
|
share_count_object = feedback_object.fetch("share_count", {})
|
192
214
|
|
193
215
|
if story_node_object["comet_sections"]["content"]["story"]["comet_sections"].key? "message"
|
@@ -202,20 +224,31 @@ module Forki
|
|
202
224
|
else
|
203
225
|
num_comments = feedback_object["comment_list_renderer"]["feedback"]["total_comment_count"]
|
204
226
|
end
|
227
|
+
|
228
|
+
view_count = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"]
|
229
|
+
reshare_warning = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
230
|
+
elsif feedback_object.has_key?("comments_count_summary_renderer")
|
231
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
232
|
+
|
233
|
+
view_count = feedback_object["video_view_count"]
|
234
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
205
235
|
else
|
206
236
|
if feedback_object["feedback"].key?("comment_count")
|
207
237
|
num_comments = feedback_object["feedback"]["comment_count"]["total_count"]
|
208
238
|
else
|
209
239
|
num_comments = feedback_object["feedback"]["total_comment_count"]
|
210
240
|
end
|
241
|
+
|
242
|
+
view_count = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"]
|
243
|
+
reshare_warning = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
211
244
|
end
|
212
245
|
|
213
246
|
post_details = {
|
214
247
|
id: video_object["id"],
|
215
248
|
num_comments: num_comments,
|
216
249
|
num_shares: share_count_object.fetch("count", nil),
|
217
|
-
num_views:
|
218
|
-
reshare_warning:
|
250
|
+
num_views: view_count,
|
251
|
+
reshare_warning: reshare_warning,
|
219
252
|
video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
220
253
|
video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
221
254
|
text: text,
|
@@ -242,9 +275,15 @@ module Forki
|
|
242
275
|
|
243
276
|
share_count_object = feedback_object.fetch("share_count", {})
|
244
277
|
|
278
|
+
if feedback_object["comments_count_summary_renderer"]["feedback"].has_key?("comment_rendering_instance")
|
279
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
280
|
+
else
|
281
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"]
|
282
|
+
end
|
283
|
+
|
245
284
|
post_details = {
|
246
285
|
id: video_object["id"],
|
247
|
-
num_comments:
|
286
|
+
num_comments: num_comments,
|
248
287
|
num_shares: share_count_object.fetch("count", nil),
|
249
288
|
num_views: feedback_object["video_view_count"],
|
250
289
|
reshare_warning: feedback_object["should_show_reshare_warning"],
|
@@ -265,31 +304,55 @@ module Forki
|
|
265
304
|
# Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above
|
266
305
|
def extract_image_post_data(graphql_object_array)
|
267
306
|
# This is a weird one-off style
|
307
|
+
|
268
308
|
graphql_object = graphql_object_array.find { |graphql_object| !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil? }
|
269
|
-
unless graphql_object.nil? || graphql_object.count
|
309
|
+
unless graphql_object.nil? || graphql_object.count.zero?
|
310
|
+
# TODO: These two branches are *super* similar, probably a lot of overlap
|
270
311
|
attachments = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
|
271
312
|
|
272
|
-
if graphql_object["node"]["comet_sections"]["feedback"]["story"]
|
273
|
-
|
313
|
+
if graphql_object["node"]["comet_sections"]["feedback"]["story"].key?("feedback_context")
|
314
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
315
|
+
elsif graphql_object["node"]["comet_sections"]["feedback"]["story"].has_key?("comet_feed_ufi_container")
|
316
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
274
317
|
else
|
275
|
-
|
318
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
319
|
+
end
|
320
|
+
|
321
|
+
|
322
|
+
if feedback_object.has_key?("cannot_see_top_custom_reactions")
|
323
|
+
reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
|
324
|
+
else
|
325
|
+
reaction_counts = extract_reaction_counts(feedback_object["top_reactions"])
|
276
326
|
end
|
277
327
|
|
278
328
|
id = graphql_object["node"]["post_id"]
|
279
|
-
num_comments =
|
280
|
-
reshare_warning =
|
281
|
-
|
329
|
+
num_comments = feedback_object["share_count"]["count"]
|
330
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
331
|
+
|
332
|
+
if attachments.first["styles"]["attachment"].key?("all_subattachments")
|
333
|
+
image_url = attachments.first["styles"]["attachment"]["all_subattachments"]["nodes"].first["media"]["image"]["uri"]
|
334
|
+
else
|
335
|
+
image_url = attachments.first["styles"]["attachment"]["media"]["photo_image"]["uri"]
|
336
|
+
end
|
337
|
+
|
282
338
|
text = graphql_object["node"]["comet_sections"]["content"]["story"]["message"]["text"]
|
283
339
|
profile_link = graphql_object["node"]["comet_sections"]["content"]["story"]["actors"].first["url"]
|
284
340
|
created_at = graphql_object["node"]["comet_sections"]["content"]["story"]["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"].first["story"]["creation_time"]
|
285
341
|
has_video = false
|
286
342
|
else
|
287
|
-
|
288
343
|
graphql_object_array.find { |graphql_object| graphql_object.key?("viewer_actor") && graphql_object.key?("display_comments") }
|
289
344
|
curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
|
290
345
|
creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }
|
291
346
|
|
292
347
|
feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
348
|
+
|
349
|
+
if feedback_object.key?("top_reactions")
|
350
|
+
feedback_object = feedback_object
|
351
|
+
else
|
352
|
+
# POSSIBLY OUT OF DATE
|
353
|
+
feedback_object = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
354
|
+
end
|
355
|
+
|
293
356
|
share_count_object = feedback_object.fetch("share_count", {})
|
294
357
|
|
295
358
|
poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]
|
@@ -301,16 +364,21 @@ module Forki
|
|
301
364
|
end
|
302
365
|
|
303
366
|
id = curr_media_object["currMedia"]["id"],
|
367
|
+
|
304
368
|
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
text = (creation_story_object["message"] || {}).fetch("text", nil),
|
309
|
-
profile_link = poster["url"],
|
310
|
-
created_at = curr_media_object["currMedia"]["created_time"],
|
311
|
-
has_video = false
|
369
|
+
if num_comments.nil? && feedback_object.has_key?("comments_count_summary_renderer")
|
370
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
371
|
+
end
|
312
372
|
|
373
|
+
num_shares = share_count_object.fetch("count", nil)
|
374
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
375
|
+
image_url = curr_media_object["currMedia"]["image"]["uri"]
|
376
|
+
text = (creation_story_object["message"] || {}).fetch("text", nil)
|
377
|
+
profile_link = poster["url"]
|
378
|
+
created_at = curr_media_object["currMedia"]["created_time"]
|
379
|
+
has_video = false
|
313
380
|
end
|
381
|
+
|
314
382
|
post_details = {
|
315
383
|
id: id,
|
316
384
|
num_comments: num_comments,
|
@@ -457,4 +525,3 @@ module Forki
|
|
457
525
|
end
|
458
526
|
|
459
527
|
require_relative "sieves/video_sieves/video_sieve"
|
460
|
-
|
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-04-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|