forki 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +12 -10
- data/lib/forki/scrapers/post_scraper.rb +105 -26
- data/lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb +1 -3
- data/lib/forki/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cf93e965787eaf05b26f6ea1377775fb61ed131a52458a371336474d09e4a639
|
4
|
+
data.tar.gz: 729e9409bf76eb8551913f64e02d3905a878d057f045e16a64f712926d0d5cc8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6d710aee0bb1ae64c3796de31f85a9e39f26791bd12a653785544099d9e10629a3a239f69dff5702b80b009d1e3b6c9abfef50109e7078dcb70194f9f5c65384
|
7
|
+
data.tar.gz: 0b34b2dceaeff07c844e9fc8b42f27bfe3fbaae2c631bfe80f605f1c7caa821f0387780b37cc1bdb9b6735c20a967ba51e2d6a58a0a0478ee624520ab060402d
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.2.
|
4
|
+
forki (0.2.5)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -29,12 +29,13 @@ GEM
|
|
29
29
|
i18n (>= 1.6, < 2)
|
30
30
|
minitest (>= 5.1)
|
31
31
|
tzinfo (~> 2.0)
|
32
|
-
addressable (2.8.
|
32
|
+
addressable (2.8.6)
|
33
33
|
public_suffix (>= 2.0.2, < 6.0)
|
34
34
|
apparition (0.6.0)
|
35
35
|
capybara (~> 3.13, < 4)
|
36
36
|
websocket-driver (>= 0.6.5)
|
37
37
|
ast (2.4.2)
|
38
|
+
bigdecimal (3.1.5)
|
38
39
|
builder (3.2.4)
|
39
40
|
byebug (11.1.3)
|
40
41
|
capybara (3.39.2)
|
@@ -53,7 +54,7 @@ GEM
|
|
53
54
|
erubi (1.12.0)
|
54
55
|
ethon (0.16.0)
|
55
56
|
ffi (>= 1.15.0)
|
56
|
-
ffi (1.
|
57
|
+
ffi (1.16.3)
|
57
58
|
i18n (1.13.0)
|
58
59
|
concurrent-ruby (~> 1.0)
|
59
60
|
json (2.6.3)
|
@@ -62,15 +63,16 @@ GEM
|
|
62
63
|
nokogiri (>= 1.12.0)
|
63
64
|
matrix (0.4.2)
|
64
65
|
method_source (1.0.0)
|
65
|
-
mini_mime (1.1.
|
66
|
+
mini_mime (1.1.5)
|
66
67
|
minitest (5.18.0)
|
67
68
|
nokogiri (1.15.1-arm64-darwin)
|
68
69
|
racc (~> 1.4)
|
69
|
-
oj (3.
|
70
|
+
oj (3.16.3)
|
71
|
+
bigdecimal (>= 3.0)
|
70
72
|
parallel (1.23.0)
|
71
73
|
parser (3.2.2.1)
|
72
74
|
ast (~> 2.4.1)
|
73
|
-
public_suffix (5.0.
|
75
|
+
public_suffix (5.0.4)
|
74
76
|
racc (1.6.2)
|
75
77
|
rack (2.2.4)
|
76
78
|
rack-test (2.1.0)
|
@@ -90,7 +92,7 @@ GEM
|
|
90
92
|
rainbow (3.1.1)
|
91
93
|
rake (13.0.6)
|
92
94
|
regexp_parser (2.8.0)
|
93
|
-
rexml (3.2.
|
95
|
+
rexml (3.2.6)
|
94
96
|
rubocop (1.51.0)
|
95
97
|
json (~> 2.3)
|
96
98
|
parallel (~> 1.10)
|
@@ -127,17 +129,17 @@ GEM
|
|
127
129
|
rubocop-rails (~> 2.0)
|
128
130
|
ruby-progressbar (1.13.0)
|
129
131
|
rubyzip (2.3.2)
|
130
|
-
selenium-webdriver (4.
|
132
|
+
selenium-webdriver (4.16.0)
|
131
133
|
rexml (~> 3.2, >= 3.2.5)
|
132
134
|
rubyzip (>= 1.2.2, < 3.0)
|
133
135
|
websocket (~> 1.0)
|
134
136
|
thor (1.2.2)
|
135
|
-
typhoeus (1.4.
|
137
|
+
typhoeus (1.4.1)
|
136
138
|
ethon (>= 0.9.0)
|
137
139
|
tzinfo (2.0.6)
|
138
140
|
concurrent-ruby (~> 1.0)
|
139
141
|
unicode-display_width (2.4.2)
|
140
|
-
websocket (1.2.
|
142
|
+
websocket (1.2.10)
|
141
143
|
websocket-driver (0.7.6)
|
142
144
|
websocket-extensions (>= 0.1.0)
|
143
145
|
websocket-extensions (0.1.5)
|
@@ -65,14 +65,26 @@ module Forki
|
|
65
65
|
graphql_objects.any? do |graphql_object| # if any GraphQL objects contain the top-level keys above, return true
|
66
66
|
return true unless graphql_object.fetch("image", nil).nil? # so long as the associated values are not nil
|
67
67
|
return true unless graphql_object.fetch("currMedia", nil).nil?
|
68
|
+
return true unless graphql_object.fetch("photo_image", nil).nil?
|
68
69
|
|
69
70
|
# This is a complicated form for `web.facebook.com` posts
|
70
|
-
|
71
71
|
if !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil?
|
72
72
|
if graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].count.positive?
|
73
73
|
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "all_subattachments", "nodes")&.first&.dig("media", "image", "uri").nil?
|
74
|
+
|
75
|
+
# Another version I guess
|
76
|
+
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "media", "large_share_image")&.dig("uri").nil?
|
74
77
|
end
|
75
78
|
end
|
79
|
+
|
80
|
+
# Another weird format
|
81
|
+
begin
|
82
|
+
if !graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].empty?
|
83
|
+
return true unless graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"].first.dig("styles", "attachment", "media", "photo_image", "uri").nil?
|
84
|
+
end
|
85
|
+
rescue StandardError
|
86
|
+
|
87
|
+
end
|
76
88
|
end
|
77
89
|
end
|
78
90
|
|
@@ -157,7 +169,7 @@ module Forki
|
|
157
169
|
graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
|
158
170
|
|
159
171
|
# Once in awhile it's really easy
|
160
|
-
video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
|
172
|
+
video_objects = graphql_object_array.filter { |go| go.has_key?("video") }
|
161
173
|
|
162
174
|
if VideoSieve.can_process_with_sieve?(graphql_object_array)
|
163
175
|
# Eventually all of this complexity will be replaced with this
|
@@ -170,9 +182,15 @@ module Forki
|
|
170
182
|
return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
|
171
183
|
|
172
184
|
if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
|
173
|
-
|
174
|
-
|
175
|
-
|
185
|
+
media_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]
|
186
|
+
if media_object.has_key?("video")
|
187
|
+
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
|
188
|
+
elsif media_object.has_key?("media") && media_object["media"].has_key?("browser_native_sd_url")
|
189
|
+
video_object = media_object["media"]
|
190
|
+
end
|
191
|
+
|
192
|
+
creation_date = video_object["publish_time"] if video_object&.has_key?("publish_time")
|
193
|
+
creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["publish_time"] if creation_date.nil?
|
176
194
|
elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
|
177
195
|
# For "Reels" we need a separate way to parse this
|
178
196
|
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
|
@@ -181,13 +199,20 @@ module Forki
|
|
181
199
|
raise "Unable to parse video object" if video_objects.empty?
|
182
200
|
end
|
183
201
|
|
184
|
-
|
202
|
+
begin
|
203
|
+
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
|
204
|
+
rescue NoMethodError
|
205
|
+
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]
|
206
|
+
end
|
207
|
+
|
185
208
|
if feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"].key?("cannot_see_top_custom_reactions")
|
186
209
|
reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["cannot_see_top_custom_reactions"]["top_reactions"])
|
187
210
|
else
|
188
211
|
reaction_counts = extract_reaction_counts(feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["top_reactions"])
|
189
212
|
end
|
190
213
|
|
214
|
+
feedback_object = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
215
|
+
|
191
216
|
share_count_object = feedback_object.fetch("share_count", {})
|
192
217
|
|
193
218
|
if story_node_object["comet_sections"]["content"]["story"]["comet_sections"].key? "message"
|
@@ -202,20 +227,31 @@ module Forki
|
|
202
227
|
else
|
203
228
|
num_comments = feedback_object["comment_list_renderer"]["feedback"]["total_comment_count"]
|
204
229
|
end
|
230
|
+
|
231
|
+
view_count = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"]
|
232
|
+
reshare_warning = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
233
|
+
elsif feedback_object.has_key?("comments_count_summary_renderer")
|
234
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
235
|
+
|
236
|
+
view_count = feedback_object["video_view_count"]
|
237
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
205
238
|
else
|
206
239
|
if feedback_object["feedback"].key?("comment_count")
|
207
240
|
num_comments = feedback_object["feedback"]["comment_count"]["total_count"]
|
208
241
|
else
|
209
242
|
num_comments = feedback_object["feedback"]["total_comment_count"]
|
210
243
|
end
|
244
|
+
|
245
|
+
view_count = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"]
|
246
|
+
reshare_warning = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"]
|
211
247
|
end
|
212
248
|
|
213
249
|
post_details = {
|
214
250
|
id: video_object["id"],
|
215
251
|
num_comments: num_comments,
|
216
252
|
num_shares: share_count_object.fetch("count", nil),
|
217
|
-
num_views:
|
218
|
-
reshare_warning:
|
253
|
+
num_views: view_count,
|
254
|
+
reshare_warning: reshare_warning,
|
219
255
|
video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
220
256
|
video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
221
257
|
text: text,
|
@@ -242,9 +278,15 @@ module Forki
|
|
242
278
|
|
243
279
|
share_count_object = feedback_object.fetch("share_count", {})
|
244
280
|
|
281
|
+
if feedback_object["comments_count_summary_renderer"]["feedback"].has_key?("comment_rendering_instance")
|
282
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
283
|
+
else
|
284
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"]
|
285
|
+
end
|
286
|
+
|
245
287
|
post_details = {
|
246
288
|
id: video_object["id"],
|
247
|
-
num_comments:
|
289
|
+
num_comments: num_comments,
|
248
290
|
num_shares: share_count_object.fetch("count", nil),
|
249
291
|
num_views: feedback_object["video_view_count"],
|
250
292
|
reshare_warning: feedback_object["should_show_reshare_warning"],
|
@@ -265,31 +307,64 @@ module Forki
|
|
265
307
|
# Extracts data from an image post by parsing GraphQL strings as seen in the video post scraper above
|
266
308
|
def extract_image_post_data(graphql_object_array)
|
267
309
|
# This is a weird one-off style
|
310
|
+
|
268
311
|
graphql_object = graphql_object_array.find { |graphql_object| !graphql_object.dig("node", "comet_sections", "content", "story", "attachments").nil? }
|
269
|
-
unless graphql_object.nil? || graphql_object.count
|
312
|
+
unless graphql_object.nil? || graphql_object.count.zero?
|
313
|
+
# TODO: These two branches are *super* similar, probably a lot of overlap
|
270
314
|
attachments = graphql_object["node"]["comet_sections"]["content"]["story"]["attachments"]
|
271
315
|
|
272
|
-
if graphql_object["node"]["comet_sections"]["feedback"]["story"]
|
273
|
-
|
316
|
+
if graphql_object["node"]["comet_sections"]["feedback"]["story"].key?("feedback_context")
|
317
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
318
|
+
elsif graphql_object["node"]["comet_sections"]["feedback"]["story"].has_key?("comet_feed_ufi_container")
|
319
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
320
|
+
else
|
321
|
+
feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
322
|
+
end
|
323
|
+
|
324
|
+
if feedback_object.has_key?("cannot_see_top_custom_reactions")
|
325
|
+
reaction_counts = extract_reaction_counts(feedback_object["cannot_see_top_custom_reactions"]["top_reactions"])
|
274
326
|
else
|
275
|
-
reaction_counts = extract_reaction_counts(
|
327
|
+
reaction_counts = extract_reaction_counts(feedback_object["top_reactions"])
|
276
328
|
end
|
277
329
|
|
278
330
|
id = graphql_object["node"]["post_id"]
|
279
|
-
num_comments =
|
280
|
-
reshare_warning =
|
281
|
-
|
331
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
332
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
333
|
+
|
334
|
+
if attachments.first["styles"]["attachment"].key?("all_subattachments")
|
335
|
+
image_url = attachments.first["styles"]["attachment"]["all_subattachments"]["nodes"].first["media"]["image"]["uri"]
|
336
|
+
else
|
337
|
+
image_url = attachments.first.dig("styles", "attachment", "media", "photo_image", "uri")
|
338
|
+
|
339
|
+
if image_url.nil?
|
340
|
+
image_url = attachments.first["styles"]["attachment"]["media"]["large_share_image"]["uri"]
|
341
|
+
end
|
342
|
+
end
|
343
|
+
|
282
344
|
text = graphql_object["node"]["comet_sections"]["content"]["story"]["message"]["text"]
|
283
345
|
profile_link = graphql_object["node"]["comet_sections"]["content"]["story"]["actors"].first["url"]
|
284
|
-
|
346
|
+
|
347
|
+
unless graphql_object["node"]["comet_sections"].dig("content", "story", "comet_sections", "context_layout", "story", "comet_sections", "metadata").nil?
|
348
|
+
created_at = graphql_object["node"]["comet_sections"].dig("content", "story", "comet_sections", "context_layout", "story", "comet_sections", "metadata")&.first["story"]["creation_time"]
|
349
|
+
else
|
350
|
+
created_at = graphql_object["node"]["comet_sections"]["context_layout"]["story"]["comet_sections"]["metadata"].first["story"]["creation_time"]
|
351
|
+
end
|
352
|
+
|
285
353
|
has_video = false
|
286
354
|
else
|
287
|
-
|
288
355
|
graphql_object_array.find { |graphql_object| graphql_object.key?("viewer_actor") && graphql_object.key?("display_comments") }
|
289
356
|
curr_media_object = graphql_object_array.find { |graphql_object| graphql_object.key?("currMedia") }
|
290
357
|
creation_story_object = graphql_object_array.find { |graphql_object| graphql_object.key?("creation_story") && graphql_object.key?("message") }
|
291
358
|
|
292
359
|
feedback_object = graphql_object_array.find { |graphql_object| graphql_object.has_key?("comet_ufi_summary_and_actions_renderer") }["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
360
|
+
|
361
|
+
if feedback_object.key?("top_reactions")
|
362
|
+
feedback_object = feedback_object
|
363
|
+
else
|
364
|
+
# POSSIBLY OUT OF DATE
|
365
|
+
feedback_object = feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
366
|
+
end
|
367
|
+
|
293
368
|
share_count_object = feedback_object.fetch("share_count", {})
|
294
369
|
|
295
370
|
poster = creation_story_object["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]
|
@@ -301,16 +376,21 @@ module Forki
|
|
301
376
|
end
|
302
377
|
|
303
378
|
id = curr_media_object["currMedia"]["id"],
|
379
|
+
|
304
380
|
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["total_comment_count"],
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
text = (creation_story_object["message"] || {}).fetch("text", nil),
|
309
|
-
profile_link = poster["url"],
|
310
|
-
created_at = curr_media_object["currMedia"]["created_time"],
|
311
|
-
has_video = false
|
381
|
+
if num_comments.nil? && feedback_object.has_key?("comments_count_summary_renderer")
|
382
|
+
num_comments = feedback_object["comments_count_summary_renderer"]["feedback"]["comment_rendering_instance"]["comments"]["total_count"]
|
383
|
+
end
|
312
384
|
|
385
|
+
num_shares = share_count_object.fetch("count", nil)
|
386
|
+
reshare_warning = feedback_object["should_show_reshare_warning"]
|
387
|
+
image_url = curr_media_object["currMedia"]["image"]["uri"]
|
388
|
+
text = (creation_story_object["message"] || {}).fetch("text", nil)
|
389
|
+
profile_link = poster["url"]
|
390
|
+
created_at = curr_media_object["currMedia"]["created_time"]
|
391
|
+
has_video = false
|
313
392
|
end
|
393
|
+
|
314
394
|
post_details = {
|
315
395
|
id: id,
|
316
396
|
num_comments: num_comments,
|
@@ -457,4 +537,3 @@ module Forki
|
|
457
537
|
end
|
458
538
|
|
459
539
|
require_relative "sieves/video_sieves/video_sieve"
|
460
|
-
|
@@ -4,7 +4,6 @@ class ImageSieve
|
|
4
4
|
end
|
5
5
|
|
6
6
|
def self.sieve_for_graphql_objects(graphql_objects)
|
7
|
-
|
8
7
|
sieve = sieve_class_for_graphql_objects(graphql_objects)
|
9
8
|
return nil if sieve.nil?
|
10
9
|
|
@@ -19,7 +18,6 @@ private
|
|
19
18
|
end
|
20
19
|
end
|
21
20
|
|
22
|
-
|
23
|
-
Dir['./lib/forki/scrapers/sieves/image_sieves/*.rb'].each do |file|
|
21
|
+
Dir["./lib/forki/scrapers/sieves/image_sieves/*.rb"].each do |file|
|
24
22
|
require file unless file.end_with?("image_sieve.rb")
|
25
23
|
end
|
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-05-08 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -165,7 +165,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
167
|
requirements: []
|
168
|
-
rubygems_version: 3.
|
168
|
+
rubygems_version: 3.5.9
|
169
169
|
signing_key:
|
170
170
|
specification_version: 4
|
171
171
|
summary: A gem to scrape Facebook pages for archive purposes.
|