forki 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -6
- data/bin/generate_sieve +79 -0
- data/bin/generator_templates/image_sieve_template.rb.erb +63 -0
- data/bin/generator_templates/image_sieve_test_template.rb.erb +42 -0
- data/bin/generator_templates/video_sieve_template.rb.erb +61 -0
- data/bin/generator_templates/video_sieve_test_template.rb.erb +41 -0
- data/forki.gemspec +2 -0
- data/lib/forki/scrapers/post_scraper.rb +20 -6
- data/lib/forki/scrapers/scraper.rb +29 -12
- data/lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb +25 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve.rb +24 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb +66 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb +91 -0
- data/lib/forki/scrapers/user_scraper.rb +20 -5
- data/lib/forki/version.rb +1 -1
- metadata +25 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a1b2f6a831ebac1bf9e79cc33818aa3d8f638459f538ff4f0cb92bab55b16df
|
4
|
+
data.tar.gz: 1df3b090db0ba37ecfbcf17933d41334620e9542a75f3328d9a335adec5d1ae9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b65c46157a6d1f320345d0216d138239c9d73cb29fd7fc0fa504655e0c789021a558d2e75a0447f026dc694e4fb95765cf12f8b2350e147b1011e670cd40621f
|
7
|
+
data.tar.gz: bc62a4decd0205e75ac93038d5768ff4d9ec55d8a25980dbb8522b3b2296f84e50db7caa3ba8aeb28d61eb482957b4d4bf68a9d9658cc26f4209198a64b1fe04
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.1.
|
4
|
+
forki (0.1.4)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -37,7 +37,7 @@ GEM
|
|
37
37
|
ast (2.4.2)
|
38
38
|
builder (3.2.4)
|
39
39
|
byebug (11.1.3)
|
40
|
-
capybara (3.39.
|
40
|
+
capybara (3.39.2)
|
41
41
|
addressable
|
42
42
|
matrix
|
43
43
|
mini_mime (>= 0.1.3)
|
@@ -66,11 +66,11 @@ GEM
|
|
66
66
|
minitest (5.18.0)
|
67
67
|
nokogiri (1.15.1-arm64-darwin)
|
68
68
|
racc (~> 1.4)
|
69
|
-
oj (3.
|
69
|
+
oj (3.15.1)
|
70
70
|
parallel (1.23.0)
|
71
71
|
parser (3.2.2.1)
|
72
72
|
ast (~> 2.4.1)
|
73
|
-
public_suffix (5.0.
|
73
|
+
public_suffix (5.0.3)
|
74
74
|
racc (1.6.2)
|
75
75
|
rack (2.2.4)
|
76
76
|
rack-test (2.1.0)
|
@@ -127,7 +127,7 @@ GEM
|
|
127
127
|
rubocop-rails (~> 2.0)
|
128
128
|
ruby-progressbar (1.13.0)
|
129
129
|
rubyzip (2.3.2)
|
130
|
-
selenium-webdriver (4.
|
130
|
+
selenium-webdriver (4.10.0)
|
131
131
|
rexml (~> 3.2, >= 3.2.5)
|
132
132
|
rubyzip (>= 1.2.2, < 3.0)
|
133
133
|
websocket (~> 1.0)
|
@@ -138,7 +138,7 @@ GEM
|
|
138
138
|
concurrent-ruby (~> 1.0)
|
139
139
|
unicode-display_width (2.4.2)
|
140
140
|
websocket (1.2.9)
|
141
|
-
websocket-driver (0.7.
|
141
|
+
websocket-driver (0.7.6)
|
142
142
|
websocket-extensions (>= 0.1.0)
|
143
143
|
websocket-extensions (0.1.5)
|
144
144
|
xpath (3.2.0)
|
@@ -160,6 +160,7 @@ DEPENDENCIES
|
|
160
160
|
rubocop (~> 1.7)
|
161
161
|
rubocop-rails (~> 2.17.3)
|
162
162
|
rubocop-rails_config
|
163
|
+
thor
|
163
164
|
|
164
165
|
BUNDLED WITH
|
165
166
|
2.3.11
|
data/bin/generate_sieve
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require "bundler/setup"
|
5
|
+
require "forki"
|
6
|
+
require "thor"
|
7
|
+
require "erb"
|
8
|
+
|
9
|
+
# Thor command-line interface used to scaffold new sieves.
#
# Usage: generate_sieve generate TYPE NAME [--video | --image]
class GenerateSieve < Thor
  # Tell Thor to exit with a non-zero status when it rejects the command line,
  # so a failed invocation is visible to calling scripts.
  def self.exit_on_failure?
    true
  end

  desc "generate TYPE NAME", "generate a TYPE named NAME"
  option :video
  option :image
  # Generates the files for a new sieve.
  #
  # @param type [String] kind of sieve to generate, either "post" or "user"
  # @param name [String] snake_case name of the new sieve
  # @return [void]
  def generate(type, name)
    case type
    when "post"
      # The first flag that was supplied wins; `--video` takes precedence over `--image`.
      style = %i[video image].find { |flag| !options[flag].nil? }

      # `abort` writes the message to stderr and exits with status 1
      # (a bare `exit` reported success to the shell).
      abort "Must indicate either video or image flag" if style.nil?

      SieveGenerator.generate_post_sieve(name, style)
    when "user"
      SieveGenerator.generate_user_sieve(name)
    else
      abort "Type must be `post` or `user` only. `#{type}` passed in."
    end
  end
end
|
35
|
+
|
36
|
+
require "fileutils" # needed for FileUtils.mkdir_p below

# Renders the ERB templates in ./bin/generator_templates into a new sieve,
# its test file and two empty JSON fixtures.
#
# All paths are relative to the current working directory.
class SieveGenerator
  SIEVE_ROOT = "./lib/forki/scrapers/sieves"
  TEST_ROOT = "./test/sieves"
  TEMPLATE_ROOT = "./bin/generator_templates"
  POST_STYLES = %i[video image].freeze

  # Generates a post sieve, its test and empty valid/invalid fixture files.
  #
  # @param name [String] snake_case name of the sieve, e.g. "watch_tab"
  # @param style [Symbol] :video or :image
  # @return [void]
  # @raise [ArgumentError] if style is not :video or :image
  # @raise [Errno::ENOENT] if a template file cannot be found
  def self.generate_post_sieve(name, style)
    unless POST_STYLES.include?(style)
      raise ArgumentError, "Unknown sieve style `#{style}`, expected one of: #{POST_STYLES.join(', ')}"
    end

    puts "Generating post sieve named #{name} with style #{style}"

    prefix = "#{style}_sieve" # "video_sieve" or "image_sieve"
    file_path = "#{SIEVE_ROOT}/#{prefix}s/#{prefix}_#{name}.rb"
    test_path = "#{TEST_ROOT}/#{prefix}s/#{prefix}_#{name}_test.rb"
    test_data_valid_path = "#{TEST_ROOT}/#{prefix}s/test_data/#{prefix}_#{name}_valid.json"
    test_data_invalid_path = "#{TEST_ROOT}/#{prefix}s/test_data/#{prefix}_#{name}_invalid.json"

    file_template = File.read("#{TEMPLATE_ROOT}/#{prefix}_template.rb.erb")
    test_file_template = File.read("#{TEMPLATE_ROOT}/#{prefix}_test_template.rb.erb")

    # The templates read `name` and `camel_name` through the binding.
    camel_name = name.split("_").map(&:capitalize).join

    # Create the target directories first; File.write does not create them.
    [file_path, test_path, test_data_valid_path, test_data_invalid_path].each do |path|
      FileUtils.mkdir_p(File.dirname(path))
    end

    File.write(file_path, ERB.new(file_template).result(binding))
    File.write(test_path, ERB.new(test_file_template).result(binding))

    # Empty placeholders, to be filled with captured GraphQL JSON.
    File.write(test_data_valid_path, "")
    File.write(test_data_invalid_path, "")
  end

  # Placeholder: only announces the request, no files are written.
  #
  # @param name [String] name of the user sieve
  # @return [void]
  def self.generate_user_sieve(name)
    puts "Generating user sieve named #{name}"
  end
end
|
77
|
+
|
78
|
+
GenerateSieve.start(ARGV)
|
79
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# NOTE: This is not implemented yet, just here for filler

class ImageSieve<%= camel_name %> < ImageSieve
  # Reports whether this sieve can handle the given graphql objects.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Boolean] false when extraction raises, true otherwise
  def self.check(graphql_objects)
    # Only called for its side effect: any error means the data does not fit this sieve.
    extractor(graphql_objects)

    true
  rescue StandardError
    false
  end

  # Builds the post details hash. Expected output format, as produced by the existing scraper:
  #
  # post_details = {
  #   id: video_object["id"],
  #   num_comments: num_comments,
  #   num_shares: share_count_object.fetch("count", nil),
  #   num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
  #   reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
  #   video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
  #   video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
  #   text: text,
  #   created_at: creation_date,
  #   profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
  #   has_video: true
  # }
  # post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  # post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # post_details[:reactions] = reaction_counts
  #
  # NOTE(review): the format above says `num_shares` while the hash below uses
  # `num_shared` -- confirm which key the callers read.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Hash] the post details
  def self.sieve(graphql_objects)
    image_object = extractor(graphql_objects)

    # TODO: pull these out of `image_object`. They must be defined before the
    # hash below reads them, otherwise the generated sieve raises NameError.
    video_preview_image_url = nil
    video_url = nil

    post_details = {
      id: nil,
      num_comments: nil,
      num_shared: nil,
      num_views: nil,
      reshare_warning: nil,
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil,
      created_at: nil,
      profile_link: nil,
      has_video: false,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: nil
    }
  end

  private

  # Finds the part of the graphql objects this sieve works on.
  # TODO: implement the filter; with the body commented out nothing is selected.
  def self.extractor(graphql_objects)
    image_objects = graphql_objects.filter do |go|
      # go = go.first if go.kind_of?(Array) && !go.empty?
      # go.has_key?("image")
    end

    # image_objects.first.dig("image", "creation_story")
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true

require "test_helper"

# Generated test skeleton for ImageSieve<%= camel_name %>.
# rubocop:disable Metrics/ClassLength
class ImageSieve<%= camel_name %>Test < Minitest::Test
  # Loads the captured GraphQL fixtures for a post this sieve should and should not handle.
  def setup
    @valid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_valid.json"))
    @invalid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_invalid.json"))
  end

  def test_sieve_properly_fails_check
    assert_equal false, ImageSieve<%= camel_name %>.check(@invalid_json)
  end

  def test_sieve_properly_passes_check
    assert ImageSieve<%= camel_name %>.check(@valid_json)
  end

  def test_sieve_can_sieve_properly
    result = ImageSieve<%= camel_name %>.sieve(@valid_json)

    # TODO: Update the values for the post you're testing
    # MAINTAINER TODO: FIX THIS FOR IMAGES
    assert_equal "394367115960503", result[:id]
    assert_equal 173, result[:num_comments]
    assert_nil result[:num_shared]
    assert_nil result[:num_views]
    assert_equal false, result[:reshare_warning]
    refute_nil result[:video_preview_image_url]
    refute_nil result[:video_url]
    assert_nil result[:text]
    assert_equal 1654989063, result[:created_at]
    assert_nil result[:profile_link]
    assert_equal false, result[:has_video]
    refute_nil result[:video_preview_image_file]
    refute_nil result[:video_file]
    refute_nil result[:reactions]

    assert_kind_of Array, result[:reactions]
  end
end
# rubocop:enable Metrics/ClassLength
|
@@ -0,0 +1,61 @@
|
|
1
|
+
class VideoSieve<%= camel_name %> < VideoSieve
  # Reports whether this sieve can handle the given graphql objects.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Boolean] false when extraction raises, true otherwise
  def self.check(graphql_objects)
    # Only called for its side effect: any error means the data does not fit this sieve.
    extractor(graphql_objects)

    true
  rescue StandardError
    false
  end

  # Builds the post details hash. Expected output format, as produced by the existing scraper:
  #
  # post_details = {
  #   id: video_object["id"],
  #   num_comments: num_comments,
  #   num_shares: share_count_object.fetch("count", nil),
  #   num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
  #   reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
  #   video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
  #   video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
  #   text: text,
  #   created_at: creation_date,
  #   profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
  #   has_video: true
  # }
  # post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  # post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # post_details[:reactions] = reaction_counts
  #
  # NOTE(review): the format above says `num_shares` while the hash below uses
  # `num_shared` -- confirm which key the callers read.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Hash] the post details
  def self.sieve(graphql_objects)
    video_object = extractor(graphql_objects)

    # TODO: pull these out of `video_object`. They must be defined before the
    # hash below reads them, otherwise the generated sieve raises NameError.
    video_preview_image_url = nil
    video_url = nil

    post_details = {
      id: nil,
      num_comments: nil,
      num_shared: nil,
      num_views: nil,
      reshare_warning: nil,
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil,
      created_at: nil,
      profile_link: nil,
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: nil
    }
  end

  private

  # Finds the part of the graphql objects this sieve works on.
  # TODO: implement the filter; with the body commented out nothing is selected.
  def self.extractor(graphql_objects)
    video_objects = graphql_objects.filter do |go|
      # go = go.first if go.kind_of?(Array) && !go.empty?
      # go.has_key?("video")
    end

    # video_objects.first.dig("video", "creation_story")
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true

require "test_helper"

# Generated test skeleton for VideoSieve<%= camel_name %>.
# rubocop:disable Metrics/ClassLength
class VideoSieve<%= camel_name %>Test < Minitest::Test
  # Loads the captured GraphQL fixtures for a post this sieve should and should not handle.
  def setup
    @valid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_valid.json"))
    @invalid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_invalid.json"))
  end

  def test_sieve_properly_fails_check
    assert_equal false, VideoSieve<%= camel_name %>.check(@invalid_json)
  end

  def test_sieve_properly_passes_check
    assert VideoSieve<%= camel_name %>.check(@valid_json)
  end

  def test_sieve_can_sieve_properly
    result = VideoSieve<%= camel_name %>.sieve(@valid_json)

    # TODO: Update the values for the post you're testing
    assert_equal "394367115960503", result[:id]
    assert_equal 173, result[:num_comments]
    assert_nil result[:num_shared]
    assert_nil result[:num_views]
    assert_equal false, result[:reshare_warning]
    refute_nil result[:video_preview_image_url]
    refute_nil result[:video_url]
    assert_nil result[:text]
    assert_equal 1654989063, result[:created_at]
    assert_nil result[:profile_link]
    assert_equal true, result[:has_video]
    refute_nil result[:video_preview_image_file]
    refute_nil result[:video_file]
    refute_nil result[:reactions]

    assert_kind_of Array, result[:reactions]
  end
end
# rubocop:enable Metrics/ClassLength
|
data/forki.gemspec
CHANGED
@@ -37,6 +37,8 @@ Gem::Specification.new do |spec|
|
|
37
37
|
spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
|
38
38
|
spec.add_dependency "selenium-webdriver" # Webdriver selenium
|
39
39
|
|
40
|
+
spec.add_development_dependency "thor" # For the generator
|
41
|
+
|
40
42
|
# For more information and examples about making a new gem, checkout our
|
41
43
|
# guide at: https://bundler.io/guides/creating_gem.html
|
42
44
|
end
|
@@ -4,6 +4,7 @@ require "typhoeus"
|
|
4
4
|
require "securerandom"
|
5
5
|
require "byebug"
|
6
6
|
|
7
|
+
|
7
8
|
module Forki
|
8
9
|
# rubocop:disable Metrics/ClassLength
|
9
10
|
class PostScraper < Scraper
|
@@ -154,21 +155,30 @@ module Forki
|
|
154
155
|
end
|
155
156
|
|
156
157
|
graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
|
158
|
+
|
159
|
+
# Once in awhile it's really easy
|
160
|
+
video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
|
161
|
+
|
162
|
+
if VideoSieve.can_process_with_sieve?(graphql_object_array)
|
163
|
+
# Eventually all of this complexity will be replaced with this
|
164
|
+
return VideoSieve.sieve_for_graphql_objects(graphql_object_array)
|
165
|
+
end
|
166
|
+
|
157
167
|
story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
|
158
168
|
story_node_object = story_node_object || graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video
|
159
169
|
|
160
170
|
return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
|
161
171
|
|
162
172
|
if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
|
163
|
-
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
|
164
|
-
creation_date = video_object["publish_time"]
|
165
|
-
|
173
|
+
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
|
174
|
+
creation_date = video_object["publish_time"] if video_object&.has_key("publish_time")
|
175
|
+
creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"] if creation_date.nil?
|
166
176
|
elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
|
167
177
|
# For "Reels" we need a separate way to parse this
|
168
178
|
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
|
169
179
|
creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["creation_time"]
|
170
180
|
else
|
171
|
-
raise "Unable to parse video object"
|
181
|
+
raise "Unable to parse video object" if video_objects.empty?
|
172
182
|
end
|
173
183
|
|
174
184
|
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
|
@@ -191,7 +201,7 @@ module Forki
|
|
191
201
|
num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
|
192
202
|
reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
|
193
203
|
video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
194
|
-
video_url: video_object["
|
204
|
+
video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
195
205
|
text: text,
|
196
206
|
created_at: creation_date,
|
197
207
|
profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
@@ -217,7 +227,7 @@ module Forki
|
|
217
227
|
num_views: feedback_object["video_view_count"],
|
218
228
|
reshare_warning: feedback_object["should_show_reshare_warning"],
|
219
229
|
video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
|
220
|
-
video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
|
230
|
+
video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["browser_native_hd_url"] || video_object["video"]["browser_native_sd_url"] || video_object["video"]["playable_url"],
|
221
231
|
text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
|
222
232
|
created_at: video_object["video"]["publish_time"],
|
223
233
|
profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
@@ -369,6 +379,7 @@ module Forki
|
|
369
379
|
def parse(url)
|
370
380
|
validate_and_load_page(url)
|
371
381
|
graphql_strings = find_graphql_data_strings(page.html)
|
382
|
+
|
372
383
|
post_data = extract_post_data(graphql_strings)
|
373
384
|
post_data[:url] = url
|
374
385
|
user_url = post_data[:profile_link]
|
@@ -398,3 +409,6 @@ module Forki
|
|
398
409
|
end
|
399
410
|
end
|
400
411
|
end
|
412
|
+
|
413
|
+
require_relative "sieves/video_sieves/video_sieve"
|
414
|
+
|
@@ -7,6 +7,7 @@ require "oj"
|
|
7
7
|
require "selenium-webdriver"
|
8
8
|
require "open-uri"
|
9
9
|
require "selenium/webdriver/remote/http/curb"
|
10
|
+
require "cgi"
|
10
11
|
|
11
12
|
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
|
12
13
|
options.add_argument("--start-maximized")
|
@@ -190,9 +191,21 @@ module Forki
|
|
190
191
|
|
191
192
|
visit "https://www.facebook.com"
|
192
193
|
login
|
194
|
+
|
193
195
|
visit url unless current_url.start_with?(url)
|
196
|
+
# # If the video is a watch page it doesn't have most of the data we want so we click on the video
|
197
|
+
# if url.include?("watch/live")
|
198
|
+
# clickable_element = find("video")
|
199
|
+
|
200
|
+
# while(clickable_element.obscured?)
|
201
|
+
# clickable_element = clickable_element.find(:xpath, "..")
|
202
|
+
# end
|
203
|
+
|
204
|
+
# clickable_element.click
|
205
|
+
# end
|
194
206
|
end
|
195
207
|
|
208
|
+
|
196
209
|
# Extracts an integer out of a string describing a number
|
197
210
|
# e.g. "4K Comments" returns 4000
|
198
211
|
# e.g. "131 Shares" returns 131
|
@@ -203,19 +216,23 @@ module Forki
|
|
203
216
|
element = element.text(:all)
|
204
217
|
end
|
205
218
|
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
interaction_num_text.to_i + interaction_num_text[-2].to_i * 100
|
211
|
-
elsif interaction_num_text.include?("K") # e.g. "13K"
|
212
|
-
interaction_num_text.to_i * 1000
|
213
|
-
elsif interaction_num_text.include?("M") # e.g. "13M"
|
214
|
-
interaction_num_text.to_i * 1_000_000
|
215
|
-
else # e.g. "15,443"
|
216
|
-
interaction_num_text.delete!(",")
|
217
|
-
interaction_num_text.delete(" ").to_i
|
219
|
+
# Check if there's a modifier i.e. `K` or `M` if there isn't just return the number
|
220
|
+
unless element.include?("K") || element.include?("M")
|
221
|
+
element.delete(",") # "5,456" e.g.
|
222
|
+
return element.to_i
|
218
223
|
end
|
224
|
+
|
225
|
+
modifier = element[-1]
|
226
|
+
number = element[0...-1].to_f
|
227
|
+
|
228
|
+
case modifier
|
229
|
+
when "K"
|
230
|
+
number = number * 1_000
|
231
|
+
when "M"
|
232
|
+
number = number * 1_000_000
|
233
|
+
end
|
234
|
+
|
235
|
+
number.to_i
|
219
236
|
end
|
220
237
|
end
|
221
238
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Base class and dispatcher for image sieves: picks the first registered
# sieve whose `check` accepts the graphql objects and delegates to it.
class ImageSieve
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Boolean] true when some registered sieve accepts the objects
  def self.can_process_with_sieve?(graphql_objects)
    !sieve_class_for_graphql_objects(graphql_objects).nil?
  end

  # Runs the first matching sieve over the objects.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Object, nil] whatever the matching sieve's `sieve` returns, or nil when none matches
  def self.sieve_for_graphql_objects(graphql_objects)
    sieve = sieve_class_for_graphql_objects(graphql_objects)
    return nil if sieve.nil?

    sieve.sieve(graphql_objects)
  end

  # Finds the first sieve class whose `check` returns a truthy value.
  # A bare `private` has no effect on `def self.` methods, so this stays callable as before.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Class, nil] the matching sieve class, or nil
  def self.sieve_class_for_graphql_objects(graphql_objects)
    sieves = [] # no image sieves are registered yet
    sieves.find { |sieve| sieve.check(graphql_objects) }
  end
end

# Load the concrete sieves that live next to this file. Resolving against
# `__dir__` keeps this working regardless of the current working directory,
# and the `image_sieve_*` pattern never matches this base file itself.
Dir[File.join(__dir__, "image_sieve_*.rb")].sort.each do |file|
  require file
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# Base class and dispatcher for video sieves: picks the first registered
# sieve whose `check` accepts the graphql objects and delegates to it.
class VideoSieve
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Boolean] true when some registered sieve accepts the objects
  def self.can_process_with_sieve?(graphql_objects)
    !sieve_class_for_graphql_objects(graphql_objects).nil?
  end

  # Runs the first matching sieve over the objects.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Object, nil] whatever the matching sieve's `sieve` returns, or nil when none matches
  def self.sieve_for_graphql_objects(graphql_objects)
    sieve = sieve_class_for_graphql_objects(graphql_objects)
    return nil if sieve.nil?

    sieve.sieve(graphql_objects)
  end

  # Finds the first sieve class, in registration order, whose `check` returns a truthy value.
  # A bare `private` has no effect on `def self.` methods, so this stays callable as before.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Class, nil] the matching sieve class, or nil
  def self.sieve_class_for_graphql_objects(graphql_objects)
    sieves = [VideoSieveWatchTab, VideoSieveVideoPage]
    sieves.find { |sieve| sieve.check(graphql_objects) }
  end
end

# Load the concrete sieves that live next to this file. Resolving against
# `__dir__` keeps this working regardless of the current working directory
# (a "./lib/..." glob only matches when run from the gem's root), and the
# `video_sieve_*` pattern never matches this base file itself.
Dir[File.join(__dir__, "video_sieve_*.rb")].sort.each do |file|
  require file
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
# Sieve for video posts whose data hangs off a "node" graphql object.
class VideoSieveVideoPage < VideoSieve
  # Reports whether the graphql objects fit this sieve: the first attachment
  # of the node's story must carry a "media" entry. Any error raised while
  # walking the structure counts as "does not fit".
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Boolean]
  def self.check(graphql_objects)
    sections = self.extractor(graphql_objects) # raises when there is no usable node
    attachment = sections["content"]["story"]["attachments"].first["styles"]["attachment"]

    attachment.has_key?("media") ? true : false
  rescue StandardError
    false
  end

  # Builds the post details from the node's "comet_sections".
  #
  # The returned hash has the keys: id, num_comments, num_shared, num_views,
  # reshare_warning, video_preview_image_url, video_url, text, created_at,
  # profile_link, has_video, video_preview_image_file, video_file, reactions.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from the page
  # @return [Hash] the post details
  def self.sieve(graphql_objects)
    sections = self.extractor(graphql_objects)

    story = sections["content"]["story"]
    media = sections["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
    feedback = sections["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]

    preview_url = media["preferred_thumbnail"]["image"]["uri"]

    # Use the HD rendition unless it is missing, then the SD one.
    hd_url = media["browser_native_hd_url"]
    media_url = hd_url.nil? ? media["browser_native_sd_url"] : hd_url

    {
      id: media["id"],
      num_comments: feedback["total_comment_count"],
      num_shared: feedback["share_count"]["count"],
      num_views: nil,
      reshare_warning: feedback["should_show_reshare_warning"],
      video_preview_image_url: preview_url,
      video_url: media_url,
      text: story["message"]["text"],
      created_at: media["publish_time"],
      profile_link: story["actors"].first["url"],
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(preview_url),
      video_file: Forki.retrieve_media(media_url),
      reactions: feedback["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
    }
  end

  private

  # Returns the "comet_sections" of the first graphql object that has a "node" key.
  def self.extractor(graphql_objects)
    holder = graphql_objects.find { |candidate| candidate.key? "node" } # user posted video
    node = holder&.fetch("node", nil)
    node["comet_sections"]
  end
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# This is for the "watch" tab style videos https://www.facebook.com/watch/live/?v=394367115960503
|
2
|
+
|
3
|
+
# Sieve for videos whose data hangs off a "video" graphql object.
class VideoSieveWatchTab < VideoSieve
  # Reports whether the graphql objects fit this sieve: a story must be
  # found and its first attachment must be a Hash with a "media" key.
  # Any error raised while walking the structure counts as "does not fit".
  #
  # @param graphql_objects [Array<Hash, Array>] parsed GraphQL objects scraped from the page
  # @return [Boolean]
  def self.check(graphql_objects)
    story = extractor(graphql_objects)
    return false if story.nil?

    attachments = story["attachments"]
    return false unless attachments.kind_of?(Array) && !attachments.empty?

    first_attachment = attachments.first
    first_attachment.kind_of?(Hash) && first_attachment.key?("media")
  rescue StandardError
    false
  end

  # Builds the post details from the extracted story.
  #
  # The returned hash has the keys: id, num_comments, num_shared, num_views,
  # reshare_warning, video_preview_image_url, video_url, text, created_at,
  # profile_link, has_video, video_preview_image_file, video_file, reactions.
  #
  # @param graphql_objects [Array<Hash, Array>] parsed GraphQL objects scraped from the page
  # @return [Hash] the post details
  # @raise [RuntimeError] when no story or no feedback object can be found
  def self.sieve(graphql_objects)
    story = extractor(graphql_objects)
    raise "Unable to find a watch tab video story" if story.nil?

    media = story["attachments"].first["media"]
    video_url = media["browser_native_sd_url"]
    video_preview_image_url = media["preferred_thumbnail"]["image"]["uri"]

    # The fallback lookups below must see Hashes only, same as the extractor.
    hashes = graphql_hashes(graphql_objects)

    feedback_object = if story["feedback_context"].nil?
      # Fall back to any object that carries a comment count.
      fallback = hashes.find { |go| !go.dig("feedback", "total_comment_count").nil? }
      raise "Unable to find feedback for watch tab video" if fallback.nil?

      fallback["feedback"]
    else
      story["feedback_context"]["feedback_target_with_context"]
    end

    profile_link = media["owner"]["url"]
    if profile_link.nil?
      with_attachments = hashes.find { |go| go.key?("attachments") }
      profile_link = with_attachments["attachments"].first["media"]["creation_story"]["comet_sections"]["title"]["story"]["actors"].first["url"]
    end

    {
      id: story.dig("shareable", "id") || media["id"],
      num_comments: feedback_object["total_comment_count"],
      num_shared: nil, # This is not associated with these videos in this format
      num_views: nil, # This is not associated with these videos in this format
      reshare_warning: feedback_object["should_show_reshare_warning"],
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil, # There is no text associated with these videos
      created_at: media["publish_time"],
      profile_link: profile_link,
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
    }
  end

  # Returns the story of the first graphql object with a "video" key:
  # its "creation_story", or its "story" when that is nil.
  #
  # @param graphql_objects [Array<Hash, Array>] parsed GraphQL objects scraped from the page
  # @return [Object, nil] the story, or nil when no object has a "video" key
  def self.extractor(graphql_objects)
    video_hash = graphql_hashes(graphql_objects).find { |go| go.key?("video") }
    return nil if video_hash.nil?

    story = video_hash.dig("video", "creation_story")
    story = video_hash.dig("video", "story") if story.nil?
    story
  end

  # Unwraps Array entries to their first element and keeps only Hashes, so
  # callers can use Hash methods on every element. Digging into a wrapping
  # Array with a String key raises TypeError.
  def self.graphql_hashes(graphql_objects)
    graphql_objects
      .map { |go| go.kind_of?(Array) ? go.first : go }
      .select { |go| go.kind_of?(Hash) }
  end
  private_class_method :graphql_hashes
end
|
@@ -3,10 +3,14 @@ require "typhoeus"
|
|
3
3
|
module Forki
|
4
4
|
class UserScraper < Scraper
|
5
5
|
# Finds and returns the number of people who like the current page
|
6
|
-
def find_number_of_likes
|
7
|
-
likes_pattern = /[0-9,.KM ]
|
8
|
-
|
9
|
-
|
6
|
+
# Extracts the like count from a profile details string.
#
# Looks for a phrase of the form "<count> likes" or "<count> Likes", where <count> is
# made up of digits, commas, periods, spaces and the letters K/M.
#
# @param profile_details_string [String] the text to search for the likes phrase
# @return [Object, nil] whatever `extract_int_from_num_element` returns for the captured
#   count, or nil when the string contains no likes phrase
def find_number_of_likes(profile_details_string)
  likes_pattern = /(?<num_likes>[0-9,.KM ]+) (l|L)ikes/
  number_of_likes_match = likes_pattern.match(profile_details_string)

  return nil if number_of_likes_match.nil?

  extract_int_from_num_element(number_of_likes_match.named_captures["num_likes"])
end
|
11
15
|
|
12
16
|
# Finds and returns the number of people who follow the current page
|
@@ -14,8 +18,18 @@ module Forki
|
|
14
18
|
followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]) people/
|
15
19
|
alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
|
16
20
|
number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
|
21
|
+
|
17
22
|
return nil if number_of_followers_match.nil?
|
18
|
-
|
23
|
+
|
24
|
+
number_of_followers = extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
|
25
|
+
|
26
|
+
# Note, this is sticking around if we want to use it later
|
27
|
+
# if number_of_followers.nil?
|
28
|
+
# number_of_followers_string = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]["user"]["profile_social_context"]["content"].first["text"]["text"]
|
29
|
+
# number_of_followers = extract_int_from_num_element(number_of_followers_string)
|
30
|
+
# end
|
31
|
+
|
32
|
+
number_of_followers
|
19
33
|
end
|
20
34
|
|
21
35
|
def find_number_followers_for_normal_profile(profile_followers_node)
|
@@ -61,6 +75,7 @@ module Forki
|
|
61
75
|
verified: profile_header_obj["user"]["is_verified"],
|
62
76
|
profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
|
63
77
|
profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
|
78
|
+
number_of_likes: find_number_of_likes(profile_header_str),
|
64
79
|
}
|
65
80
|
end
|
66
81
|
|
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: thor
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
description:
|
84
98
|
email:
|
85
99
|
- ''
|
@@ -99,12 +113,21 @@ files:
|
|
99
113
|
- README.md
|
100
114
|
- Rakefile
|
101
115
|
- bin/console
|
116
|
+
- bin/generate_sieve
|
117
|
+
- bin/generator_templates/image_sieve_template.rb.erb
|
118
|
+
- bin/generator_templates/image_sieve_test_template.rb.erb
|
119
|
+
- bin/generator_templates/video_sieve_template.rb.erb
|
120
|
+
- bin/generator_templates/video_sieve_test_template.rb.erb
|
102
121
|
- bin/setup
|
103
122
|
- forki.gemspec
|
104
123
|
- lib/forki.rb
|
105
124
|
- lib/forki/post.rb
|
106
125
|
- lib/forki/scrapers/post_scraper.rb
|
107
126
|
- lib/forki/scrapers/scraper.rb
|
127
|
+
- lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb
|
128
|
+
- lib/forki/scrapers/sieves/video_sieves/video_sieve.rb
|
129
|
+
- lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb
|
130
|
+
- lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb
|
108
131
|
- lib/forki/scrapers/user_scraper.rb
|
109
132
|
- lib/forki/user.rb
|
110
133
|
- lib/forki/version.rb
|