forki 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +7 -6
- data/bin/generate_sieve +79 -0
- data/bin/generator_templates/image_sieve_template.rb.erb +63 -0
- data/bin/generator_templates/image_sieve_test_template.rb.erb +42 -0
- data/bin/generator_templates/video_sieve_template.rb.erb +61 -0
- data/bin/generator_templates/video_sieve_test_template.rb.erb +41 -0
- data/forki.gemspec +2 -0
- data/lib/forki/scrapers/post_scraper.rb +20 -6
- data/lib/forki/scrapers/scraper.rb +29 -12
- data/lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb +25 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve.rb +24 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb +66 -0
- data/lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb +91 -0
- data/lib/forki/scrapers/user_scraper.rb +20 -5
- data/lib/forki/version.rb +1 -1
- metadata +25 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7a1b2f6a831ebac1bf9e79cc33818aa3d8f638459f538ff4f0cb92bab55b16df
|
4
|
+
data.tar.gz: 1df3b090db0ba37ecfbcf17933d41334620e9542a75f3328d9a335adec5d1ae9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b65c46157a6d1f320345d0216d138239c9d73cb29fd7fc0fa504655e0c789021a558d2e75a0447f026dc694e4fb95765cf12f8b2350e147b1011e670cd40621f
|
7
|
+
data.tar.gz: bc62a4decd0205e75ac93038d5768ff4d9ec55d8a25980dbb8522b3b2296f84e50db7caa3ba8aeb28d61eb482957b4d4bf68a9d9658cc26f4209198a64b1fe04
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.1.
|
4
|
+
forki (0.1.4)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
@@ -37,7 +37,7 @@ GEM
|
|
37
37
|
ast (2.4.2)
|
38
38
|
builder (3.2.4)
|
39
39
|
byebug (11.1.3)
|
40
|
-
capybara (3.39.
|
40
|
+
capybara (3.39.2)
|
41
41
|
addressable
|
42
42
|
matrix
|
43
43
|
mini_mime (>= 0.1.3)
|
@@ -66,11 +66,11 @@ GEM
|
|
66
66
|
minitest (5.18.0)
|
67
67
|
nokogiri (1.15.1-arm64-darwin)
|
68
68
|
racc (~> 1.4)
|
69
|
-
oj (3.
|
69
|
+
oj (3.15.1)
|
70
70
|
parallel (1.23.0)
|
71
71
|
parser (3.2.2.1)
|
72
72
|
ast (~> 2.4.1)
|
73
|
-
public_suffix (5.0.
|
73
|
+
public_suffix (5.0.3)
|
74
74
|
racc (1.6.2)
|
75
75
|
rack (2.2.4)
|
76
76
|
rack-test (2.1.0)
|
@@ -127,7 +127,7 @@ GEM
|
|
127
127
|
rubocop-rails (~> 2.0)
|
128
128
|
ruby-progressbar (1.13.0)
|
129
129
|
rubyzip (2.3.2)
|
130
|
-
selenium-webdriver (4.
|
130
|
+
selenium-webdriver (4.10.0)
|
131
131
|
rexml (~> 3.2, >= 3.2.5)
|
132
132
|
rubyzip (>= 1.2.2, < 3.0)
|
133
133
|
websocket (~> 1.0)
|
@@ -138,7 +138,7 @@ GEM
|
|
138
138
|
concurrent-ruby (~> 1.0)
|
139
139
|
unicode-display_width (2.4.2)
|
140
140
|
websocket (1.2.9)
|
141
|
-
websocket-driver (0.7.
|
141
|
+
websocket-driver (0.7.6)
|
142
142
|
websocket-extensions (>= 0.1.0)
|
143
143
|
websocket-extensions (0.1.5)
|
144
144
|
xpath (3.2.0)
|
@@ -160,6 +160,7 @@ DEPENDENCIES
|
|
160
160
|
rubocop (~> 1.7)
|
161
161
|
rubocop-rails (~> 2.17.3)
|
162
162
|
rubocop-rails_config
|
163
|
+
thor
|
163
164
|
|
164
165
|
BUNDLED WITH
|
165
166
|
2.3.11
|
data/bin/generate_sieve
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require "bundler/setup"
|
5
|
+
require "forki"
|
6
|
+
require "thor"
|
7
|
+
require "erb"
|
8
|
+
|
9
|
+
class GenerateSieve < Thor
|
10
|
+
|
11
|
+
desc "generate TYPE NAME", "generate a TYPE named NAME"
|
12
|
+
option :video
|
13
|
+
option :image
|
14
|
+
def generate(type, name)
|
15
|
+
case type
|
16
|
+
when "post"
|
17
|
+
if !options[:video].nil?
|
18
|
+
style = :video
|
19
|
+
elsif !options[:image].nil?
|
20
|
+
style = :image
|
21
|
+
else
|
22
|
+
puts "Must indicate either video or image flag"
|
23
|
+
exit
|
24
|
+
end
|
25
|
+
|
26
|
+
SieveGenerator.generate_post_sieve(name, style)
|
27
|
+
when "user"
|
28
|
+
SieveGenerator.generate_user_sieve(name)
|
29
|
+
else
|
30
|
+
puts "Type must be `post` or `user` only. `#{type}` passed in."
|
31
|
+
exit
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
class SieveGenerator
|
37
|
+
def self.generate_post_sieve(name, style)
|
38
|
+
puts "Generating post sieve named #{name} with style #{style}"
|
39
|
+
|
40
|
+
file_path = "./lib/forki/scrapers/sieves/"
|
41
|
+
test_path = "./test/sieves/"
|
42
|
+
case style
|
43
|
+
when :video
|
44
|
+
test_data_valid_path = "#{test_path}/video_sieves/test_data/video_sieve_#{name}_valid.json"
|
45
|
+
test_data_invalid_path = "#{test_path}/video_sieves/test_data/video_sieve_#{name}_invalid.json"
|
46
|
+
|
47
|
+
file_path += "video_sieves/video_sieve_#{name}.rb"
|
48
|
+
test_path += "video_sieves/video_sieve_#{name}_test.rb"
|
49
|
+
file_template = File.read("./bin/generator_templates/video_sieve_template.rb.erb")
|
50
|
+
test_file_template = File.read("./bin/generator_templates/video_sieve_test_template.rb.erb")
|
51
|
+
when :image
|
52
|
+
test_data_valid_path = "#{test_path}/image_sieves/test_data/image_sieve_#{name}_valid.json"
|
53
|
+
test_data_invalid_path = "#{test_path}/image_sieves/test_data/image_sieve_#{name}_invalid.json"
|
54
|
+
|
55
|
+
file_path += "image_sieves/image_sieve_#{name}.rb"
|
56
|
+
test_path += "image_sieves/image_sieve_#{name}_test.rb"
|
57
|
+
file_template = File.read("./bin/generator_templates/image_sieve_template.rb.erb")
|
58
|
+
test_file_template = File.read("./bin/generator_templates/image_sieve_test_template.rb.erb")
|
59
|
+
end
|
60
|
+
|
61
|
+
file_contents = ERB.new(file_template)
|
62
|
+
test_file_contents = ERB.new(test_file_template)
|
63
|
+
|
64
|
+
camel_name = name.split('_').collect(&:capitalize).join
|
65
|
+
|
66
|
+
File.write(file_path, file_contents.result(binding))
|
67
|
+
File.write(test_path, test_file_contents.result(binding))
|
68
|
+
|
69
|
+
File.write(test_data_valid_path, "")
|
70
|
+
File.write(test_data_invalid_path, "")
|
71
|
+
end
|
72
|
+
|
73
|
+
def self.generate_user_sieve(name)
|
74
|
+
puts "Generating user sieve named #{name}"
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
GenerateSieve.start(ARGV)
|
79
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# NOTE: This is not implemented yet, just here for filler
|
2
|
+
|
3
|
+
class ImageSieve<%= camel_name %> < ImageSieve
|
4
|
+
# To check if it's valid for the inputted graphql objects
|
5
|
+
def self.check(graphql_objects)
|
6
|
+
image_object = self.extractor(graphql_objects)
|
7
|
+
|
8
|
+
true
|
9
|
+
rescue StandardError
|
10
|
+
return false
|
11
|
+
end
|
12
|
+
|
13
|
+
# output the expected format of:
|
14
|
+
#
|
15
|
+
# post_details = {
|
16
|
+
# id: video_object["id"],
|
17
|
+
# num_comments: num_comments,
|
18
|
+
# num_shares: share_count_object.fetch("count", nil),
|
19
|
+
# num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
|
20
|
+
# reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
|
21
|
+
# video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
22
|
+
# video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
23
|
+
# text: text,
|
24
|
+
# created_at: creation_date,
|
25
|
+
# profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
26
|
+
# has_video: true
|
27
|
+
# }
|
28
|
+
# post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
|
29
|
+
# post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
|
30
|
+
# post_details[:reactions] = reaction_counts
|
31
|
+
|
32
|
+
def self.sieve(graphql_objects)
|
33
|
+
image_object = self.extractor(graphql_objects)
|
34
|
+
|
35
|
+
post_details = {
|
36
|
+
id: nil,
|
37
|
+
num_comments: nil,
|
38
|
+
num_shared: nil,
|
39
|
+
num_views: nil,
|
40
|
+
reshare_warning: nil,
|
41
|
+
video_preview_image_url: nil,
|
42
|
+
video_url: nil,
|
43
|
+
text: nil,
|
44
|
+
created_at: nil,
|
45
|
+
profile_link: nil,
|
46
|
+
has_video: false,
|
47
|
+
video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
|
48
|
+
video_file: Forki.retrieve_media(video_url),
|
49
|
+
reactions: nil
|
50
|
+
}
|
51
|
+
end
|
52
|
+
|
53
|
+
private
|
54
|
+
|
55
|
+
def self.extractor(graphql_objects)
|
56
|
+
image_objects = graphql_objects.filter do |go|
|
57
|
+
# go = go.first if go.kind_of?(Array) && !go.empty?
|
58
|
+
# go.has_key?("image")
|
59
|
+
end
|
60
|
+
|
61
|
+
# image_objects.first.dig("image", "creation_story")
|
62
|
+
end
|
63
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "test_helper"
|
4
|
+
|
5
|
+
# rubocop:disable Metrics/ClassLength
|
6
|
+
class ImageSieve<%= camel_name %>Test < Minitest::Test
|
7
|
+
def setup
|
8
|
+
@valid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_valid.json"))
|
9
|
+
@invalid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_invalid.json"))
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_sieve_properly_fails_check
|
13
|
+
assert ImageSieve<%= camel_name %>.check(@invalid_json) == false
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_sieve_properly_passes_check
|
17
|
+
assert ImageSieve<%= camel_name %>.check(@valid_json)
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_sieve_can_sieve_properly
|
21
|
+
result = ImageSieve<%= camel_name %>.sieve(@valid_json)
|
22
|
+
|
23
|
+
# TODO: Update the values for the post you're testing
|
24
|
+
# MAINTAINER TODO: FIX THIS FOR IMAGES
|
25
|
+
assert_equal "394367115960503", result[:id]
|
26
|
+
assert_equal 173, result[:num_comments]
|
27
|
+
assert_equal nil, result[:num_shared]
|
28
|
+
assert_equal nil, result[:num_views]
|
29
|
+
assert_equal false, result[:reshare_warning]
|
30
|
+
assert_not_nil result[:video_preview_image_url]
|
31
|
+
assert_not_nil result[:video_url]
|
32
|
+
assert_equal nil, text
|
33
|
+
assert_equal 1654989063, result[:created_at]
|
34
|
+
assert_equal nil, result[:profile_link]
|
35
|
+
assert_equal false, result[:has_video]
|
36
|
+
assert_not_nil result[:video_preview_image_file]
|
37
|
+
assert_not_nil result[:video_file]
|
38
|
+
assert_not_nil result[:reactions]
|
39
|
+
|
40
|
+
assert result[:reactions].kind_of?(Array)
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,61 @@
|
|
1
|
+
class VideoSieve<%= camel_name %> < VideoSieve
|
2
|
+
# To check if it's valid for the inputted graphql objects
|
3
|
+
def self.check(graphql_objects)
|
4
|
+
video_object = self.extractor(graphql_objects)
|
5
|
+
|
6
|
+
true
|
7
|
+
rescue StandardError
|
8
|
+
return false
|
9
|
+
end
|
10
|
+
|
11
|
+
# output the expected format of:
|
12
|
+
#
|
13
|
+
# post_details = {
|
14
|
+
# id: video_object["id"],
|
15
|
+
# num_comments: num_comments,
|
16
|
+
# num_shares: share_count_object.fetch("count", nil),
|
17
|
+
# num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
|
18
|
+
# reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
|
19
|
+
# video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
20
|
+
# video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
21
|
+
# text: text,
|
22
|
+
# created_at: creation_date,
|
23
|
+
# profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
24
|
+
# has_video: true
|
25
|
+
# }
|
26
|
+
# post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
|
27
|
+
# post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
|
28
|
+
# post_details[:reactions] = reaction_counts
|
29
|
+
|
30
|
+
def self.sieve(graphql_objects)
|
31
|
+
video_object = self.extractor(graphql_objects)
|
32
|
+
|
33
|
+
post_details = {
|
34
|
+
id: nil,
|
35
|
+
num_comments: nil,
|
36
|
+
num_shared: nil,
|
37
|
+
num_views: nil,
|
38
|
+
reshare_warning: nil,
|
39
|
+
video_preview_image_url: nil,
|
40
|
+
video_url: nil,
|
41
|
+
text: nil,
|
42
|
+
created_at: nil,
|
43
|
+
profile_link: nil,
|
44
|
+
has_video: true,
|
45
|
+
video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
|
46
|
+
video_file: Forki.retrieve_media(video_url),
|
47
|
+
reactions: nil
|
48
|
+
}
|
49
|
+
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
def self.extractor(graphql_objects)
|
54
|
+
video_objects = graphql_objects.filter do |go|
|
55
|
+
# go = go.first if go.kind_of?(Array) && !go.empty?
|
56
|
+
# go.has_key?("video")
|
57
|
+
end
|
58
|
+
|
59
|
+
# video_objects.first.dig("video", "creation_story")
|
60
|
+
end
|
61
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "test_helper"
|
4
|
+
|
5
|
+
# rubocop:disable Metrics/ClassLength
|
6
|
+
class VideoSieve<%= camel_name %>Test < Minitest::Test
|
7
|
+
def setup
|
8
|
+
@valid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_valid.json"))
|
9
|
+
@invalid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_invalid.json"))
|
10
|
+
end
|
11
|
+
|
12
|
+
def test_sieve_properly_fails_check
|
13
|
+
assert VideoSieve<%= camel_name %>.check(@invalid_json) == false
|
14
|
+
end
|
15
|
+
|
16
|
+
def test_sieve_properly_passes_check
|
17
|
+
assert VideoSieve<%= camel_name %>.check(@valid_json)
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_sieve_can_sieve_properly
|
21
|
+
result = VideoSieve<%= camel_name %>.sieve(@valid_json)
|
22
|
+
|
23
|
+
# TODO: Update the values for the post you're testing
|
24
|
+
assert_equal "394367115960503", result[:id]
|
25
|
+
assert_equal 173, result[:num_comments]
|
26
|
+
assert_equal nil, result[:num_shared]
|
27
|
+
assert_equal nil, result[:num_views]
|
28
|
+
assert_equal false, result[:reshare_warning]
|
29
|
+
assert_not_nil result[:video_preview_image_url]
|
30
|
+
assert_not_nil result[:video_url]
|
31
|
+
assert_equal nil, text
|
32
|
+
assert_equal 1654989063, result[:created_at]
|
33
|
+
assert_equal nil, result[:profile_link]
|
34
|
+
assert_equal true, result[:has_video]
|
35
|
+
assert_not_nil result[:video_preview_image_file]
|
36
|
+
assert_not_nil result[:video_file]
|
37
|
+
assert_not_nil result[:reactions]
|
38
|
+
|
39
|
+
assert result[:reactions].kind_of?(Array)
|
40
|
+
end
|
41
|
+
end
|
data/forki.gemspec
CHANGED
@@ -37,6 +37,8 @@ Gem::Specification.new do |spec|
|
|
37
37
|
spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
|
38
38
|
spec.add_dependency "selenium-webdriver" # Webdriver selenium
|
39
39
|
|
40
|
+
spec.add_development_dependency "thor" # For the generator
|
41
|
+
|
40
42
|
# For more information and examples about making a new gem, checkout our
|
41
43
|
# guide at: https://bundler.io/guides/creating_gem.html
|
42
44
|
end
|
@@ -4,6 +4,7 @@ require "typhoeus"
|
|
4
4
|
require "securerandom"
|
5
5
|
require "byebug"
|
6
6
|
|
7
|
+
|
7
8
|
module Forki
|
8
9
|
# rubocop:disable Metrics/ClassLength
|
9
10
|
class PostScraper < Scraper
|
@@ -154,21 +155,30 @@ module Forki
|
|
154
155
|
end
|
155
156
|
|
156
157
|
graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
|
158
|
+
|
159
|
+
# Once in awhile it's really easy
|
160
|
+
video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
|
161
|
+
|
162
|
+
if VideoSieve.can_process_with_sieve?(graphql_object_array)
|
163
|
+
# Eventually all of this complexity will be replaced with this
|
164
|
+
return VideoSieve.sieve_for_graphql_objects(graphql_object_array)
|
165
|
+
end
|
166
|
+
|
157
167
|
story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
|
158
168
|
story_node_object = story_node_object || graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video
|
159
169
|
|
160
170
|
return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
|
161
171
|
|
162
172
|
if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
|
163
|
-
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
|
164
|
-
creation_date = video_object["publish_time"]
|
165
|
-
|
173
|
+
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
|
174
|
+
creation_date = video_object["publish_time"] if video_object&.has_key("publish_time")
|
175
|
+
creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"] if creation_date.nil?
|
166
176
|
elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
|
167
177
|
# For "Reels" we need a separate way to parse this
|
168
178
|
video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
|
169
179
|
creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["creation_time"]
|
170
180
|
else
|
171
|
-
raise "Unable to parse video object"
|
181
|
+
raise "Unable to parse video object" if video_objects.empty?
|
172
182
|
end
|
173
183
|
|
174
184
|
feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
|
@@ -191,7 +201,7 @@ module Forki
|
|
191
201
|
num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
|
192
202
|
reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
|
193
203
|
video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
194
|
-
video_url: video_object["
|
204
|
+
video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
195
205
|
text: text,
|
196
206
|
created_at: creation_date,
|
197
207
|
profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
@@ -217,7 +227,7 @@ module Forki
|
|
217
227
|
num_views: feedback_object["video_view_count"],
|
218
228
|
reshare_warning: feedback_object["should_show_reshare_warning"],
|
219
229
|
video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
|
220
|
-
video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
|
230
|
+
video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["browser_native_hd_url"] || video_object["video"]["browser_native_sd_url"] || video_object["video"]["playable_url"],
|
221
231
|
text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
|
222
232
|
created_at: video_object["video"]["publish_time"],
|
223
233
|
profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
@@ -369,6 +379,7 @@ module Forki
|
|
369
379
|
def parse(url)
|
370
380
|
validate_and_load_page(url)
|
371
381
|
graphql_strings = find_graphql_data_strings(page.html)
|
382
|
+
|
372
383
|
post_data = extract_post_data(graphql_strings)
|
373
384
|
post_data[:url] = url
|
374
385
|
user_url = post_data[:profile_link]
|
@@ -398,3 +409,6 @@ module Forki
|
|
398
409
|
end
|
399
410
|
end
|
400
411
|
end
|
412
|
+
|
413
|
+
require_relative "sieves/video_sieves/video_sieve"
|
414
|
+
|
@@ -7,6 +7,7 @@ require "oj"
|
|
7
7
|
require "selenium-webdriver"
|
8
8
|
require "open-uri"
|
9
9
|
require "selenium/webdriver/remote/http/curb"
|
10
|
+
require "cgi"
|
10
11
|
|
11
12
|
options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
|
12
13
|
options.add_argument("--start-maximized")
|
@@ -190,9 +191,21 @@ module Forki
|
|
190
191
|
|
191
192
|
visit "https://www.facebook.com"
|
192
193
|
login
|
194
|
+
|
193
195
|
visit url unless current_url.start_with?(url)
|
196
|
+
# # If the video is a watch page it doesn't have most of the data we want so we click on the video
|
197
|
+
# if url.include?("watch/live")
|
198
|
+
# clickable_element = find("video")
|
199
|
+
|
200
|
+
# while(clickable_element.obscured?)
|
201
|
+
# clickable_element = clickable_element.find(:xpath, "..")
|
202
|
+
# end
|
203
|
+
|
204
|
+
# clickable_element.click
|
205
|
+
# end
|
194
206
|
end
|
195
207
|
|
208
|
+
|
196
209
|
# Extracts an integer out of a string describing a number
|
197
210
|
# e.g. "4K Comments" returns 4000
|
198
211
|
# e.g. "131 Shares" returns 131
|
@@ -203,19 +216,23 @@ module Forki
|
|
203
216
|
element = element.text(:all)
|
204
217
|
end
|
205
218
|
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
interaction_num_text.to_i + interaction_num_text[-2].to_i * 100
|
211
|
-
elsif interaction_num_text.include?("K") # e.g. "13K"
|
212
|
-
interaction_num_text.to_i * 1000
|
213
|
-
elsif interaction_num_text.include?("M") # e.g. "13M"
|
214
|
-
interaction_num_text.to_i * 1_000_000
|
215
|
-
else # e.g. "15,443"
|
216
|
-
interaction_num_text.delete!(",")
|
217
|
-
interaction_num_text.delete(" ").to_i
|
219
|
+
# Check if there's a modifier i.e. `K` or `M` if there isn't just return the number
|
220
|
+
unless element.include?("K") || element.include?("M")
|
221
|
+
element.delete(",") # "5,456" e.g.
|
222
|
+
return element.to_i
|
218
223
|
end
|
224
|
+
|
225
|
+
modifier = element[-1]
|
226
|
+
number = element[0...-1].to_f
|
227
|
+
|
228
|
+
case modifier
|
229
|
+
when "K"
|
230
|
+
number = number * 1_000
|
231
|
+
when "M"
|
232
|
+
number = number * 1_000_000
|
233
|
+
end
|
234
|
+
|
235
|
+
number.to_i
|
219
236
|
end
|
220
237
|
end
|
221
238
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
class ImageSieve
|
2
|
+
def self.can_process_with_sieve?(graphql_objects)
|
3
|
+
!sieve_class_for_graphql_objects(graphql_objects).nil?
|
4
|
+
end
|
5
|
+
|
6
|
+
def self.sieve_for_graphql_objects(graphql_objects)
|
7
|
+
|
8
|
+
sieve = sieve_class_for_graphql_objects(graphql_objects)
|
9
|
+
return nil if sieve.nil?
|
10
|
+
|
11
|
+
sieve.sieve(graphql_objects)
|
12
|
+
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
def self.sieve_class_for_graphql_objects(graphql_objects)
|
17
|
+
sieves = []
|
18
|
+
sieves.detect { |sieve| sieve.check(graphql_objects) }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
|
23
|
+
Dir['./lib/forki/scrapers/sieves/image_sieves/*.rb'].each do |file|
|
24
|
+
require file unless file.end_with?("image_sieve.rb")
|
25
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
class VideoSieve
|
2
|
+
def self.can_process_with_sieve?(graphql_objects)
|
3
|
+
!sieve_class_for_graphql_objects(graphql_objects).nil?
|
4
|
+
end
|
5
|
+
|
6
|
+
def self.sieve_for_graphql_objects(graphql_objects)
|
7
|
+
|
8
|
+
sieve = sieve_class_for_graphql_objects(graphql_objects)
|
9
|
+
return nil if sieve.nil?
|
10
|
+
|
11
|
+
sieve.sieve(graphql_objects)
|
12
|
+
end
|
13
|
+
|
14
|
+
private
|
15
|
+
|
16
|
+
def self.sieve_class_for_graphql_objects(graphql_objects)
|
17
|
+
sieves = [VideoSieveWatchTab, VideoSieveVideoPage]
|
18
|
+
sieves.detect { |sieve| sieve.check(graphql_objects) }
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
Dir['./lib/forki/scrapers/sieves/video_sieves/*.rb'].each do |file|
|
23
|
+
require file unless file.end_with?("video_sieve.rb")
|
24
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
class VideoSieveVideoPage < VideoSieve
|
2
|
+
# To check if it's valid for the inputted graphql objects
|
3
|
+
def self.check(graphql_objects)
|
4
|
+
story_node_object = self.extractor(graphql_objects) # This will error out
|
5
|
+
return false unless story_node_object["content"]["story"]["attachments"].first["styles"]["attachment"].has_key?("media")
|
6
|
+
|
7
|
+
true
|
8
|
+
rescue StandardError
|
9
|
+
return false
|
10
|
+
end
|
11
|
+
|
12
|
+
# output the expected format of:
|
13
|
+
#
|
14
|
+
# post_details = {
|
15
|
+
# id: video_object["id"],
|
16
|
+
# num_comments: num_comments,
|
17
|
+
# num_shares: share_count_object.fetch("count", nil),
|
18
|
+
# num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
|
19
|
+
# reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
|
20
|
+
# video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
21
|
+
# video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
22
|
+
# text: text,
|
23
|
+
# created_at: creation_date,
|
24
|
+
# profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
25
|
+
# has_video: true
|
26
|
+
# }
|
27
|
+
# post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
|
28
|
+
# post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
|
29
|
+
# post_details[:reactions] = reaction_counts
|
30
|
+
|
31
|
+
def self.sieve(graphql_objects)
|
32
|
+
extracted_text = self.extractor(graphql_objects)
|
33
|
+
|
34
|
+
story_object = extracted_text["content"]["story"]
|
35
|
+
video_object = extracted_text["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
|
36
|
+
feedback_object = extracted_text["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
|
37
|
+
|
38
|
+
video_preview_image_url = video_object["preferred_thumbnail"]["image"]["uri"]
|
39
|
+
video_url = video_object["browser_native_hd_url"]
|
40
|
+
video_url = video_object["browser_native_sd_url"] if video_url.nil?
|
41
|
+
|
42
|
+
post_details = {
|
43
|
+
id: video_object["id"],
|
44
|
+
num_comments: feedback_object["total_comment_count"],
|
45
|
+
num_shared: feedback_object["share_count"]["count"],
|
46
|
+
num_views: nil,
|
47
|
+
reshare_warning: feedback_object["should_show_reshare_warning"],
|
48
|
+
video_preview_image_url: video_preview_image_url,
|
49
|
+
video_url: video_url,
|
50
|
+
text: story_object["message"]["text"],
|
51
|
+
created_at: video_object["publish_time"],
|
52
|
+
profile_link: story_object["actors"].first["url"],
|
53
|
+
has_video: true,
|
54
|
+
video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
|
55
|
+
video_file: Forki.retrieve_media(video_url),
|
56
|
+
reactions: feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
|
57
|
+
}
|
58
|
+
end
|
59
|
+
|
60
|
+
private
|
61
|
+
|
62
|
+
def self.extractor(graphql_objects)
|
63
|
+
story_node_object = graphql_objects.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
|
64
|
+
story_node_object["comet_sections"]
|
65
|
+
end
|
66
|
+
end
|
@@ -0,0 +1,91 @@
|
|
1
|
+
# This is for the "watch" tab style videos https://www.facebook.com/watch/live/?v=394367115960503
|
2
|
+
|
3
|
+
class VideoSieveWatchTab < VideoSieve
|
4
|
+
# To check if it's valid for the inputted graphql objects
|
5
|
+
def self.check(graphql_objects)
|
6
|
+
video_object = self.extractor(graphql_objects)
|
7
|
+
return false if video_object.nil?
|
8
|
+
|
9
|
+
video_object = video_object["attachments"]
|
10
|
+
return false if video_object.nil?
|
11
|
+
|
12
|
+
return false unless video_object.kind_of?(Array) && !video_object.empty?
|
13
|
+
|
14
|
+
video_object = video_object.first
|
15
|
+
return false unless video_object.kind_of?(Hash) && video_object.keys.include?("media")
|
16
|
+
|
17
|
+
true
|
18
|
+
rescue StandardError
|
19
|
+
return false
|
20
|
+
end
|
21
|
+
|
22
|
+
# output the expected format of:
|
23
|
+
#
|
24
|
+
# post_details = {
|
25
|
+
# id: video_object["id"],
|
26
|
+
# num_comments: num_comments,
|
27
|
+
# num_shares: share_count_object.fetch("count", nil),
|
28
|
+
# num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
|
29
|
+
# reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
|
30
|
+
# video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
|
31
|
+
# video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
|
32
|
+
# text: text,
|
33
|
+
# created_at: creation_date,
|
34
|
+
# profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
|
35
|
+
# has_video: true
|
36
|
+
# }
|
37
|
+
# post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
|
38
|
+
# post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
|
39
|
+
# post_details[:reactions] = reaction_counts
|
40
|
+
|
41
|
+
def self.sieve(graphql_objects)
|
42
|
+
video_object = self.extractor(graphql_objects)
|
43
|
+
|
44
|
+
video_url = video_object["attachments"].first["media"]["browser_native_sd_url"]
|
45
|
+
video_preview_image_url = video_object["attachments"].first["media"]["preferred_thumbnail"]["image"]["uri"]
|
46
|
+
|
47
|
+
if !video_object["feedback_context"].nil?
|
48
|
+
feedback_object = video_object["feedback_context"]["feedback_target_with_context"]
|
49
|
+
else
|
50
|
+
feedback_object = graphql_objects.find { |go| !go.dig("feedback", "total_comment_count").nil? }
|
51
|
+
feedback_object = feedback_object["feedback"] if feedback_object.has_key?("feedback")
|
52
|
+
end
|
53
|
+
|
54
|
+
profile_link = video_object["attachments"].first["media"]["owner"]["url"]
|
55
|
+
if profile_link.nil?
|
56
|
+
filtered_json = graphql_objects.find { |go| go.has_key? "attachments" }
|
57
|
+
profile_link = filtered_json["attachments"].first["media"]["creation_story"]["comet_sections"]["title"]["story"]["actors"].first["url"]
|
58
|
+
end
|
59
|
+
|
60
|
+
post_details = {
|
61
|
+
id: video_object.dig("shareable", "id") || video_object["attachments"].first["media"]["id"],
|
62
|
+
num_comments: feedback_object["total_comment_count"],
|
63
|
+
num_shared: nil, # This is not associated with these videos in this format
|
64
|
+
num_views: nil, # This is not associated with these videos in this format
|
65
|
+
reshare_warning: feedback_object["should_show_reshare_warning"],
|
66
|
+
video_preview_image_url: video_preview_image_url,
|
67
|
+
video_url: video_url,
|
68
|
+
text: nil, # There is no text associated with these videos
|
69
|
+
created_at: video_object["attachments"].first["media"]["publish_time"],
|
70
|
+
profile_link: profile_link,
|
71
|
+
has_video: true,
|
72
|
+
video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
|
73
|
+
video_file: Forki.retrieve_media(video_url),
|
74
|
+
reactions: feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
|
75
|
+
}
|
76
|
+
end
|
77
|
+
|
78
|
+
private
|
79
|
+
|
80
|
+
# Finds the story payload of the first video-bearing entry in the scraped GraphQL objects.
#
# Each element of +graphql_objects+ may be a Hash, or an Array wrapping hashes (in which case
# only the array's first element is inspected). The first candidate that has a "video" key wins;
# its "creation_story" is returned, falling back to "story" when "creation_story" is nil.
#
# NOTE: a bare `private` keyword does not apply to `def self.` methods, so this method remains
# publicly callable; use `private_class_method :extractor` if it must be hidden.
#
# @param graphql_objects [Array<Hash, Array>] parsed GraphQL fragments
# @return [Object, nil] the value under video/creation_story (or video/story), or nil when no
#   element carries a "video" key
def self.extractor(graphql_objects)
  graphql_objects.each do |go|
    # Unwrap array-wrapped entries so the lookup below runs on the hash itself, not the array.
    candidate = go.is_a?(Array) ? go.first : go

    # Skip empty arrays (candidate is nil) and anything else that is not a hash with a video.
    next unless candidate.is_a?(Hash) && candidate.key?("video")

    story = candidate.dig("video", "creation_story")
    return story.nil? ? candidate.dig("video", "story") : story
  end

  # No video-bearing object was found; let the caller decide how to handle that.
  nil
end
|
91
|
+
end
|
@@ -3,10 +3,14 @@ require "typhoeus"
|
|
3
3
|
module Forki
|
4
4
|
class UserScraper < Scraper
|
5
5
|
# Finds and returns the number of people who like the current page
|
6
|
-
def find_number_of_likes
|
7
|
-
likes_pattern = /[0-9,.KM ]
|
8
|
-
|
9
|
-
|
6
|
+
# Finds the like count mentioned in a profile details string.
#
# Looks for a run of digits, commas, periods, "K", "M" or spaces followed by " likes" or
# " Likes", and hands the captured text to +extract_int_from_num_element+ (defined outside this
# method; presumably converts text such as "1.2K" into an Integer -- confirm against its definition).
#
# @param profile_details_string [String] text to search for the like count
# @return [Object, nil] whatever +extract_int_from_num_element+ returns for the captured text,
#   or nil when the string contains no like count
def find_number_of_likes(profile_details_string)
  likes_pattern = /(?<num_likes>[0-9,.KM ]+) [lL]ikes/
  number_of_likes_match = likes_pattern.match(profile_details_string)
  return nil if number_of_likes_match.nil?

  extract_int_from_num_element(number_of_likes_match[:num_likes])
end
|
11
15
|
|
12
16
|
# Finds and returns the number of people who follow the current page
|
@@ -14,8 +18,18 @@ module Forki
|
|
14
18
|
followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]) people/
|
15
19
|
alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
|
16
20
|
number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
|
21
|
+
|
17
22
|
return nil if number_of_followers_match.nil?
|
18
|
-
|
23
|
+
|
24
|
+
number_of_followers = extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
|
25
|
+
|
26
|
+
# Note, this is sticking around if we want to use it later
|
27
|
+
# if number_of_followers.nil?
|
28
|
+
# number_of_followers_string = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]["user"]["profile_social_context"]["content"].first["text"]["text"]
|
29
|
+
# number_of_followers = extract_int_from_num_element(number_of_followers_string)
|
30
|
+
# end
|
31
|
+
|
32
|
+
number_of_followers
|
19
33
|
end
|
20
34
|
|
21
35
|
def find_number_followers_for_normal_profile(profile_followers_node)
|
@@ -61,6 +75,7 @@ module Forki
|
|
61
75
|
verified: profile_header_obj["user"]["is_verified"],
|
62
76
|
profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
|
63
77
|
profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
|
78
|
+
number_of_likes: find_number_of_likes(profile_header_str),
|
64
79
|
}
|
65
80
|
end
|
66
81
|
|
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-08-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: thor
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
description:
|
84
98
|
email:
|
85
99
|
- ''
|
@@ -99,12 +113,21 @@ files:
|
|
99
113
|
- README.md
|
100
114
|
- Rakefile
|
101
115
|
- bin/console
|
116
|
+
- bin/generate_sieve
|
117
|
+
- bin/generator_templates/image_sieve_template.rb.erb
|
118
|
+
- bin/generator_templates/image_sieve_test_template.rb.erb
|
119
|
+
- bin/generator_templates/video_sieve_template.rb.erb
|
120
|
+
- bin/generator_templates/video_sieve_test_template.rb.erb
|
102
121
|
- bin/setup
|
103
122
|
- forki.gemspec
|
104
123
|
- lib/forki.rb
|
105
124
|
- lib/forki/post.rb
|
106
125
|
- lib/forki/scrapers/post_scraper.rb
|
107
126
|
- lib/forki/scrapers/scraper.rb
|
127
|
+
- lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb
|
128
|
+
- lib/forki/scrapers/sieves/video_sieves/video_sieve.rb
|
129
|
+
- lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb
|
130
|
+
- lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb
|
108
131
|
- lib/forki/scrapers/user_scraper.rb
|
109
132
|
- lib/forki/user.rb
|
110
133
|
- lib/forki/version.rb
|