forki 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 67f9dde3202683b68acd00f10ec572ba80bb4a749fa369251c5e512bb8304393
4
- data.tar.gz: 55f099e3e63f174c4ae00aef56c27c77a44b5b49414dcfb7f37e32ae8bb9f47f
3
+ metadata.gz: 7a1b2f6a831ebac1bf9e79cc33818aa3d8f638459f538ff4f0cb92bab55b16df
4
+ data.tar.gz: 1df3b090db0ba37ecfbcf17933d41334620e9542a75f3328d9a335adec5d1ae9
5
5
  SHA512:
6
- metadata.gz: 35db430333dcc95259f49d1159e976f3d969b648f80c4af9a6b87c0ae0ff4ae67462f1942797da212861c26aeab7ee982f76b86e3af8563dca561e57d012aaa3
7
- data.tar.gz: 40f0be4238a4b0bfd7fd23309403df139f82e728739211226c753e52ee96cb7ce2300ee4c0c81539c90761f05d8054b2f7a1b1d1e16cd9cd2744263df4ebf16e
6
+ metadata.gz: b65c46157a6d1f320345d0216d138239c9d73cb29fd7fc0fa504655e0c789021a558d2e75a0447f026dc694e4fb95765cf12f8b2350e147b1011e670cd40621f
7
+ data.tar.gz: bc62a4decd0205e75ac93038d5768ff4d9ec55d8a25980dbb8522b3b2296f84e50db7caa3ba8aeb28d61eb482957b4d4bf68a9d9658cc26f4209198a64b1fe04
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- forki (0.1.1)
4
+ forki (0.1.4)
5
5
  apparition
6
6
  capybara
7
7
  oj
@@ -37,7 +37,7 @@ GEM
37
37
  ast (2.4.2)
38
38
  builder (3.2.4)
39
39
  byebug (11.1.3)
40
- capybara (3.39.1)
40
+ capybara (3.39.2)
41
41
  addressable
42
42
  matrix
43
43
  mini_mime (>= 0.1.3)
@@ -66,11 +66,11 @@ GEM
66
66
  minitest (5.18.0)
67
67
  nokogiri (1.15.1-arm64-darwin)
68
68
  racc (~> 1.4)
69
- oj (3.14.3)
69
+ oj (3.15.1)
70
70
  parallel (1.23.0)
71
71
  parser (3.2.2.1)
72
72
  ast (~> 2.4.1)
73
- public_suffix (5.0.1)
73
+ public_suffix (5.0.3)
74
74
  racc (1.6.2)
75
75
  rack (2.2.4)
76
76
  rack-test (2.1.0)
@@ -127,7 +127,7 @@ GEM
127
127
  rubocop-rails (~> 2.0)
128
128
  ruby-progressbar (1.13.0)
129
129
  rubyzip (2.3.2)
130
- selenium-webdriver (4.9.1)
130
+ selenium-webdriver (4.10.0)
131
131
  rexml (~> 3.2, >= 3.2.5)
132
132
  rubyzip (>= 1.2.2, < 3.0)
133
133
  websocket (~> 1.0)
@@ -138,7 +138,7 @@ GEM
138
138
  concurrent-ruby (~> 1.0)
139
139
  unicode-display_width (2.4.2)
140
140
  websocket (1.2.9)
141
- websocket-driver (0.7.5)
141
+ websocket-driver (0.7.6)
142
142
  websocket-extensions (>= 0.1.0)
143
143
  websocket-extensions (0.1.5)
144
144
  xpath (3.2.0)
@@ -160,6 +160,7 @@ DEPENDENCIES
160
160
  rubocop (~> 1.7)
161
161
  rubocop-rails (~> 2.17.3)
162
162
  rubocop-rails_config
163
+ thor
163
164
 
164
165
  BUNDLED WITH
165
166
  2.3.11
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "forki"
6
+ require "thor"
7
+ require "erb"
8
+
9
# Thor command-line entry point for scaffolding new sieves.
#
# Usage:
#   bin/generate_sieve generate post NAME --video
#   bin/generate_sieve generate post NAME --image
#   bin/generate_sieve generate user NAME
class GenerateSieve < Thor
  # Tell Thor to exit with a non-zero status when a command fails (unknown
  # command, missing argument, ...) so failures are visible to calling scripts.
  def self.exit_on_failure?
    true
  end

  desc "generate TYPE NAME", "generate a TYPE named NAME"
  option :video, type: :boolean, desc: "Generate a video post sieve"
  option :image, type: :boolean, desc: "Generate an image post sieve"
  # Dispatches to the matching SieveGenerator method.
  #
  # @param type [String] "post" or "user"
  # @param name [String] snake_case name of the sieve to generate
  def generate(type, name)
    case type
    when "post"
      # --video wins when both flags are passed, as before.
      style = if options[:video]
        :video
      elsif options[:image]
        :image
      end

      if style.nil?
        warn "Must indicate either video or image flag"
        exit 1 # usage error: report failure to the shell
      end

      SieveGenerator.generate_post_sieve(name, style)
    when "user"
      SieveGenerator.generate_user_sieve(name)
    else
      warn "Type must be `post` or `user` only. `#{type}` passed in."
      exit 1 # usage error: report failure to the shell
    end
  end
end
35
+
36
# Writes new sieve skeletons from the ERB templates in ./bin/generator_templates.
#
# Every path is relative to the current working directory, so the generator
# has to be run from the root of the repository.
class SieveGenerator
  SIEVE_ROOT = "./lib/forki/scrapers/sieves"
  TEST_ROOT = "./test/sieves"
  TEMPLATE_ROOT = "./bin/generator_templates"
  POST_STYLES = %i[video image].freeze

  # Generates a post sieve, its test file and two empty JSON fixture files.
  #
  # @param name [String] snake_case sieve name, e.g. "watch_tab"
  # @param style [Symbol] :video or :image
  # @raise [ArgumentError] if style is not one of POST_STYLES
  def self.generate_post_sieve(name, style)
    # Fail before touching the file system; previously an unknown style fell
    # through the `case` and crashed later with an unrelated error.
    unless POST_STYLES.include?(style)
      raise ArgumentError, "Unsupported post sieve style #{style.inspect}, expected one of #{POST_STYLES.inspect}"
    end

    puts "Generating post sieve named #{name} with style #{style}"

    prefix = "#{style}_sieve"
    test_dir = "#{TEST_ROOT}/#{style}_sieves"

    file_path = "#{SIEVE_ROOT}/#{style}_sieves/#{prefix}_#{name}.rb"
    test_path = "#{test_dir}/#{prefix}_#{name}_test.rb"
    test_data_valid_path = "#{test_dir}/test_data/#{prefix}_#{name}_valid.json"
    test_data_invalid_path = "#{test_dir}/test_data/#{prefix}_#{name}_invalid.json"

    file_template = ERB.new(File.read("#{TEMPLATE_ROOT}/#{prefix}_template.rb.erb"))
    test_file_template = ERB.new(File.read("#{TEMPLATE_ROOT}/#{prefix}_test_template.rb.erb"))

    # The templates read `name` and `camel_name` through this binding, so
    # `camel_name` must be assigned even though it is not used directly here.
    camel_name = name.split("_").map(&:capitalize).join
    template_binding = binding

    File.write(file_path, file_template.result(template_binding))
    File.write(test_path, test_file_template.result(template_binding))

    # Fixtures start out empty; the developer pastes real GraphQL JSON into them.
    File.write(test_data_valid_path, "")
    File.write(test_data_invalid_path, "")
  end

  # Placeholder: user sieves are not generated yet, this only announces the request.
  #
  # @param name [String] snake_case sieve name
  def self.generate_user_sieve(name)
    puts "Generating user sieve named #{name}"
  end
end
77
+
78
+ GenerateSieve.start(ARGV)
79
+
@@ -0,0 +1,63 @@
1
+ # NOTE: This is not implemented yet, just here for filler
2
+
3
class ImageSieve<%= camel_name %> < ImageSieve
  # Reports whether this sieve can handle the given GraphQL objects.
  # Any StandardError raised while probing the objects counts as "cannot handle".
  def self.check(graphql_objects)
    # TODO: inspect `image_object` and return false when the expected keys are missing
    image_object = extractor(graphql_objects)

    true
  rescue StandardError
    false
  end

  # output the expected format of:
  #
  # post_details = {
  #   id: video_object["id"],
  #   num_comments: num_comments,
  #   num_shares: share_count_object.fetch("count", nil),
  #   num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
  #   reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
  #   video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
  #   video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
  #   text: text,
  #   created_at: creation_date,
  #   profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
  #   has_video: true
  # }
  # post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  # post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # post_details[:reactions] = reaction_counts
  #
  # NOTE(review): the format above says `num_shares` while the hash below uses
  # `num_shared` - confirm which key the consumers of this hash expect.

  # Builds the post details hash out of the GraphQL objects.
  def self.sieve(graphql_objects)
    image_object = extractor(graphql_objects)

    # TODO: pull the URLs out of `image_object`. They are defined up front
    # because they are needed both as values and to download the media below.
    video_preview_image_url = nil
    video_url = nil

    {
      id: nil,
      num_comments: nil,
      num_shared: nil,
      num_views: nil,
      reshare_warning: nil,
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil,
      created_at: nil,
      profile_link: nil,
      has_video: false,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: nil
    }
  end

  # Narrows the GraphQL objects down to the one(s) this sieve reads from.
  def self.extractor(graphql_objects)
    image_objects = graphql_objects.filter do |go|
      # go = go.first if go.kind_of?(Array) && !go.empty?
      # go.has_key?("image")
    end

    # image_objects.first.dig("image", "creation_story")
  end
  # A bare `private` does not affect `def self.` methods, so hide it explicitly.
  private_class_method :extractor
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ # rubocop:disable Metrics/ClassLength
6
class ImageSieve<%= camel_name %>Test < Minitest::Test
  # Both fixture files are created empty by bin/generate_sieve; fill them with
  # real GraphQL JSON before running, otherwise JSON.parse fails in `setup`.
  def setup
    @valid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_valid.json"))
    @invalid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_invalid.json"))
  end

  def test_sieve_properly_fails_check
    assert_equal false, ImageSieve<%= camel_name %>.check(@invalid_json)
  end

  def test_sieve_properly_passes_check
    assert ImageSieve<%= camel_name %>.check(@valid_json)
  end

  def test_sieve_can_sieve_properly
    result = ImageSieve<%= camel_name %>.sieve(@valid_json)

    # TODO: Update the values for the post you're testing
    # MAINTAINER TODO: FIX THIS FOR IMAGES
    assert_equal "394367115960503", result[:id]
    assert_equal 173, result[:num_comments]
    assert_nil result[:num_shared]
    assert_nil result[:num_views]
    assert_equal false, result[:reshare_warning]
    refute_nil result[:video_preview_image_url]
    refute_nil result[:video_url]
    assert_nil result[:text]
    assert_equal 1654989063, result[:created_at]
    assert_nil result[:profile_link]
    assert_equal false, result[:has_video]
    refute_nil result[:video_preview_image_file]
    refute_nil result[:video_file]
    refute_nil result[:reactions]

    assert_kind_of Array, result[:reactions]
  end
end
@@ -0,0 +1,61 @@
1
class VideoSieve<%= camel_name %> < VideoSieve
  # Reports whether this sieve can handle the given GraphQL objects.
  # Any StandardError raised while probing the objects counts as "cannot handle".
  def self.check(graphql_objects)
    # TODO: inspect `video_object` and return false when the expected keys are missing
    video_object = extractor(graphql_objects)

    true
  rescue StandardError
    false
  end

  # output the expected format of:
  #
  # post_details = {
  #   id: video_object["id"],
  #   num_comments: num_comments,
  #   num_shares: share_count_object.fetch("count", nil),
  #   num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
  #   reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
  #   video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
  #   video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
  #   text: text,
  #   created_at: creation_date,
  #   profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
  #   has_video: true
  # }
  # post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  # post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # post_details[:reactions] = reaction_counts
  #
  # NOTE(review): the format above says `num_shares` while the hash below uses
  # `num_shared` - confirm which key the consumers of this hash expect.

  # Builds the post details hash out of the GraphQL objects.
  def self.sieve(graphql_objects)
    video_object = extractor(graphql_objects)

    # TODO: pull the URLs out of `video_object`. They are defined up front
    # because they are needed both as values and to download the media below.
    video_preview_image_url = nil
    video_url = nil

    {
      id: nil,
      num_comments: nil,
      num_shared: nil,
      num_views: nil,
      reshare_warning: nil,
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil,
      created_at: nil,
      profile_link: nil,
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: nil
    }
  end

  # Narrows the GraphQL objects down to the one(s) this sieve reads from.
  def self.extractor(graphql_objects)
    video_objects = graphql_objects.filter do |go|
      # go = go.first if go.kind_of?(Array) && !go.empty?
      # go.has_key?("video")
    end

    # video_objects.first.dig("video", "creation_story")
  end
  # A bare `private` does not affect `def self.` methods, so hide it explicitly.
  private_class_method :extractor
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ # rubocop:disable Metrics/ClassLength
6
class VideoSieve<%= camel_name %>Test < Minitest::Test
  # Both fixture files are created empty by bin/generate_sieve; fill them with
  # real GraphQL JSON before running, otherwise JSON.parse fails in `setup`.
  def setup
    @valid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_valid.json"))
    @invalid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_invalid.json"))
  end

  def test_sieve_properly_fails_check
    assert_equal false, VideoSieve<%= camel_name %>.check(@invalid_json)
  end

  def test_sieve_properly_passes_check
    assert VideoSieve<%= camel_name %>.check(@valid_json)
  end

  def test_sieve_can_sieve_properly
    result = VideoSieve<%= camel_name %>.sieve(@valid_json)

    # TODO: Update the values for the post you're testing
    assert_equal "394367115960503", result[:id]
    assert_equal 173, result[:num_comments]
    assert_nil result[:num_shared]
    assert_nil result[:num_views]
    assert_equal false, result[:reshare_warning]
    refute_nil result[:video_preview_image_url]
    refute_nil result[:video_url]
    assert_nil result[:text]
    assert_equal 1654989063, result[:created_at]
    assert_nil result[:profile_link]
    assert_equal true, result[:has_video]
    refute_nil result[:video_preview_image_file]
    refute_nil result[:video_file]
    refute_nil result[:reactions]

    assert_kind_of Array, result[:reactions]
  end
end
data/forki.gemspec CHANGED
@@ -37,6 +37,8 @@ Gem::Specification.new do |spec|
37
37
  spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
38
38
  spec.add_dependency "selenium-webdriver" # Webdriver selenium
39
39
 
40
+ spec.add_development_dependency "thor" # For the generator
41
+
40
42
  # For more information and examples about making a new gem, checkout our
41
43
  # guide at: https://bundler.io/guides/creating_gem.html
42
44
  end
@@ -4,6 +4,7 @@ require "typhoeus"
4
4
  require "securerandom"
5
5
  require "byebug"
6
6
 
7
+
7
8
  module Forki
8
9
  # rubocop:disable Metrics/ClassLength
9
10
  class PostScraper < Scraper
@@ -154,21 +155,30 @@ module Forki
154
155
  end
155
156
 
156
157
  graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
158
+
159
+ # Once in awhile it's really easy
160
+ video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
161
+
162
+ if VideoSieve.can_process_with_sieve?(graphql_object_array)
163
+ # Eventually all of this complexity will be replaced with this
164
+ return VideoSieve.sieve_for_graphql_objects(graphql_object_array)
165
+ end
166
+
157
167
  story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
158
168
  story_node_object = story_node_object || graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video
159
169
 
160
170
  return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
161
171
 
162
172
  if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
163
- video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
164
- creation_date = video_object["publish_time"]
165
- # creation_date = video_object["video"]["publish_time"]
173
+ video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
174
+ creation_date = video_object["publish_time"] if video_object&.has_key("publish_time")
175
+ creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"] if creation_date.nil?
166
176
  elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
167
177
  # For "Reels" we need a separate way to parse this
168
178
  video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
169
179
  creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["creation_time"]
170
180
  else
171
- raise "Unable to parse video object"
181
+ raise "Unable to parse video object" if video_objects.empty?
172
182
  end
173
183
 
174
184
  feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
@@ -191,7 +201,7 @@ module Forki
191
201
  num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
192
202
  reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
193
203
  video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
194
- video_url: video_object["playable_url_quality_hd"] || video_object["playable_url"],
204
+ video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
195
205
  text: text,
196
206
  created_at: creation_date,
197
207
  profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
@@ -217,7 +227,7 @@ module Forki
217
227
  num_views: feedback_object["video_view_count"],
218
228
  reshare_warning: feedback_object["should_show_reshare_warning"],
219
229
  video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
220
- video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
230
+ video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["browser_native_hd_url"] || video_object["video"]["browser_native_sd_url"] || video_object["video"]["playable_url"],
221
231
  text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
222
232
  created_at: video_object["video"]["publish_time"],
223
233
  profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
@@ -369,6 +379,7 @@ module Forki
369
379
  def parse(url)
370
380
  validate_and_load_page(url)
371
381
  graphql_strings = find_graphql_data_strings(page.html)
382
+
372
383
  post_data = extract_post_data(graphql_strings)
373
384
  post_data[:url] = url
374
385
  user_url = post_data[:profile_link]
@@ -398,3 +409,6 @@ module Forki
398
409
  end
399
410
  end
400
411
  end
412
+
413
+ require_relative "sieves/video_sieves/video_sieve"
414
+
@@ -7,6 +7,7 @@ require "oj"
7
7
  require "selenium-webdriver"
8
8
  require "open-uri"
9
9
  require "selenium/webdriver/remote/http/curb"
10
+ require "cgi"
10
11
 
11
12
  options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
12
13
  options.add_argument("--start-maximized")
@@ -190,9 +191,21 @@ module Forki
190
191
 
191
192
  visit "https://www.facebook.com"
192
193
  login
194
+
193
195
  visit url unless current_url.start_with?(url)
196
+ # # If the video is a watch page it doesn't have most of the data we want so we click on the video
197
+ # if url.include?("watch/live")
198
+ # clickable_element = find("video")
199
+
200
+ # while(clickable_element.obscured?)
201
+ # clickable_element = clickable_element.find(:xpath, "..")
202
+ # end
203
+
204
+ # clickable_element.click
205
+ # end
194
206
  end
195
207
 
208
+
196
209
  # Extracts an integer out of a string describing a number
197
210
  # e.g. "4K Comments" returns 4000
198
211
  # e.g. "131 Shares" returns 131
@@ -203,19 +216,23 @@ module Forki
203
216
  element = element.text(:all)
204
217
  end
205
218
 
206
- num_pattern = /[0-9KM ,.]+/
207
- interaction_num_text = num_pattern.match(element)[0]
208
-
209
- if interaction_num_text.include?(".") # e.g. "2.2K"
210
- interaction_num_text.to_i + interaction_num_text[-2].to_i * 100
211
- elsif interaction_num_text.include?("K") # e.g. "13K"
212
- interaction_num_text.to_i * 1000
213
- elsif interaction_num_text.include?("M") # e.g. "13M"
214
- interaction_num_text.to_i * 1_000_000
215
- else # e.g. "15,443"
216
- interaction_num_text.delete!(",")
217
- interaction_num_text.delete(" ").to_i
219
+ # Check if there's a modifier i.e. `K` or `M` if there isn't just return the number
220
+ unless element.include?("K") || element.include?("M")
221
+ element.delete(",") # "5,456" e.g.
222
+ return element.to_i
218
223
  end
224
+
225
+ modifier = element[-1]
226
+ number = element[0...-1].to_f
227
+
228
+ case modifier
229
+ when "K"
230
+ number = number * 1_000
231
+ when "M"
232
+ number = number * 1_000_000
233
+ end
234
+
235
+ number.to_i
219
236
  end
220
237
  end
221
238
  end
@@ -0,0 +1,25 @@
1
# Dispatcher for image post sieves.
#
# A sieve is a class responding to `check(graphql_objects)` and
# `sieve(graphql_objects)`. The first registered sieve whose `check` returns a
# truthy value is used. No image sieves are registered yet, so nothing can be
# processed at the moment.
class ImageSieve
  # @param graphql_objects [Array] parsed GraphQL objects scraped from a page
  # @return [Boolean] true when a registered sieve accepts the objects
  def self.can_process_with_sieve?(graphql_objects)
    !sieve_class_for_graphql_objects(graphql_objects).nil?
  end

  # Runs the first matching sieve over the objects.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from a page
  # @return the matching sieve's `sieve` result, or nil when no sieve matches
  def self.sieve_for_graphql_objects(graphql_objects)
    sieve = sieve_class_for_graphql_objects(graphql_objects)
    return nil if sieve.nil?

    sieve.sieve(graphql_objects)
  end

  # Finds the first registered sieve class whose `check` accepts the objects.
  def self.sieve_class_for_graphql_objects(graphql_objects)
    sieves = []
    sieves.find { |sieve| sieve.check(graphql_objects) }
  end
  # A bare `private` does not affect `def self.` methods, so hide it explicitly.
  private_class_method :sieve_class_for_graphql_objects
end
21
+
22
+
23
# Load every concrete image sieve that lives next to this file. The directory
# is resolved from this file's own location instead of the process's working
# directory, so the sieves are found wherever the gem is loaded from.
Dir[File.join(__dir__, "*.rb")].sort.each do |file|
  # Skip this file itself, whatever it is named on disk.
  next if File.expand_path(file) == File.expand_path(__FILE__)

  require file
end
@@ -0,0 +1,24 @@
1
# Dispatcher for video post sieves.
#
# A sieve is a class responding to `check(graphql_objects)` and
# `sieve(graphql_objects)`. Sieves are tried in registration order and the
# first one whose `check` returns a truthy value is used.
class VideoSieve
  # @param graphql_objects [Array] parsed GraphQL objects scraped from a page
  # @return [Boolean] true when a registered sieve accepts the objects
  def self.can_process_with_sieve?(graphql_objects)
    !sieve_class_for_graphql_objects(graphql_objects).nil?
  end

  # Runs the first matching sieve over the objects.
  #
  # @param graphql_objects [Array] parsed GraphQL objects scraped from a page
  # @return the matching sieve's `sieve` result, or nil when no sieve matches
  def self.sieve_for_graphql_objects(graphql_objects)
    sieve = sieve_class_for_graphql_objects(graphql_objects)
    return nil if sieve.nil?

    sieve.sieve(graphql_objects)
  end

  # Finds the first registered sieve class whose `check` accepts the objects.
  # The constants are resolved when this method runs, so the concrete sieve
  # files only need to be loaded before the first lookup.
  def self.sieve_class_for_graphql_objects(graphql_objects)
    sieves = [VideoSieveWatchTab, VideoSieveVideoPage]
    sieves.find { |sieve| sieve.check(graphql_objects) }
  end
  # A bare `private` does not affect `def self.` methods, so hide it explicitly.
  private_class_method :sieve_class_for_graphql_objects
end
21
+
22
# Load every concrete video sieve that lives next to this file. The directory
# is resolved from this file's own location instead of the process's working
# directory, so the sieves are found wherever the gem is loaded from.
Dir[File.join(__dir__, "*.rb")].sort.each do |file|
  # Skip this file itself, whatever it is named on disk.
  next if File.expand_path(file) == File.expand_path(__FILE__)

  require file
end
@@ -0,0 +1,66 @@
1
# Sieve for video posts whose data sits under the "comet_sections" of a
# top-level "node" GraphQL object.
class VideoSieveVideoPage < VideoSieve
  # Reports whether this sieve understands the given GraphQL objects: the
  # first attachment of the story must carry a "media" entry. Any error raised
  # while walking the structure means "no".
  def self.check(graphql_objects)
    sections = self.extractor(graphql_objects) # raises when there is no usable "node" object
    attachment = sections["content"]["story"]["attachments"].first["styles"]["attachment"]

    attachment.has_key?("media")
  rescue StandardError
    false
  end

  # output the expected format of:
  #
  # post_details = {
  #   id: video_object["id"],
  #   num_comments: num_comments,
  #   num_shares: share_count_object.fetch("count", nil),
  #   num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
  #   reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
  #   video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
  #   video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
  #   text: text,
  #   created_at: creation_date,
  #   profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
  #   has_video: true
  # }
  # post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  # post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # post_details[:reactions] = reaction_counts

  # Builds the post details hash. The preview image and the video are
  # downloaded through Forki.retrieve_media while the hash is assembled.
  def self.sieve(graphql_objects)
    sections = self.extractor(graphql_objects)

    story = sections["content"]["story"]
    media = story["attachments"].first["styles"]["attachment"]["media"]
    ufi_feedback = sections["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
    feedback = ufi_feedback["comet_ufi_summary_and_actions_renderer"]["feedback"]

    preview_url = media["preferred_thumbnail"]["image"]["uri"]
    # Prefer the HD rendition and fall back to SD only when HD is absent.
    hd_url = media["browser_native_hd_url"]
    playable_url = hd_url.nil? ? media["browser_native_sd_url"] : hd_url

    {
      id: media["id"],
      num_comments: feedback["total_comment_count"],
      num_shared: feedback["share_count"]["count"],
      num_views: nil,
      reshare_warning: feedback["should_show_reshare_warning"],
      video_preview_image_url: preview_url,
      video_url: playable_url,
      text: story["message"]["text"],
      created_at: media["publish_time"],
      profile_link: story["actors"].first["url"],
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(preview_url),
      video_file: Forki.retrieve_media(playable_url),
      reactions: feedback["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
    }
  end

  private

  # Returns the "comet_sections" of the first object that has a "node" key
  # (user posted video). Raises when no such object exists.
  def self.extractor(graphql_objects)
    node_holder = graphql_objects.find { |candidate| candidate.key?("node") }
    node = node_holder&.fetch("node", nil)
    node["comet_sections"]
  end
end
@@ -0,0 +1,91 @@
1
+ # This is for the "watch" tab style videos https://www.facebook.com/watch/live/?v=394367115960503
2
+
3
# Sieve for "watch" tab style videos, e.g.
# https://www.facebook.com/watch/live/?v=394367115960503
class VideoSieveWatchTab < VideoSieve
  # Reports whether this sieve understands the given GraphQL objects: a video
  # story must exist and its first attachment must carry a "media" entry.
  # Any error raised while walking the structure means "no".
  def self.check(graphql_objects)
    story = self.extractor(graphql_objects)
    return false if story.nil?

    attachments = story["attachments"]
    return false unless attachments.is_a?(Array) && !attachments.empty?

    first_attachment = attachments.first
    first_attachment.is_a?(Hash) && first_attachment.key?("media")
  rescue StandardError
    false
  end

  # output the expected format of:
  #
  # post_details = {
  #   id: video_object["id"],
  #   num_comments: num_comments,
  #   num_shares: share_count_object.fetch("count", nil),
  #   num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
  #   reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
  #   video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
  #   video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
  #   text: text,
  #   created_at: creation_date,
  #   profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
  #   has_video: true
  # }
  # post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  # post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # post_details[:reactions] = reaction_counts

  # Builds the post details hash. The preview image and the video are
  # downloaded through Forki.retrieve_media while the hash is assembled.
  #
  # @raise [RuntimeError] when no video story or no feedback object can be found
  def self.sieve(graphql_objects)
    video_object = self.extractor(graphql_objects)
    raise "Unable to find a watch tab video object" if video_object.nil?

    media = video_object["attachments"].first["media"]
    video_url = media["browser_native_sd_url"]
    video_preview_image_url = media["preferred_thumbnail"]["image"]["uri"]

    feedback_object = feedback_for(video_object, graphql_objects)
    profile_link = profile_link_for(media, graphql_objects)

    {
      id: video_object.dig("shareable", "id") || media["id"],
      num_comments: feedback_object["total_comment_count"],
      num_shared: nil, # This is not associated with these videos in this format
      num_views: nil, # This is not associated with these videos in this format
      reshare_warning: feedback_object["should_show_reshare_warning"],
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil, # There is no text associated with these videos
      created_at: media["publish_time"],
      profile_link: profile_link,
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
    }
  end

  # Returns the story of the first GraphQL object that has a "video" key,
  # preferring "creation_story" over "story". Objects wrapped in an array are
  # unwrapped to their first element, and anything that is not a hash is
  # skipped instead of raising.
  #
  # @return [Hash, nil] the story, or nil when there is no video object
  def self.extractor(graphql_objects)
    video_holder = graphql_objects
      .map { |go| go.is_a?(Array) ? go.first : go }
      .find { |go| go.is_a?(Hash) && go.key?("video") }
    return nil if video_holder.nil?

    video_holder.dig("video", "creation_story") || video_holder.dig("video", "story")
  end

  # Locates the feedback data: on the story itself when it has a
  # "feedback_context", otherwise on the first top-level object that carries a
  # comment count.
  def self.feedback_for(video_object, graphql_objects)
    context = video_object["feedback_context"]
    return context["feedback_target_with_context"] unless context.nil?

    holder = graphql_objects.find do |go|
      go.is_a?(Hash) && !go.dig("feedback", "total_comment_count").nil?
    end
    raise "Unable to find a feedback object for the watch tab video" if holder.nil?

    holder["feedback"]
  end
  private_class_method :feedback_for

  # Reads the owner's profile URL from the media. When the media has no owner
  # URL, falls back to the first actor of the first top-level object that has
  # "attachments".
  def self.profile_link_for(media, graphql_objects)
    owner_url = media.dig("owner", "url")
    return owner_url unless owner_url.nil?

    fallback = graphql_objects.find { |go| go.is_a?(Hash) && go.key?("attachments") }
    fallback["attachments"].first["media"]["creation_story"]["comet_sections"]["title"]["story"]["actors"].first["url"]
  end
  private_class_method :profile_link_for
end
@@ -3,10 +3,14 @@ require "typhoeus"
3
3
  module Forki
4
4
  class UserScraper < Scraper
5
5
  # Finds and returns the number of people who like the current page
6
- def find_number_of_likes
7
- likes_pattern = /[0-9,.KM ] people like this/
8
- number_of_likes_elem = all("span").filter { | span| likes_pattern.match? span.text }.first
9
- extract_int_from_num_element(number_of_likes_elem)
6
# Extracts the number of likes from the profile details text.
#
# Looks for a number (digits with optional `,`, `.`, `K`, `M`) followed by
# "likes" or "Likes".
#
# @param profile_details_string [String] text to search
# @return the result of extract_int_from_num_element for the captured number
#   text, or nil when the text contains no like count
def find_number_of_likes(profile_details_string)
  likes_match = /(?<num_likes>[0-9,.KM ]+) [lL]ikes/.match(profile_details_string)
  return nil if likes_match.nil?

  extract_int_from_num_element(likes_match[:num_likes])
end
11
15
 
12
16
  # Finds and returns the number of people who follow the current page
@@ -14,8 +18,18 @@ module Forki
14
18
  followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]) people/
15
19
  alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
16
20
  number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
21
+
17
22
  return nil if number_of_followers_match.nil?
18
- extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
23
+
24
+ number_of_followers = extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
25
+
26
+ # Note, this is sticking around if we want to use it later
27
+ # if number_of_followers.nil?
28
+ # number_of_followers_string = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]["user"]["profile_social_context"]["content"].first["text"]["text"]
29
+ # number_of_followers = extract_int_from_num_element(number_of_followers_string)
30
+ # end
31
+
32
+ number_of_followers
19
33
  end
20
34
 
21
35
  def find_number_followers_for_normal_profile(profile_followers_node)
@@ -61,6 +75,7 @@ module Forki
61
75
  verified: profile_header_obj["user"]["is_verified"],
62
76
  profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
63
77
  profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
78
+ number_of_likes: find_number_of_likes(profile_header_str),
64
79
  }
65
80
  end
66
81
 
data/lib/forki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Forki
4
- VERSION = "0.1.4"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ''
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-06-20 00:00:00.000000000 Z
11
+ date: 2023-08-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: thor
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  description:
84
98
  email:
85
99
  - ''
@@ -99,12 +113,21 @@ files:
99
113
  - README.md
100
114
  - Rakefile
101
115
  - bin/console
116
+ - bin/generate_sieve
117
+ - bin/generator_templates/image_sieve_template.rb.erb
118
+ - bin/generator_templates/image_sieve_test_template.rb.erb
119
+ - bin/generator_templates/video_sieve_template.rb.erb
120
+ - bin/generator_templates/video_sieve_test_template.rb.erb
102
121
  - bin/setup
103
122
  - forki.gemspec
104
123
  - lib/forki.rb
105
124
  - lib/forki/post.rb
106
125
  - lib/forki/scrapers/post_scraper.rb
107
126
  - lib/forki/scrapers/scraper.rb
127
+ - lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb
128
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve.rb
129
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb
130
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb
108
131
  - lib/forki/scrapers/user_scraper.rb
109
132
  - lib/forki/user.rb
110
133
  - lib/forki/version.rb