forki 0.1.4 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 67f9dde3202683b68acd00f10ec572ba80bb4a749fa369251c5e512bb8304393
4
- data.tar.gz: 55f099e3e63f174c4ae00aef56c27c77a44b5b49414dcfb7f37e32ae8bb9f47f
3
+ metadata.gz: 7a1b2f6a831ebac1bf9e79cc33818aa3d8f638459f538ff4f0cb92bab55b16df
4
+ data.tar.gz: 1df3b090db0ba37ecfbcf17933d41334620e9542a75f3328d9a335adec5d1ae9
5
5
  SHA512:
6
- metadata.gz: 35db430333dcc95259f49d1159e976f3d969b648f80c4af9a6b87c0ae0ff4ae67462f1942797da212861c26aeab7ee982f76b86e3af8563dca561e57d012aaa3
7
- data.tar.gz: 40f0be4238a4b0bfd7fd23309403df139f82e728739211226c753e52ee96cb7ce2300ee4c0c81539c90761f05d8054b2f7a1b1d1e16cd9cd2744263df4ebf16e
6
+ metadata.gz: b65c46157a6d1f320345d0216d138239c9d73cb29fd7fc0fa504655e0c789021a558d2e75a0447f026dc694e4fb95765cf12f8b2350e147b1011e670cd40621f
7
+ data.tar.gz: bc62a4decd0205e75ac93038d5768ff4d9ec55d8a25980dbb8522b3b2296f84e50db7caa3ba8aeb28d61eb482957b4d4bf68a9d9658cc26f4209198a64b1fe04
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- forki (0.1.1)
4
+ forki (0.1.4)
5
5
  apparition
6
6
  capybara
7
7
  oj
@@ -37,7 +37,7 @@ GEM
37
37
  ast (2.4.2)
38
38
  builder (3.2.4)
39
39
  byebug (11.1.3)
40
- capybara (3.39.1)
40
+ capybara (3.39.2)
41
41
  addressable
42
42
  matrix
43
43
  mini_mime (>= 0.1.3)
@@ -66,11 +66,11 @@ GEM
66
66
  minitest (5.18.0)
67
67
  nokogiri (1.15.1-arm64-darwin)
68
68
  racc (~> 1.4)
69
- oj (3.14.3)
69
+ oj (3.15.1)
70
70
  parallel (1.23.0)
71
71
  parser (3.2.2.1)
72
72
  ast (~> 2.4.1)
73
- public_suffix (5.0.1)
73
+ public_suffix (5.0.3)
74
74
  racc (1.6.2)
75
75
  rack (2.2.4)
76
76
  rack-test (2.1.0)
@@ -127,7 +127,7 @@ GEM
127
127
  rubocop-rails (~> 2.0)
128
128
  ruby-progressbar (1.13.0)
129
129
  rubyzip (2.3.2)
130
- selenium-webdriver (4.9.1)
130
+ selenium-webdriver (4.10.0)
131
131
  rexml (~> 3.2, >= 3.2.5)
132
132
  rubyzip (>= 1.2.2, < 3.0)
133
133
  websocket (~> 1.0)
@@ -138,7 +138,7 @@ GEM
138
138
  concurrent-ruby (~> 1.0)
139
139
  unicode-display_width (2.4.2)
140
140
  websocket (1.2.9)
141
- websocket-driver (0.7.5)
141
+ websocket-driver (0.7.6)
142
142
  websocket-extensions (>= 0.1.0)
143
143
  websocket-extensions (0.1.5)
144
144
  xpath (3.2.0)
@@ -160,6 +160,7 @@ DEPENDENCIES
160
160
  rubocop (~> 1.7)
161
161
  rubocop-rails (~> 2.17.3)
162
162
  rubocop-rails_config
163
+ thor
163
164
 
164
165
  BUNDLED WITH
165
166
  2.3.11
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
require "bundler/setup"
require "erb"
require "fileutils"
require "forki"
require "thor"
8
+
9
# Thor command-line interface for scaffolding new sieves.
#
# Usage:
#   bin/generate_sieve generate post NAME --video
#   bin/generate_sieve generate post NAME --image
#   bin/generate_sieve generate user NAME
class GenerateSieve < Thor
  desc "generate TYPE NAME", "generate a TYPE named NAME"
  option :video
  option :image
  # Dispatches to the matching SieveGenerator method.
  #
  # Invalid input aborts the process: the message goes to stderr and the exit
  # status is non-zero, so shell scripts and CI can detect the failure.
  #
  # @param type [String] "post" or "user"
  # @param name [String] name of the sieve to generate
  def generate(type, name)
    case type
    when "post"
      style = if !options[:video].nil?
                :video
              elsif !options[:image].nil?
                :image
              end
      abort "Must indicate either video or image flag" if style.nil?

      SieveGenerator.generate_post_sieve(name, style)
    when "user"
      SieveGenerator.generate_user_sieve(name)
    else
      abort "Type must be `post` or `user` only. `#{type}` passed in."
    end
  end
end
35
+
36
# Scaffolds new sieve classes, their tests and empty JSON fixtures from the ERB
# templates in ./bin/generator_templates. Every path is relative to the current
# working directory, so the generator is meant to be run from the repository root.
class SieveGenerator
  SIEVES_DIRECTORY = "./lib/forki/scrapers/sieves"
  TESTS_DIRECTORY = "./test/sieves"
  TEMPLATES_DIRECTORY = "./bin/generator_templates"
  POST_STYLES = %i[video image].freeze

  # Writes the sieve source file, its test file and two empty fixture files
  # (`*_valid.json` / `*_invalid.json`) for a post sieve. Missing target
  # directories are created; existing files are overwritten.
  #
  # @param name [String] snake_case sieve name, e.g. "watch_tab"
  # @param style [Symbol] :video or :image
  # @raise [ArgumentError] if `style` is not one of POST_STYLES
  # @return [nil]
  def self.generate_post_sieve(name, style)
    unless POST_STYLES.include?(style)
      raise ArgumentError, "Unknown sieve style #{style.inspect}, expected one of #{POST_STYLES.inspect}"
    end

    puts "Generating post sieve named #{name} with style #{style}"

    sieve_file_name = "#{style}_sieve_#{name}"
    file_path = "#{SIEVES_DIRECTORY}/#{style}_sieves/#{sieve_file_name}.rb"
    test_path = "#{TESTS_DIRECTORY}/#{style}_sieves/#{sieve_file_name}_test.rb"
    test_data_directory = "#{TESTS_DIRECTORY}/#{style}_sieves/test_data"
    test_data_valid_path = "#{test_data_directory}/#{sieve_file_name}_valid.json"
    test_data_invalid_path = "#{test_data_directory}/#{sieve_file_name}_invalid.json"

    file_template = File.read("#{TEMPLATES_DIRECTORY}/#{style}_sieve_template.rb.erb")
    test_file_template = File.read("#{TEMPLATES_DIRECTORY}/#{style}_sieve_test_template.rb.erb")

    # The templates read `name` and `camel_name` through `binding`, so both
    # locals must keep exactly these names.
    camel_name = name.split("_").map(&:capitalize).join

    generated_files = {
      file_path => ERB.new(file_template).result(binding),
      test_path => ERB.new(test_file_template).result(binding),
      test_data_valid_path => "",
      test_data_invalid_path => ""
    }

    generated_files.each do |path, contents|
      # File.write does not create intermediate directories on its own.
      FileUtils.mkdir_p(File.dirname(path))
      File.write(path, contents)
    end

    nil
  end

  # Placeholder: user sieves are not generated yet, this only announces the request.
  #
  # @param name [String] name of the requested user sieve
  def self.generate_user_sieve(name)
    puts "Generating user sieve named #{name}"
  end
end
77
+
78
+ GenerateSieve.start(ARGV)
79
+
@@ -0,0 +1,63 @@
1
# NOTE: This is not implemented yet, just here for filler

# Skeleton image sieve generated by bin/generate_sieve. Replace the TODOs with
# the real extraction logic for this GraphQL layout.
class ImageSieve<%= camel_name %> < ImageSieve
  # To check if it's valid for the inputted graphql objects
  #
  # @param graphql_objects [Array] parsed GraphQL objects for the post
  # @return [Boolean] false when extraction raises or yields nothing
  def self.check(graphql_objects)
    image_object = extractor(graphql_objects)

    # TODO: replace with a real structural check, e.g. `image_object.key?("media")`
    !image_object.nil?
  rescue StandardError
    false
  end

  # Builds the post details hash. The keys below are the ones the existing
  # sieves return; fill every `nil` in from `image_object`.
  #
  # @param graphql_objects [Array] parsed GraphQL objects for the post
  # @return [Hash] the post details
  def self.sieve(graphql_objects)
    image_object = extractor(graphql_objects)

    # TODO: read both URLs from `image_object`. They are locals because they are
    # needed twice: once as plain values and once to retrieve the media files.
    video_preview_image_url = nil
    video_url = nil

    {
      id: nil,
      num_comments: nil,
      num_shared: nil,
      num_views: nil,
      reshare_warning: nil,
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil,
      created_at: nil,
      profile_link: nil,
      has_video: false,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: nil
    }
  end

  # Picks the part of the GraphQL payload this sieve works on.
  #
  # @param graphql_objects [Array] parsed GraphQL objects for the post
  def self.extractor(graphql_objects)
    image_objects = graphql_objects.filter do |go|
      # go = go.first if go.kind_of?(Array) && !go.empty?
      # go.has_key?("image")
    end

    # image_objects.first.dig("image", "creation_story")
  end
  # A bare `private` does not apply to `def self.` methods, so hide it explicitly.
  private_class_method :extractor
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ # rubocop:disable Metrics/ClassLength
6
# Generated test skeleton for ImageSieve<%= camel_name %>. The fixtures are
# created empty by the generator and must be filled with real GraphQL payloads
# before these tests can pass.
class ImageSieve<%= camel_name %>Test < Minitest::Test
  def setup
    @valid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_valid.json"))
    @invalid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_invalid.json"))
  end

  def test_sieve_properly_fails_check
    assert_equal false, ImageSieve<%= camel_name %>.check(@invalid_json)
  end

  def test_sieve_properly_passes_check
    assert ImageSieve<%= camel_name %>.check(@valid_json)
  end

  def test_sieve_can_sieve_properly
    result = ImageSieve<%= camel_name %>.sieve(@valid_json)

    # TODO: Update the values for the post you're testing
    # MAINTAINER TODO: FIX THIS FOR IMAGES
    assert_equal "394367115960503", result[:id]
    assert_equal 173, result[:num_comments]
    assert_nil result[:num_shared]
    assert_nil result[:num_views]
    assert_equal false, result[:reshare_warning]
    refute_nil result[:video_preview_image_url]
    refute_nil result[:video_url]
    assert_nil result[:text]
    assert_equal 1654989063, result[:created_at]
    assert_nil result[:profile_link]
    assert_equal false, result[:has_video]
    refute_nil result[:video_preview_image_file]
    refute_nil result[:video_file]
    refute_nil result[:reactions]

    assert_kind_of Array, result[:reactions]
  end
end
@@ -0,0 +1,61 @@
1
# Skeleton video sieve generated by bin/generate_sieve. Replace the TODOs with
# the real extraction logic for this GraphQL layout.
class VideoSieve<%= camel_name %> < VideoSieve
  # To check if it's valid for the inputted graphql objects
  #
  # @param graphql_objects [Array] parsed GraphQL objects for the post
  # @return [Boolean] false when extraction raises or yields nothing
  def self.check(graphql_objects)
    video_object = extractor(graphql_objects)

    # TODO: replace with a real structural check, e.g. `video_object.key?("media")`
    !video_object.nil?
  rescue StandardError
    false
  end

  # Builds the post details hash. The keys below are the ones the existing
  # sieves return; fill every `nil` in from `video_object`.
  #
  # @param graphql_objects [Array] parsed GraphQL objects for the post
  # @return [Hash] the post details
  def self.sieve(graphql_objects)
    video_object = extractor(graphql_objects)

    # TODO: read both URLs from `video_object`, e.g.
    #   video_object["preferred_thumbnail"]["image"]["uri"]
    #   video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"]
    # They are locals because they are needed twice: once as plain values and
    # once to retrieve the media files.
    video_preview_image_url = nil
    video_url = nil

    {
      id: nil,
      num_comments: nil,
      num_shared: nil,
      num_views: nil,
      reshare_warning: nil,
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil,
      created_at: nil,
      profile_link: nil,
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: nil
    }
  end

  # Picks the part of the GraphQL payload this sieve works on.
  #
  # @param graphql_objects [Array] parsed GraphQL objects for the post
  def self.extractor(graphql_objects)
    video_objects = graphql_objects.filter do |go|
      # go = go.first if go.kind_of?(Array) && !go.empty?
      # go.has_key?("video")
    end

    # video_objects.first.dig("video", "creation_story")
  end
  # A bare `private` does not apply to `def self.` methods, so hide it explicitly.
  private_class_method :extractor
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
+ # rubocop:disable Metrics/ClassLength
6
# Generated test skeleton for VideoSieve<%= camel_name %>. The fixtures are
# created empty by the generator and must be filled with real GraphQL payloads
# before these tests can pass.
class VideoSieve<%= camel_name %>Test < Minitest::Test
  def setup
    @valid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_valid.json"))
    @invalid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_invalid.json"))
  end

  def test_sieve_properly_fails_check
    assert_equal false, VideoSieve<%= camel_name %>.check(@invalid_json)
  end

  def test_sieve_properly_passes_check
    assert VideoSieve<%= camel_name %>.check(@valid_json)
  end

  def test_sieve_can_sieve_properly
    result = VideoSieve<%= camel_name %>.sieve(@valid_json)

    # TODO: Update the values for the post you're testing
    assert_equal "394367115960503", result[:id]
    assert_equal 173, result[:num_comments]
    assert_nil result[:num_shared]
    assert_nil result[:num_views]
    assert_equal false, result[:reshare_warning]
    refute_nil result[:video_preview_image_url]
    refute_nil result[:video_url]
    assert_nil result[:text]
    assert_equal 1654989063, result[:created_at]
    assert_nil result[:profile_link]
    assert_equal true, result[:has_video]
    refute_nil result[:video_preview_image_file]
    refute_nil result[:video_file]
    refute_nil result[:reactions]

    assert_kind_of Array, result[:reactions]
  end
end
data/forki.gemspec CHANGED
@@ -37,6 +37,8 @@ Gem::Specification.new do |spec|
37
37
  spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
38
38
  spec.add_dependency "selenium-webdriver" # Webdriver selenium
39
39
 
40
+ spec.add_development_dependency "thor" # For the generator
41
+
40
42
  # For more information and examples about making a new gem, checkout our
41
43
  # guide at: https://bundler.io/guides/creating_gem.html
42
44
  end
@@ -4,6 +4,7 @@ require "typhoeus"
4
4
  require "securerandom"
5
5
  require "byebug"
6
6
 
7
+
7
8
  module Forki
8
9
  # rubocop:disable Metrics/ClassLength
9
10
  class PostScraper < Scraper
@@ -154,21 +155,30 @@ module Forki
154
155
  end
155
156
 
156
157
  graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
158
+
159
+ # Once in awhile it's really easy
160
+ video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
161
+
162
+ if VideoSieve.can_process_with_sieve?(graphql_object_array)
163
+ # Eventually all of this complexity will be replaced with this
164
+ return VideoSieve.sieve_for_graphql_objects(graphql_object_array)
165
+ end
166
+
157
167
  story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
158
168
  story_node_object = story_node_object || graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video
159
169
 
160
170
  return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
161
171
 
162
172
  if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
163
- video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
164
- creation_date = video_object["publish_time"]
165
- # creation_date = video_object["video"]["publish_time"]
173
+ video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
174
+ creation_date = video_object["publish_time"] if video_object&.has_key("publish_time")
175
+ creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"] if creation_date.nil?
166
176
  elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
167
177
  # For "Reels" we need a separate way to parse this
168
178
  video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
169
179
  creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["creation_time"]
170
180
  else
171
- raise "Unable to parse video object"
181
+ raise "Unable to parse video object" if video_objects.empty?
172
182
  end
173
183
 
174
184
  feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
@@ -191,7 +201,7 @@ module Forki
191
201
  num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
192
202
  reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
193
203
  video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
194
- video_url: video_object["playable_url_quality_hd"] || video_object["playable_url"],
204
+ video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
195
205
  text: text,
196
206
  created_at: creation_date,
197
207
  profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
@@ -217,7 +227,7 @@ module Forki
217
227
  num_views: feedback_object["video_view_count"],
218
228
  reshare_warning: feedback_object["should_show_reshare_warning"],
219
229
  video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
220
- video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
230
+ video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["browser_native_hd_url"] || video_object["video"]["browser_native_sd_url"] || video_object["video"]["playable_url"],
221
231
  text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
222
232
  created_at: video_object["video"]["publish_time"],
223
233
  profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
@@ -369,6 +379,7 @@ module Forki
369
379
  def parse(url)
370
380
  validate_and_load_page(url)
371
381
  graphql_strings = find_graphql_data_strings(page.html)
382
+
372
383
  post_data = extract_post_data(graphql_strings)
373
384
  post_data[:url] = url
374
385
  user_url = post_data[:profile_link]
@@ -398,3 +409,6 @@ module Forki
398
409
  end
399
410
  end
400
411
  end
412
+
413
+ require_relative "sieves/video_sieves/video_sieve"
414
+
@@ -7,6 +7,7 @@ require "oj"
7
7
  require "selenium-webdriver"
8
8
  require "open-uri"
9
9
  require "selenium/webdriver/remote/http/curb"
10
+ require "cgi"
10
11
 
11
12
  options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
12
13
  options.add_argument("--start-maximized")
@@ -190,9 +191,21 @@ module Forki
190
191
 
191
192
  visit "https://www.facebook.com"
192
193
  login
194
+
193
195
  visit url unless current_url.start_with?(url)
196
+ # # If the video is a watch page it doesn't have most of the data we want so we click on the video
197
+ # if url.include?("watch/live")
198
+ # clickable_element = find("video")
199
+
200
+ # while(clickable_element.obscured?)
201
+ # clickable_element = clickable_element.find(:xpath, "..")
202
+ # end
203
+
204
+ # clickable_element.click
205
+ # end
194
206
  end
195
207
 
208
+
196
209
  # Extracts an integer out of a string describing a number
197
210
  # e.g. "4K Comments" returns 4000
198
211
  # e.g. "131 Shares" returns 131
@@ -203,19 +216,23 @@ module Forki
203
216
  element = element.text(:all)
204
217
  end
205
218
 
206
- num_pattern = /[0-9KM ,.]+/
207
- interaction_num_text = num_pattern.match(element)[0]
208
-
209
- if interaction_num_text.include?(".") # e.g. "2.2K"
210
- interaction_num_text.to_i + interaction_num_text[-2].to_i * 100
211
- elsif interaction_num_text.include?("K") # e.g. "13K"
212
- interaction_num_text.to_i * 1000
213
- elsif interaction_num_text.include?("M") # e.g. "13M"
214
- interaction_num_text.to_i * 1_000_000
215
- else # e.g. "15,443"
216
- interaction_num_text.delete!(",")
217
- interaction_num_text.delete(" ").to_i
219
+ # Check if there's a modifier i.e. `K` or `M` if there isn't just return the number
220
+ unless element.include?("K") || element.include?("M")
221
+ element.delete(",") # "5,456" e.g.
222
+ return element.to_i
218
223
  end
224
+
225
+ modifier = element[-1]
226
+ number = element[0...-1].to_f
227
+
228
+ case modifier
229
+ when "K"
230
+ number = number * 1_000
231
+ when "M"
232
+ number = number * 1_000_000
233
+ end
234
+
235
+ number.to_i
219
236
  end
220
237
  end
221
238
  end
@@ -0,0 +1,25 @@
1
# Entry point for image sieves: finds the first registered sieve class whose
# `check` accepts a set of GraphQL objects and delegates the parsing to it.
class ImageSieve
  # @param graphql_objects [Array] parsed GraphQL objects for a post
  # @return [Boolean] true when some registered sieve accepts the objects
  def self.can_process_with_sieve?(graphql_objects)
    !sieve_class_for_graphql_objects(graphql_objects).nil?
  end

  # @param graphql_objects [Array] parsed GraphQL objects for a post
  # @return [Object, nil] whatever the matching sieve's `sieve` returns, or nil
  #   when no registered sieve accepts the objects
  def self.sieve_for_graphql_objects(graphql_objects)
    sieve = sieve_class_for_graphql_objects(graphql_objects)
    return nil if sieve.nil?

    sieve.sieve(graphql_objects)
  end

  # Returns the first registered sieve class whose `check` is truthy.
  # No image sieves are registered yet, so this currently always returns nil.
  def self.sieve_class_for_graphql_objects(graphql_objects)
    sieves = []
    sieves.find { |sieve| sieve.check(graphql_objects) }
  end
  # A bare `private` does not apply to `def self.` methods, so hide it explicitly.
  private_class_method :sieve_class_for_graphql_objects
end
21
+
22
+
23
# Load every concrete image sieve that sits next to this file. The glob is
# anchored on `__dir__` rather than the process working directory so the sieves
# are also found when the gem is used from outside the repository root.
Dir[File.join(__dir__, "*.rb")].sort.each do |file|
  # Skip this file itself; everything else in the directory is a sieve.
  require file unless File.basename(file) == File.basename(__FILE__)
end
@@ -0,0 +1,24 @@
1
# Entry point for video sieves: finds the first registered sieve class whose
# `check` accepts a set of GraphQL objects and delegates the parsing to it.
class VideoSieve
  # @param graphql_objects [Array] parsed GraphQL objects for a post
  # @return [Boolean] true when some registered sieve accepts the objects
  def self.can_process_with_sieve?(graphql_objects)
    !sieve_class_for_graphql_objects(graphql_objects).nil?
  end

  # @param graphql_objects [Array] parsed GraphQL objects for a post
  # @return [Object, nil] whatever the matching sieve's `sieve` returns, or nil
  #   when no registered sieve accepts the objects
  def self.sieve_for_graphql_objects(graphql_objects)
    sieve = sieve_class_for_graphql_objects(graphql_objects)
    return nil if sieve.nil?

    sieve.sieve(graphql_objects)
  end

  # Returns the first registered sieve class whose `check` is truthy. The list
  # is built at call time because the subclasses are only loaded after this
  # class has been defined; order matters, the first match wins.
  def self.sieve_class_for_graphql_objects(graphql_objects)
    sieves = [VideoSieveWatchTab, VideoSieveVideoPage]
    sieves.find { |sieve| sieve.check(graphql_objects) }
  end
  # A bare `private` does not apply to `def self.` methods, so hide it explicitly.
  private_class_method :sieve_class_for_graphql_objects
end
21
+
22
# Load every concrete video sieve that sits next to this file. The glob is
# anchored on `__dir__` rather than the process working directory so the sieves
# are also found when the gem is used from outside the repository root.
Dir[File.join(__dir__, "*.rb")].sort.each do |file|
  # Skip this file itself; everything else in the directory is a sieve.
  require file unless File.basename(file) == File.basename(__FILE__)
end
@@ -0,0 +1,66 @@
1
# Sieve for payloads that contain an object with a "node" key whose
# "comet_sections" story carries the video as the first attachment's media.
class VideoSieveVideoPage < VideoSieve
  # To check if it's valid for the inputted graphql objects
  #
  # @param graphql_objects [Array<Hash>] parsed GraphQL objects for the post
  # @return [Boolean] true when the first attachment of the story has a "media"
  #   key; false when it does not or when any lookup along the way raises
  def self.check(graphql_objects)
    comet_sections = extractor(graphql_objects) # raises when there is no "node" object
    attachment = comet_sections["content"]["story"]["attachments"].first["styles"]["attachment"]

    attachment.key?("media")
  rescue StandardError
    false
  end

  # Builds the post details hash from the story, video and feedback objects.
  #
  # @param graphql_objects [Array<Hash>] parsed GraphQL objects for the post
  # @return [Hash] post details with the keys :id, :num_comments, :num_shared,
  #   :num_views, :reshare_warning, :video_preview_image_url, :video_url, :text,
  #   :created_at, :profile_link, :has_video, :video_preview_image_file,
  #   :video_file and :reactions
  def self.sieve(graphql_objects)
    comet_sections = extractor(graphql_objects)

    story_object = comet_sections["content"]["story"]
    video_object = story_object["attachments"].first["styles"]["attachment"]["media"]
    feedback_object = comet_sections["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]

    video_preview_image_url = video_object["preferred_thumbnail"]["image"]["uri"]
    # Prefer the HD rendition and fall back to SD when it is missing.
    video_url = video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"]

    {
      id: video_object["id"],
      num_comments: feedback_object["total_comment_count"],
      # `dig` so that a post without a share count yields nil instead of raising
      num_shared: feedback_object.dig("share_count", "count"),
      num_views: nil,
      reshare_warning: feedback_object["should_show_reshare_warning"],
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      # `dig` so that a post without a message yields nil instead of raising
      text: story_object.dig("message", "text"),
      created_at: video_object["publish_time"],
      profile_link: story_object["actors"].first["url"],
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
    }
  end

  # Returns the "comet_sections" of the first object that has a "node" key.
  # Raises NoMethodError when no such object exists; `check` relies on that.
  #
  # @param graphql_objects [Array<Hash>] parsed GraphQL objects for the post
  def self.extractor(graphql_objects)
    story_node_object = graphql_objects.find { |graphql_object| graphql_object.key?("node") }&.fetch("node", nil) # user posted video
    story_node_object["comet_sections"]
  end
  # A bare `private` does not apply to `def self.` methods, so hide it explicitly.
  private_class_method :extractor
end
@@ -0,0 +1,91 @@
1
# This is for the "watch" tab style videos https://www.facebook.com/watch/live/?v=394367115960503
class VideoSieveWatchTab < VideoSieve
  # To check if it's valid for the inputted graphql objects
  #
  # @param graphql_objects [Array] parsed GraphQL objects; elements may be
  #   hashes or arrays wrapping a hash
  # @return [Boolean] true when a video story is found and its first attachment
  #   is a hash with a "media" key; false otherwise or when a lookup raises
  def self.check(graphql_objects)
    story = extractor(graphql_objects)
    return false if story.nil?

    attachments = story["attachments"]
    return false unless attachments.is_a?(Array) && !attachments.empty?

    first_attachment = attachments.first
    first_attachment.is_a?(Hash) && first_attachment.key?("media")
  rescue StandardError
    false
  end

  # Builds the post details hash from the video story.
  #
  # @param graphql_objects [Array] parsed GraphQL objects for the post
  # @return [Hash] post details with the keys :id, :num_comments, :num_shared,
  #   :num_views, :reshare_warning, :video_preview_image_url, :video_url, :text,
  #   :created_at, :profile_link, :has_video, :video_preview_image_file,
  #   :video_file and :reactions
  # @raise [RuntimeError] when no feedback object can be located
  def self.sieve(graphql_objects)
    story = extractor(graphql_objects)
    media = story["attachments"].first["media"]

    # Prefer the HD rendition when present, as the other video parsers do.
    video_url = media["browser_native_hd_url"] || media["browser_native_sd_url"]
    video_preview_image_url = media["preferred_thumbnail"]["image"]["uri"]

    feedback_object = feedback_for(story, graphql_objects)
    raise "Unable to find feedback object for watch tab video" if feedback_object.nil?

    # `dig` so that a missing "owner" reaches the fallback below instead of raising.
    profile_link = media.dig("owner", "url")
    if profile_link.nil?
      filtered_json = graphql_objects.find { |go| go.is_a?(Hash) && go.key?("attachments") }
      profile_link = filtered_json["attachments"].first["media"]["creation_story"]["comet_sections"]["title"]["story"]["actors"].first["url"]
    end

    {
      id: story.dig("shareable", "id") || media["id"],
      num_comments: feedback_object["total_comment_count"],
      num_shared: nil, # This is not associated with these videos in this format
      num_views: nil, # This is not associated with these videos in this format
      reshare_warning: feedback_object["should_show_reshare_warning"],
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil, # There is no text associated with these videos
      created_at: media["publish_time"],
      profile_link: profile_link,
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
    }
  end

  # Returns the story of the first object that has a "video" key: its
  # "creation_story" when present, otherwise its "story". Array elements are
  # unwrapped to their first entry before being inspected AND before being read,
  # so wrapped objects work as well. Returns nil when nothing matches.
  #
  # @param graphql_objects [Array] parsed GraphQL objects for the post
  # @return [Hash, nil]
  def self.extractor(graphql_objects)
    unwrapped = graphql_objects.map { |go| go.is_a?(Array) ? go.first : go }
    video_holder = unwrapped.find { |go| go.is_a?(Hash) && go.key?("video") }
    return nil if video_holder.nil?

    video_holder.dig("video", "creation_story") || video_holder.dig("video", "story")
  end

  # Returns the feedback object: the story's own feedback context when it has
  # one, otherwise the "feedback" of the first top-level hash that carries a
  # total comment count. Returns nil when neither exists.
  def self.feedback_for(story, graphql_objects)
    feedback_context = story["feedback_context"]
    return feedback_context["feedback_target_with_context"] unless feedback_context.nil?

    holder = graphql_objects.find { |go| go.is_a?(Hash) && !go.dig("feedback", "total_comment_count").nil? }
    holder && holder["feedback"]
  end

  # A bare `private` does not apply to `def self.` methods, so hide them explicitly.
  private_class_method :extractor, :feedback_for
end
@@ -3,10 +3,14 @@ require "typhoeus"
3
3
  module Forki
4
4
  class UserScraper < Scraper
5
5
  # Finds and returns the number of people who like the current page
6
- def find_number_of_likes
7
- likes_pattern = /[0-9,.KM ] people like this/
8
- number_of_likes_elem = all("span").filter { | span| likes_pattern.match? span.text }.first
9
- extract_int_from_num_element(number_of_likes_elem)
6
# Extracts the like count from text such as "1.2K likes" or "15,443 Likes".
#
# @param profile_details_string [String] text to search for a likes figure
# @return [Object, nil] the result of `extract_int_from_num_element` for the
#   captured figure, or nil when the text mentions no likes
def find_number_of_likes(profile_details_string)
  likes_pattern = /(?<num_likes>[0-9,.KM ]+) (l|L)ikes/
  number_of_likes_match = likes_pattern.match(profile_details_string)

  return nil if number_of_likes_match.nil?

  extract_int_from_num_element(number_of_likes_match[:num_likes])
end
11
15
 
12
16
  # Finds and returns the number of people who follow the current page
@@ -14,8 +18,18 @@ module Forki
14
18
  followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]) people/
15
19
  alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
16
20
  number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
21
+
17
22
  return nil if number_of_followers_match.nil?
18
- extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
23
+
24
+ number_of_followers = extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
25
+
26
+ # Note, this is sticking around if we want to use it later
27
+ # if number_of_followers.nil?
28
+ # number_of_followers_string = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]["user"]["profile_social_context"]["content"].first["text"]["text"]
29
+ # number_of_followers = extract_int_from_num_element(number_of_followers_string)
30
+ # end
31
+
32
+ number_of_followers
19
33
  end
20
34
 
21
35
  def find_number_followers_for_normal_profile(profile_followers_node)
@@ -61,6 +75,7 @@ module Forki
61
75
  verified: profile_header_obj["user"]["is_verified"],
62
76
  profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
63
77
  profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
78
+ number_of_likes: find_number_of_likes(profile_header_str),
64
79
  }
65
80
  end
66
81
 
data/lib/forki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Forki
4
- VERSION = "0.1.4"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ''
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-06-20 00:00:00.000000000 Z
11
+ date: 2023-08-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: thor
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  description:
84
98
  email:
85
99
  - ''
@@ -99,12 +113,21 @@ files:
99
113
  - README.md
100
114
  - Rakefile
101
115
  - bin/console
116
+ - bin/generate_sieve
117
+ - bin/generator_templates/image_sieve_template.rb.erb
118
+ - bin/generator_templates/image_sieve_test_template.rb.erb
119
+ - bin/generator_templates/video_sieve_template.rb.erb
120
+ - bin/generator_templates/video_sieve_test_template.rb.erb
102
121
  - bin/setup
103
122
  - forki.gemspec
104
123
  - lib/forki.rb
105
124
  - lib/forki/post.rb
106
125
  - lib/forki/scrapers/post_scraper.rb
107
126
  - lib/forki/scrapers/scraper.rb
127
+ - lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb
128
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve.rb
129
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb
130
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb
108
131
  - lib/forki/scrapers/user_scraper.rb
109
132
  - lib/forki/user.rb
110
133
  - lib/forki/version.rb