forki 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a0667614af238aeb8089c1af60794918c0986b7f5e12fa6fe96d33a2a5c1a06f
4
- data.tar.gz: 5668a5a6056bdf9bdd9c9fd8f119fa5ed754c5ee0b152d2764b26e9f9d7a8804
3
+ metadata.gz: 7a1b2f6a831ebac1bf9e79cc33818aa3d8f638459f538ff4f0cb92bab55b16df
4
+ data.tar.gz: 1df3b090db0ba37ecfbcf17933d41334620e9542a75f3328d9a335adec5d1ae9
5
5
  SHA512:
6
- metadata.gz: 509ffb30dada2666236ed0900e1def8a61413ee3ba2e8705e31bd01422f8d2baf2cd24479faf906b6374602fcb0c1dc7bb334b7e02a560c798eba6efab30a2c9
7
- data.tar.gz: 3f79b804b5505222e06a49352807d92c95f13a8735bedcd1b32cbf079be72b0922052014ef82da8771db108f7b0b674cd132caf7d377084c57cf0d245eb049e1
6
+ metadata.gz: b65c46157a6d1f320345d0216d138239c9d73cb29fd7fc0fa504655e0c789021a558d2e75a0447f026dc694e4fb95765cf12f8b2350e147b1011e670cd40621f
7
+ data.tar.gz: bc62a4decd0205e75ac93038d5768ff4d9ec55d8a25980dbb8522b3b2296f84e50db7caa3ba8aeb28d61eb482957b4d4bf68a9d9658cc26f4209198a64b1fe04
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- forki (0.1.1)
4
+ forki (0.1.4)
5
5
  apparition
6
6
  capybara
7
7
  oj
@@ -37,7 +37,7 @@ GEM
37
37
  ast (2.4.2)
38
38
  builder (3.2.4)
39
39
  byebug (11.1.3)
40
- capybara (3.39.1)
40
+ capybara (3.39.2)
41
41
  addressable
42
42
  matrix
43
43
  mini_mime (>= 0.1.3)
@@ -66,11 +66,11 @@ GEM
66
66
  minitest (5.18.0)
67
67
  nokogiri (1.15.1-arm64-darwin)
68
68
  racc (~> 1.4)
69
- oj (3.14.3)
69
+ oj (3.15.1)
70
70
  parallel (1.23.0)
71
71
  parser (3.2.2.1)
72
72
  ast (~> 2.4.1)
73
- public_suffix (5.0.1)
73
+ public_suffix (5.0.3)
74
74
  racc (1.6.2)
75
75
  rack (2.2.4)
76
76
  rack-test (2.1.0)
@@ -127,7 +127,7 @@ GEM
127
127
  rubocop-rails (~> 2.0)
128
128
  ruby-progressbar (1.13.0)
129
129
  rubyzip (2.3.2)
130
- selenium-webdriver (4.9.1)
130
+ selenium-webdriver (4.10.0)
131
131
  rexml (~> 3.2, >= 3.2.5)
132
132
  rubyzip (>= 1.2.2, < 3.0)
133
133
  websocket (~> 1.0)
@@ -138,7 +138,7 @@ GEM
138
138
  concurrent-ruby (~> 1.0)
139
139
  unicode-display_width (2.4.2)
140
140
  websocket (1.2.9)
141
- websocket-driver (0.7.5)
141
+ websocket-driver (0.7.6)
142
142
  websocket-extensions (>= 0.1.0)
143
143
  websocket-extensions (0.1.5)
144
144
  xpath (3.2.0)
@@ -160,6 +160,7 @@ DEPENDENCIES
160
160
  rubocop (~> 1.7)
161
161
  rubocop-rails (~> 2.17.3)
162
162
  rubocop-rails_config
163
+ thor
163
164
 
164
165
  BUNDLED WITH
165
166
  2.3.11
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "bundler/setup"
5
+ require "forki"
6
+ require "thor"
7
+ require "erb"
8
+
9
# Thor command-line front end for the sieve generator.
#
# Invoked as `generate TYPE NAME [--video | --image]`, where TYPE is either
# `post` or `user`. The actual file generation is delegated to SieveGenerator.
class GenerateSieve < Thor
  # Make Thor itself exit with a non-zero status when a command is misused.
  def self.exit_on_failure?
    true
  end

  desc "generate TYPE NAME", "generate a TYPE named NAME"
  # Boolean switches: an untyped (string) option would consume the following
  # positional argument as its value when the flag is given first.
  option :video, type: :boolean, desc: "Generate a video post sieve"
  option :image, type: :boolean, desc: "Generate an image post sieve"
  # Generates the skeleton files for a new sieve.
  #
  # @param type [String] "post" or "user"
  # @param name [String] snake_case name of the new sieve
  def generate(type, name)
    case type
    when "post"
      # --video takes precedence when both flags are passed.
      style = if options[:video]
        :video
      elsif options[:image]
        :image
      end

      if style.nil?
        warn "Must indicate either video or image flag"
        exit 1 # signal the failure to the calling shell
      end

      SieveGenerator.generate_post_sieve(name, style)
    when "user"
      SieveGenerator.generate_user_sieve(name)
    else
      warn "Type must be `post` or `user` only. `#{type}` passed in."
      exit 1
    end
  end
end
35
+
36
require "erb"
require "fileutils"

# Writes the skeleton files for a new sieve: the sieve class, its test, and two
# empty JSON fixtures (valid / invalid).
#
# All paths are relative to the current working directory, so the generator is
# expected to be run from the project root.
class SieveGenerator
  SIEVE_ROOT = "./lib/forki/scrapers/sieves".freeze
  TEST_ROOT = "./test/sieves".freeze
  TEMPLATE_ROOT = "./bin/generator_templates".freeze
  POST_STYLES = %i[video image].freeze

  # Generates a post sieve from the ERB templates for the given style.
  #
  # @param name [String] snake_case sieve name, e.g. "watch_tab"
  # @param style [Symbol] :video or :image
  # @raise [ArgumentError] if style is not one of POST_STYLES
  def self.generate_post_sieve(name, style)
    unless POST_STYLES.include?(style)
      raise ArgumentError, "style must be one of #{POST_STYLES.inspect}, got #{style.inspect}"
    end

    puts "Generating post sieve named #{name} with style #{style}"

    prefix = "#{style}_sieve" # e.g. "video_sieve"
    sieve_dir = File.join(SIEVE_ROOT, "#{prefix}s")
    test_dir = File.join(TEST_ROOT, "#{prefix}s")
    test_data_dir = File.join(test_dir, "test_data")

    file_contents = ERB.new(File.read(File.join(TEMPLATE_ROOT, "#{prefix}_template.rb.erb")))
    test_file_contents = ERB.new(File.read(File.join(TEMPLATE_ROOT, "#{prefix}_test_template.rb.erb")))

    # NOTE: the templates are rendered against this method's binding and read
    # the locals `name` and `camel_name`, so neither may be renamed.
    camel_name = name.split("_").map(&:capitalize).join

    # File.write does not create missing parent directories.
    FileUtils.mkdir_p([sieve_dir, test_data_dir])

    File.write(File.join(sieve_dir, "#{prefix}_#{name}.rb"), file_contents.result(binding))
    File.write(File.join(test_dir, "#{prefix}_#{name}_test.rb"), test_file_contents.result(binding))

    # Empty fixtures for the developer to fill with captured GraphQL JSON.
    File.write(File.join(test_data_dir, "#{prefix}_#{name}_valid.json"), "")
    File.write(File.join(test_data_dir, "#{prefix}_#{name}_invalid.json"), "")
  end

  # Placeholder: user sieve generation only announces itself for now.
  #
  # @param name [String] snake_case sieve name
  def self.generate_user_sieve(name)
    puts "Generating user sieve named #{name}"
  end
end
77
+
78
+ GenerateSieve.start(ARGV)
79
+
@@ -0,0 +1,63 @@
1
+ # NOTE: This is not implemented yet, just here for filler
2
+
3
# Generated sieve skeleton for image posts. Fill in the TODOs below.
class ImageSieve<%= camel_name %> < ImageSieve
  # To check if it's valid for the inputted graphql objects.
  #
  # @param graphql_objects [Array] parsed GraphQL payloads scraped from the page
  # @return [Boolean] false if extraction raises, true otherwise
  def self.check(graphql_objects)
    extractor(graphql_objects)

    true
  rescue StandardError
    false
  end

  # Builds the post details hash. The expected output format is:
  #
  # post_details = {
  #   id: video_object["id"],
  #   num_comments: num_comments,
  #   num_shared: share_count_object.fetch("count", nil),
  #   num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
  #   reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
  #   video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
  #   video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
  #   text: text,
  #   created_at: creation_date,
  #   profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
  #   has_video: true
  # }
  # post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  # post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # post_details[:reactions] = reaction_counts
  #
  # @param graphql_objects [Array] parsed GraphQL payloads scraped from the page
  # @return [Hash] the post details
  def self.sieve(graphql_objects)
    image_object = extractor(graphql_objects) # TODO: read the values below out of this object

    # Declared up front because they are used twice in the hash below.
    video_preview_image_url = nil # TODO: extract from image_object
    video_url = nil # TODO: extract from image_object

    {
      id: nil,
      num_comments: nil,
      num_shared: nil,
      num_views: nil,
      reshare_warning: nil,
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil,
      created_at: nil,
      profile_link: nil,
      has_video: false,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: nil
    }
  end

  # Internal helper that digs the relevant object out of the GraphQL payloads.
  # (A bare `private` does not apply to `def self.` methods, so it is omitted.)
  def self.extractor(graphql_objects)
    image_objects = graphql_objects.filter do |go|
      # go = go.first if go.kind_of?(Array) && !go.empty?
      # go.has_key?("image")
    end

    # image_objects.first.dig("image", "creation_story")
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# rubocop:disable Metrics/ClassLength
# Generated test skeleton: update the expected values for the post captured in
# the valid/invalid JSON fixtures.
class ImageSieve<%= camel_name %>Test < Minitest::Test
  def setup
    @valid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_valid.json"))
    @invalid_json = JSON.parse(File.read("test/sieves/image_sieves/test_data/image_sieve_<%= name %>_invalid.json"))
  end

  def test_sieve_properly_fails_check
    assert_equal false, ImageSieve<%= camel_name %>.check(@invalid_json)
  end

  def test_sieve_properly_passes_check
    assert ImageSieve<%= camel_name %>.check(@valid_json)
  end

  def test_sieve_can_sieve_properly
    result = ImageSieve<%= camel_name %>.sieve(@valid_json)

    # TODO: Update the values for the post you're testing
    # MAINTAINER TODO: FIX THIS FOR IMAGES
    assert_equal "394367115960503", result[:id]
    assert_equal 173, result[:num_comments]
    assert_nil result[:num_shared]
    assert_nil result[:num_views]
    assert_equal false, result[:reshare_warning]
    refute_nil result[:video_preview_image_url]
    refute_nil result[:video_url]
    assert_nil result[:text]
    assert_equal 1654989063, result[:created_at]
    assert_nil result[:profile_link]
    assert_equal false, result[:has_video]
    refute_nil result[:video_preview_image_file]
    refute_nil result[:video_file]
    refute_nil result[:reactions]

    assert_kind_of Array, result[:reactions]
  end
end
# rubocop:enable Metrics/ClassLength
@@ -0,0 +1,61 @@
1
# Generated sieve skeleton for video posts. Fill in the TODOs below.
class VideoSieve<%= camel_name %> < VideoSieve
  # To check if it's valid for the inputted graphql objects.
  #
  # @param graphql_objects [Array] parsed GraphQL payloads scraped from the page
  # @return [Boolean] false if extraction raises, true otherwise
  def self.check(graphql_objects)
    extractor(graphql_objects)

    true
  rescue StandardError
    false
  end

  # Builds the post details hash. The expected output format is:
  #
  # post_details = {
  #   id: video_object["id"],
  #   num_comments: num_comments,
  #   num_shared: share_count_object.fetch("count", nil),
  #   num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
  #   reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
  #   video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
  #   video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
  #   text: text,
  #   created_at: creation_date,
  #   profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
  #   has_video: true
  # }
  # post_details[:video_preview_image_file] = Forki.retrieve_media(post_details[:video_preview_image_url])
  # post_details[:video_file] = Forki.retrieve_media(post_details[:video_url])
  # post_details[:reactions] = reaction_counts
  #
  # @param graphql_objects [Array] parsed GraphQL payloads scraped from the page
  # @return [Hash] the post details
  def self.sieve(graphql_objects)
    video_object = extractor(graphql_objects) # TODO: read the values below out of this object

    # Declared up front because they are used twice in the hash below.
    video_preview_image_url = nil # TODO: extract from video_object
    video_url = nil # TODO: extract from video_object

    {
      id: nil,
      num_comments: nil,
      num_shared: nil,
      num_views: nil,
      reshare_warning: nil,
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil,
      created_at: nil,
      profile_link: nil,
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: nil
    }
  end

  # Internal helper that digs the relevant object out of the GraphQL payloads.
  # (A bare `private` does not apply to `def self.` methods, so it is omitted.)
  def self.extractor(graphql_objects)
    video_objects = graphql_objects.filter do |go|
      # go = go.first if go.kind_of?(Array) && !go.empty?
      # go.has_key?("video")
    end

    # video_objects.first.dig("video", "creation_story")
  end
end
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+
5
# rubocop:disable Metrics/ClassLength
# Generated test skeleton: update the expected values for the post captured in
# the valid/invalid JSON fixtures.
class VideoSieve<%= camel_name %>Test < Minitest::Test
  def setup
    @valid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_valid.json"))
    @invalid_json = JSON.parse(File.read("test/sieves/video_sieves/test_data/video_sieve_<%= name %>_invalid.json"))
  end

  def test_sieve_properly_fails_check
    assert_equal false, VideoSieve<%= camel_name %>.check(@invalid_json)
  end

  def test_sieve_properly_passes_check
    assert VideoSieve<%= camel_name %>.check(@valid_json)
  end

  def test_sieve_can_sieve_properly
    result = VideoSieve<%= camel_name %>.sieve(@valid_json)

    # TODO: Update the values for the post you're testing
    assert_equal "394367115960503", result[:id]
    assert_equal 173, result[:num_comments]
    assert_nil result[:num_shared]
    assert_nil result[:num_views]
    assert_equal false, result[:reshare_warning]
    refute_nil result[:video_preview_image_url]
    refute_nil result[:video_url]
    assert_nil result[:text]
    assert_equal 1654989063, result[:created_at]
    assert_nil result[:profile_link]
    assert_equal true, result[:has_video]
    refute_nil result[:video_preview_image_file]
    refute_nil result[:video_file]
    refute_nil result[:reactions]

    assert_kind_of Array, result[:reactions]
  end
end
# rubocop:enable Metrics/ClassLength
data/forki.gemspec CHANGED
@@ -37,6 +37,8 @@ Gem::Specification.new do |spec|
37
37
  spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
38
38
  spec.add_dependency "selenium-webdriver" # Webdriver selenium
39
39
 
40
+ spec.add_development_dependency "thor" # For the generator
41
+
40
42
  # For more information and examples about making a new gem, checkout our
41
43
  # guide at: https://bundler.io/guides/creating_gem.html
42
44
  end
@@ -4,6 +4,7 @@ require "typhoeus"
4
4
  require "securerandom"
5
5
  require "byebug"
6
6
 
7
+
7
8
  module Forki
8
9
  # rubocop:disable Metrics/ClassLength
9
10
  class PostScraper < Scraper
@@ -100,7 +101,11 @@ module Forki
100
101
  begin
101
102
  find("span", wait: 5, text: "This Video Isn't Available Anymore", exact_text: false)
102
103
  rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
103
- return true
104
+ begin
105
+ find("span", wait: 5, text: "This Page Isn't Available", exact_text: false)
106
+ rescue Capybara::ElementNotFound, Selenium::WebDriver::Error::StaleElementReferenceError
107
+ return true
108
+ end
104
109
  end
105
110
  end
106
111
 
@@ -150,21 +155,30 @@ module Forki
150
155
  end
151
156
 
152
157
  graphql_object_array = graphql_strings.map { |graphql_string| JSON.parse(graphql_string) }
158
+
159
+ # Once in awhile it's really easy
160
+ video_objects = graphql_object_array.filter {|go| go.has_key?("video") }
161
+
162
+ if VideoSieve.can_process_with_sieve?(graphql_object_array)
163
+ # Eventually all of this complexity will be replaced with this
164
+ return VideoSieve.sieve_for_graphql_objects(graphql_object_array)
165
+ end
166
+
153
167
  story_node_object = graphql_object_array.find { |graphql_object| graphql_object.key? "node" }&.fetch("node", nil) # user posted video
154
168
  story_node_object = story_node_object || graphql_object_array.find { |graphql_object| graphql_object.key? "nodes" }&.fetch("nodes")&.first # page posted video
155
169
 
156
170
  return extract_video_post_data_alternative(graphql_object_array) if story_node_object.nil?
157
171
 
158
172
  if story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("media")
159
- video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]
160
- creation_date = video_object["publish_time"]
161
- # creation_date = video_object["video"]["publish_time"]
173
+ video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"]["video"]
174
+ creation_date = video_object["publish_time"] if video_object&.has_key("publish_time")
175
+ creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["media"] if creation_date.nil?
162
176
  elsif story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"].key?("style_infos")
163
177
  # For "Reels" we need a separate way to parse this
164
178
  video_object = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["short_form_video_context"]["playback_video"]
165
179
  creation_date = story_node_object["comet_sections"]["content"]["story"]["attachments"].first["styles"]["attachment"]["style_infos"].first["fb_shorts_story"]["creation_time"]
166
180
  else
167
- raise "Unable to parse video object"
181
+ raise "Unable to parse video object" if video_objects.empty?
168
182
  end
169
183
 
170
184
  feedback_object = story_node_object["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]
@@ -187,7 +201,7 @@ module Forki
187
201
  num_views: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["video_view_count"],
188
202
  reshare_warning: feedback_object["comet_ufi_summary_and_actions_renderer"]["feedback"]["should_show_reshare_warning"],
189
203
  video_preview_image_url: video_object["preferred_thumbnail"]["image"]["uri"],
190
- video_url: video_object["playable_url_quality_hd"] || video_object["playable_url"],
204
+ video_url: video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"],
191
205
  text: text,
192
206
  created_at: creation_date,
193
207
  profile_link: story_node_object["comet_sections"]["context_layout"]["story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
@@ -213,7 +227,7 @@ module Forki
213
227
  num_views: feedback_object["video_view_count"],
214
228
  reshare_warning: feedback_object["should_show_reshare_warning"],
215
229
  video_preview_image_url: video_object["video"]["preferred_thumbnail"]["image"]["uri"],
216
- video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["playable_url"],
230
+ video_url: video_object["video"]["playable_url_quality_hd"] || video_object["video"]["browser_native_hd_url"] || video_object["video"]["browser_native_sd_url"] || video_object["video"]["playable_url"],
217
231
  text: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["message"]["story"]["message"]["text"],
218
232
  created_at: video_object["video"]["publish_time"],
219
233
  profile_link: sidepane_object["tahoe_sidepane_renderer"]["video"]["creation_story"]["comet_sections"]["actor_photo"]["story"]["actors"][0]["url"],
@@ -365,6 +379,7 @@ module Forki
365
379
  def parse(url)
366
380
  validate_and_load_page(url)
367
381
  graphql_strings = find_graphql_data_strings(page.html)
382
+
368
383
  post_data = extract_post_data(graphql_strings)
369
384
  post_data[:url] = url
370
385
  user_url = post_data[:profile_link]
@@ -394,3 +409,6 @@ module Forki
394
409
  end
395
410
  end
396
411
  end
412
+
413
+ require_relative "sieves/video_sieves/video_sieve"
414
+
@@ -7,6 +7,7 @@ require "oj"
7
7
  require "selenium-webdriver"
8
8
  require "open-uri"
9
9
  require "selenium/webdriver/remote/http/curb"
10
+ require "cgi"
10
11
 
11
12
  options = Selenium::WebDriver::Options.chrome(exclude_switches: ["enable-automation"])
12
13
  options.add_argument("--start-maximized")
@@ -112,18 +113,41 @@ module Forki
112
113
 
113
114
  url ||= "https://www.facebook.com"
114
115
 
115
-
116
116
  page.driver.browser.navigate.to(url) # Visit the url passed in or the facebook homepage if nothing is
117
117
 
118
118
  # Look for "login_form" box, which throws an error if not found. So we catch it and run the rest of the tests
119
119
  begin
120
120
  login_form = first(id: "login_form", wait: 5)
121
121
  rescue Capybara::ElementNotFound
122
- return unless page.title.downcase.include?("facebook - log in")
122
+ begin
123
+ login_form = find(:xpath, '//form[@data-testid="royal_login_form"]')
124
+ rescue Capybara::ElementNotFound
125
+ return unless page.title.downcase.include?("facebook - log in")
126
+ end
123
127
  end
124
128
 
125
- # Since we're not logged in, let's do that quick
126
- page.driver.browser.navigate.to("https://www.facebook.com") if login_form.nil?
129
+ # Since we're not logged in, let's do that quickly
130
+ if login_form.nil?
131
+ page.driver.browser.navigate.to("https://www.facebook.com")
132
+
133
+ # Find the login form... again (Yes, we could extract this out, but it's only ever used
134
+ # here, so it's not worth the effort)
135
+ begin
136
+ login_form = first(id: "login_form", wait: 5)
137
+ rescue Capybara::ElementNotFound
138
+ begin
139
+ login_form = find(:xpath, '//form[@data-testid="royal_login_form"]')
140
+ rescue Capybara::ElementNotFound
141
+ return unless page.title.downcase.include?("facebook - log in")
142
+ end
143
+ end
144
+ end
145
+
146
+ if login_form.nil?
147
+ # maybe we're already logged in?
148
+ sleep(rand * 10.3)
149
+ return
150
+ end
127
151
 
128
152
  login_form.fill_in("email", with: ENV["FACEBOOK_EMAIL"])
129
153
  login_form.fill_in("pass", with: ENV["FACEBOOK_PASSWORD"])
@@ -154,13 +178,34 @@ module Forki
154
178
  # If either of those two conditions are false, raises an exception
155
179
  def validate_and_load_page(url)
156
180
  Capybara.app_host = "https://www.facebook.com"
157
- facebook_url = "https://www.facebook.com"
158
- # visit "https://www.facebook.com" unless current_url.start_with?(facebook_url)
159
- login(url)
160
- raise Forki::InvalidUrlError unless url.start_with?(facebook_url)
181
+ facebook_hosts = ["facebook.com", "www.facebook.com", "web.facebook.com", "m.facebook.com"]
182
+ parsed_url = URI.parse(url)
183
+ host = parsed_url.host
184
+ raise Forki::InvalidUrlError unless facebook_hosts.include?(host)
185
+
186
+ # Replace the host with a default one to prevent redirect loops that can happen
187
+ unless parsed_url.host == "www.facebook.com"
188
+ parsed_url.host = "www.facebook.com"
189
+ url = parsed_url.to_s
190
+ end
191
+
192
+ visit "https://www.facebook.com"
193
+ login
194
+
161
195
  visit url unless current_url.start_with?(url)
196
+ # # If the video is a watch page it doesn't have most of the data we want so we click on the video
197
+ # if url.include?("watch/live")
198
+ # clickable_element = find("video")
199
+
200
+ # while(clickable_element.obscured?)
201
+ # clickable_element = clickable_element.find(:xpath, "..")
202
+ # end
203
+
204
+ # clickable_element.click
205
+ # end
162
206
  end
163
207
 
208
+
164
209
  # Extracts an integer out of a string describing a number
165
210
  # e.g. "4K Comments" returns 4000
166
211
  # e.g. "131 Shares" returns 131
@@ -171,19 +216,23 @@ module Forki
171
216
  element = element.text(:all)
172
217
  end
173
218
 
174
- num_pattern = /[0-9KM ,.]+/
175
- interaction_num_text = num_pattern.match(element)[0]
176
-
177
- if interaction_num_text.include?(".") # e.g. "2.2K"
178
- interaction_num_text.to_i + interaction_num_text[-2].to_i * 100
179
- elsif interaction_num_text.include?("K") # e.g. "13K"
180
- interaction_num_text.to_i * 1000
181
- elsif interaction_num_text.include?("M") # e.g. "13M"
182
- interaction_num_text.to_i * 1_000_000
183
- else # e.g. "15,443"
184
- interaction_num_text.delete!(",")
185
- interaction_num_text.delete(" ").to_i
219
+ # Check if there's a modifier i.e. `K` or `M` if there isn't just return the number
220
+ unless element.include?("K") || element.include?("M")
221
+ element.delete(",") # "5,456" e.g.
222
+ return element.to_i
186
223
  end
224
+
225
+ modifier = element[-1]
226
+ number = element[0...-1].to_f
227
+
228
+ case modifier
229
+ when "K"
230
+ number = number * 1_000
231
+ when "M"
232
+ number = number * 1_000_000
233
+ end
234
+
235
+ number.to_i
187
236
  end
188
237
  end
189
238
  end
@@ -0,0 +1,25 @@
1
# Dispatcher for image sieves: finds the first registered sieve class whose
# `check` accepts a set of GraphQL objects and delegates the parsing to it.
class ImageSieve
  # @param graphql_objects [Array] parsed GraphQL payloads
  # @return [Boolean] true if some registered sieve accepts the objects
  def self.can_process_with_sieve?(graphql_objects)
    !sieve_class_for_graphql_objects(graphql_objects).nil?
  end

  # @param graphql_objects [Array] parsed GraphQL payloads
  # @return the result of the matching sieve's `sieve` call, or nil if no
  #   registered sieve accepts the objects
  def self.sieve_for_graphql_objects(graphql_objects)
    sieve = sieve_class_for_graphql_objects(graphql_objects)
    return nil if sieve.nil?

    sieve.sieve(graphql_objects)
  end

  # Internal lookup of the first sieve whose `check` returns truthy.
  # A bare `private` has no effect on `def self.` methods, so this has always
  # been publicly callable; it is left public to avoid breaking any caller.
  def self.sieve_class_for_graphql_objects(graphql_objects)
    sieves = [] # no image sieves are registered yet
    sieves.find { |sieve| sieve.check(graphql_objects) }
  end
end
21
+
22
+
23
# Load every concrete image sieve that sits in the same directory as this file.
# The glob is anchored on __dir__ rather than the working directory so the
# sieves are also found when the gem is used from outside its own checkout.
# NOTE(review): assumes this file lives in lib/forki/scrapers/sieves/image_sieves/ — confirm.
Dir[File.join(__dir__, "*.rb")].sort.each do |file|
  require file unless file.end_with?("image_sieve.rb") # skip this base file itself
end
@@ -0,0 +1,24 @@
1
# Dispatcher for video sieves: finds the first registered sieve class whose
# `check` accepts a set of GraphQL objects and delegates the parsing to it.
class VideoSieve
  # @param graphql_objects [Array] parsed GraphQL payloads
  # @return [Boolean] true if some registered sieve accepts the objects
  def self.can_process_with_sieve?(graphql_objects)
    !sieve_class_for_graphql_objects(graphql_objects).nil?
  end

  # @param graphql_objects [Array] parsed GraphQL payloads
  # @return the result of the matching sieve's `sieve` call, or nil if no
  #   registered sieve accepts the objects
  def self.sieve_for_graphql_objects(graphql_objects)
    sieve = sieve_class_for_graphql_objects(graphql_objects)
    return nil if sieve.nil?

    sieve.sieve(graphql_objects)
  end

  # Internal lookup of the first sieve whose `check` returns truthy; the array
  # order is the priority order.
  # A bare `private` has no effect on `def self.` methods, so this has always
  # been publicly callable; it is left public to avoid breaking any caller.
  def self.sieve_class_for_graphql_objects(graphql_objects)
    sieves = [VideoSieveWatchTab, VideoSieveVideoPage]
    sieves.find { |sieve| sieve.check(graphql_objects) }
  end
end
21
+
22
# Load every concrete video sieve that sits in the same directory as this file.
# The glob is anchored on __dir__ rather than the working directory so the
# sieves are also found when the gem is used from outside its own checkout.
Dir[File.join(__dir__, "*.rb")].sort.each do |file|
  require file unless file.end_with?("video_sieve.rb") # skip this base file itself
end
@@ -0,0 +1,66 @@
1
# Sieve for video posts whose GraphQL payload contains a top-level "node"
# object with "comet_sections".
class VideoSieveVideoPage < VideoSieve
  # To check if it's valid for the inputted graphql objects.
  #
  # @param graphql_objects [Array] parsed GraphQL payloads
  # @return [Boolean] true only if the first attachment of the story carries a
  #   "media" key; any error while walking the structure counts as false
  def self.check(graphql_objects)
    comet_sections = extractor(graphql_objects)
    return false if comet_sections.nil?

    attachment = comet_sections["content"]["story"]["attachments"].first["styles"]["attachment"]
    attachment.key?("media")
  rescue StandardError
    false
  end

  # Builds the post details hash with the keys: id, num_comments, num_shared,
  # num_views, reshare_warning, video_preview_image_url, video_url, text,
  # created_at, profile_link, has_video, video_preview_image_file, video_file
  # and reactions.
  #
  # @param graphql_objects [Array] parsed GraphQL payloads that passed `check`
  # @return [Hash] the post details
  def self.sieve(graphql_objects)
    comet_sections = extractor(graphql_objects)

    story_object = comet_sections["content"]["story"]
    video_object = story_object["attachments"].first["styles"]["attachment"]["media"]
    feedback_object = comet_sections["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]

    video_preview_image_url = video_object["preferred_thumbnail"]["image"]["uri"]
    # Prefer the HD rendition and fall back to SD when it is absent.
    video_url = video_object["browser_native_hd_url"] || video_object["browser_native_sd_url"]

    {
      id: video_object["id"],
      num_comments: feedback_object["total_comment_count"],
      num_shared: feedback_object["share_count"]["count"],
      num_views: nil,
      reshare_warning: feedback_object["should_show_reshare_warning"],
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: story_object["message"]["text"],
      created_at: video_object["publish_time"],
      profile_link: story_object["actors"].first["url"],
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
    }
  end

  # Returns the "comet_sections" of the first object that has a "node" key
  # (user posted video), or nil when there is none.
  # Elements that are not hash-like are skipped instead of aborting the search.
  # (A bare `private` does not apply to `def self.` methods, so it is omitted.)
  def self.extractor(graphql_objects)
    node_holder = graphql_objects.find do |graphql_object|
      graphql_object.respond_to?(:key?) && graphql_object.key?("node")
    end

    story_node_object = node_holder&.fetch("node", nil)
    return nil if story_node_object.nil?

    story_node_object["comet_sections"]
  end
end
@@ -0,0 +1,91 @@
1
+ # This is for the "watch" tab style videos https://www.facebook.com/watch/live/?v=394367115960503
2
+
3
# Sieve for videos whose GraphQL payload has a "video" object carrying a
# "creation_story" (or "story") with attachments.
class VideoSieveWatchTab < VideoSieve
  # To check if it's valid for the inputted graphql objects.
  #
  # @param graphql_objects [Array] parsed GraphQL payloads
  # @return [Boolean] true only if the extracted story has a non-empty
  #   "attachments" array whose first entry is a Hash with a "media" key
  def self.check(graphql_objects)
    story = extractor(graphql_objects)
    return false if story.nil?

    attachments = story["attachments"]
    return false unless attachments.is_a?(Array)

    first_attachment = attachments.first # nil for an empty array
    first_attachment.is_a?(Hash) && first_attachment.key?("media")
  rescue StandardError
    false
  end

  # Builds the post details hash with the keys: id, num_comments, num_shared,
  # num_views, reshare_warning, video_preview_image_url, video_url, text,
  # created_at, profile_link, has_video, video_preview_image_file, video_file
  # and reactions.
  #
  # @param graphql_objects [Array] parsed GraphQL payloads that passed `check`
  # @return [Hash] the post details
  # @raise [RuntimeError] if no feedback object can be located
  def self.sieve(graphql_objects)
    objects = unwrap_graphql_objects(graphql_objects)
    story = extractor(graphql_objects)
    media = story["attachments"].first["media"]

    # Prefer the HD rendition, as the other video parsers do, and fall back to SD.
    video_url = media["browser_native_hd_url"] || media["browser_native_sd_url"]
    video_preview_image_url = media["preferred_thumbnail"]["image"]["uri"]

    feedback_object =
      if story["feedback_context"].nil?
        # Fall back to any object that carries a comment count in its "feedback".
        feedback_holder = objects.find do |go|
          go.is_a?(Hash) && !go.dig("feedback", "total_comment_count").nil?
        end
        raise "Unable to find a feedback object for watch tab video" if feedback_holder.nil?

        feedback_holder["feedback"]
      else
        story["feedback_context"]["feedback_target_with_context"]
      end

    profile_link = media.dig("owner", "url")
    if profile_link.nil?
      filtered_json = objects.find { |go| go.is_a?(Hash) && go.key?("attachments") }
      profile_link = filtered_json["attachments"].first["media"]["creation_story"]["comet_sections"]["title"]["story"]["actors"].first["url"]
    end

    {
      id: story.dig("shareable", "id") || media["id"],
      num_comments: feedback_object["total_comment_count"],
      num_shared: nil, # This is not associated with these videos in this format
      num_views: nil, # This is not associated with these videos in this format
      reshare_warning: feedback_object["should_show_reshare_warning"],
      video_preview_image_url: video_preview_image_url,
      video_url: video_url,
      text: nil, # There is no text associated with these videos
      created_at: media["publish_time"],
      profile_link: profile_link,
      has_video: true,
      video_preview_image_file: Forki.retrieve_media(video_preview_image_url),
      video_file: Forki.retrieve_media(video_url),
      reactions: feedback_object["cannot_see_top_custom_reactions"]["top_reactions"]["edges"]
    }
  end

  # Returns the story of the first object with a "video" key: its
  # "creation_story" if present, otherwise its "story". Returns nil when no
  # such object exists.
  # (A bare `private` does not apply to `def self.` methods, so it is omitted.)
  def self.extractor(graphql_objects)
    video_holder = unwrap_graphql_objects(graphql_objects).find do |go|
      go.is_a?(Hash) && go.key?("video")
    end
    return nil if video_holder.nil?

    video_holder.dig("video", "creation_story") || video_holder.dig("video", "story")
  end

  # Some payloads arrive wrapped in an array; replace each such array with its
  # first element so later lookups work on the unwrapped object.
  def self.unwrap_graphql_objects(graphql_objects)
    graphql_objects.map { |go| go.is_a?(Array) ? go.first : go }
  end
end
+ end
@@ -3,10 +3,14 @@ require "typhoeus"
3
3
  module Forki
4
4
  class UserScraper < Scraper
5
5
  # Finds and returns the number of people who like the current page
6
- def find_number_of_likes
7
- likes_pattern = /[0-9,.KM ] people like this/
8
- number_of_likes_elem = all("span").filter { | span| likes_pattern.match? span.text }.first
9
- extract_int_from_num_element(number_of_likes_elem)
6
+ def find_number_of_likes(profile_details_string)
7
+ likes_pattern = /[0-9,.KM ] likes/
8
+ likes_pattern = /(?<num_likes>[0-9,.KM ]+) (l|L)ikes/
9
+ number_of_likes_match = likes_pattern.match(profile_details_string)
10
+
11
+ return nil if number_of_likes_match.nil?
12
+
13
+ extract_int_from_num_element(number_of_likes_match.named_captures["num_likes"])
10
14
  end
11
15
 
12
16
  # Finds and returns the number of people who follow the current page
@@ -14,8 +18,18 @@ module Forki
14
18
  followers_pattern = /Followed by (?<num_followers>[0-9,.KM ]) people/
15
19
  alt_follower_pattern = /(?<num_followers>[0-9,.KM ]+) (f|F)ollowers/
16
20
  number_of_followers_match = followers_pattern.match(profile_details_string) || alt_follower_pattern.match(profile_details_string)
21
+
17
22
  return nil if number_of_followers_match.nil?
18
- extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
23
+
24
+ number_of_followers = extract_int_from_num_element(number_of_followers_match.named_captures["num_followers"])
25
+
26
+ # Note, this is sticking around if we want to use it later
27
+ # if number_of_followers.nil?
28
+ # number_of_followers_string = JSON.parse(profile_header_str)["user"]["profile_header_renderer"]["user"]["profile_social_context"]["content"].first["text"]["text"]
29
+ # number_of_followers = extract_int_from_num_element(number_of_followers_string)
30
+ # end
31
+
32
+ number_of_followers
19
33
  end
20
34
 
21
35
  def find_number_followers_for_normal_profile(profile_followers_node)
@@ -61,6 +75,7 @@ module Forki
61
75
  verified: profile_header_obj["user"]["is_verified"],
62
76
  profile: profile_intro_obj ? profile_intro_obj["profile_intro_card"]["bio"]["text"] : "",
63
77
  profile_image_url: profile_header_obj["user"]["profilePicLarge"]["uri"],
78
+ number_of_likes: find_number_of_likes(profile_header_str),
64
79
  }
65
80
  end
66
81
 
data/lib/forki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Forki
4
- VERSION = "0.1.3"
4
+ VERSION = "0.2.0"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - ''
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-06-05 00:00:00.000000000 Z
11
+ date: 2023-08-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: thor
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  description:
84
98
  email:
85
99
  - ''
@@ -99,12 +113,21 @@ files:
99
113
  - README.md
100
114
  - Rakefile
101
115
  - bin/console
116
+ - bin/generate_sieve
117
+ - bin/generator_templates/image_sieve_template.rb.erb
118
+ - bin/generator_templates/image_sieve_test_template.rb.erb
119
+ - bin/generator_templates/video_sieve_template.rb.erb
120
+ - bin/generator_templates/video_sieve_test_template.rb.erb
102
121
  - bin/setup
103
122
  - forki.gemspec
104
123
  - lib/forki.rb
105
124
  - lib/forki/post.rb
106
125
  - lib/forki/scrapers/post_scraper.rb
107
126
  - lib/forki/scrapers/scraper.rb
127
+ - lib/forki/scrapers/sieves/image_sieves/image_sieve.rb.rb
128
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve.rb
129
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve_video_page.rb
130
+ - lib/forki/scrapers/sieves/video_sieves/video_sieve_watch_tab.rb
108
131
  - lib/forki/scrapers/user_scraper.rb
109
132
  - lib/forki/user.rb
110
133
  - lib/forki/version.rb
@@ -139,7 +162,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
139
162
  - !ruby/object:Gem::Version
140
163
  version: '0'
141
164
  requirements: []
142
- rubygems_version: 3.3.26
165
+ rubygems_version: 3.4.14
143
166
  signing_key:
144
167
  specification_version: 4
145
168
  summary: A gem to scrape Facebook pages for archive purposes.