forki 0.2.11 → 0.2.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a7f3c7b6c1fc45dcd2c162f6ff308f3187e07c54b921739d16c37395e3574324
4
- data.tar.gz: ed7537f07b914f4500a31e948b2ee1cc8c375a15a36ea971c7186a7dbd5a7935
3
+ metadata.gz: b6dad4ed40e17a940d54200b022b9ca9b2569f48abcc602e6b40eb6bdc994f19
4
+ data.tar.gz: 96495423ff46aa90a874e2969568e0eb81fe1b0aaf1df5e3e62ec9bb46738db4
5
5
  SHA512:
6
- metadata.gz: ed836bcedbe80ef77780183d0bdcc9618d8449bed16c996c1413b109360b13e343d72ffaca8b4b4da3ea42788b29a895ba2eac22be4a57523dd232144500f412
7
- data.tar.gz: a676e6f6cae22491bf7d7d1ad916e8e14f96832f0000c1c60b222759cfed89fb3875c67f9a02253ee19adb9f9e247190e97e4a056b921a0b6a210c24e0e37d18
6
+ metadata.gz: '08a5288e3fd9feb8cfbe683986a6a99a1791699e8b2335693928b16de69a5a8ebdea955e84dd17967ab95fecde04164b9c8cbdfbcb7860ff34d12e417a597ec6'
7
+ data.tar.gz: 000e63e56a249cb8abca7e2ae9364e9f0f4b146d1d8063f279b135d4e9eb4593b1fe78d6d3b73e2bf89810568254aebf07aa52ea93e5a6e4d9fe0cfede1d4977
data/Gemfile.lock CHANGED
@@ -1,12 +1,13 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- forki (0.2.8)
4
+ forki (0.2.11)
5
5
  apparition
6
6
  capybara
7
7
  oj
8
8
  selenium-webdriver
9
9
  typhoeus
10
+ zorki
10
11
 
11
12
  GEM
12
13
  remote: https://rubygems.org/
@@ -29,8 +30,8 @@ GEM
29
30
  i18n (>= 1.6, < 2)
30
31
  minitest (>= 5.1)
31
32
  tzinfo (~> 2.0)
32
- addressable (2.8.6)
33
- public_suffix (>= 2.0.2, < 6.0)
33
+ addressable (2.8.7)
34
+ public_suffix (>= 2.0.2, < 7.0)
34
35
  apparition (0.6.0)
35
36
  capybara (~> 3.13, < 4)
36
37
  websocket-driver (>= 0.6.5)
@@ -59,6 +60,7 @@ GEM
59
60
  i18n (1.13.0)
60
61
  concurrent-ruby (~> 1.0)
61
62
  json (2.6.3)
63
+ logger (1.6.1)
62
64
  loofah (2.21.3)
63
65
  crass (~> 1.0.2)
64
66
  nokogiri (>= 1.12.0)
@@ -68,12 +70,14 @@ GEM
68
70
  minitest (5.18.0)
69
71
  nokogiri (1.15.1-arm64-darwin)
70
72
  racc (~> 1.4)
71
- oj (3.16.4)
73
+ oj (3.16.6)
72
74
  bigdecimal (>= 3.0)
75
+ ostruct (>= 0.2)
76
+ ostruct (0.6.0)
73
77
  parallel (1.23.0)
74
78
  parser (3.2.2.1)
75
79
  ast (~> 2.4.1)
76
- public_suffix (5.1.1)
80
+ public_suffix (6.0.1)
77
81
  racc (1.6.2)
78
82
  rack (2.2.4)
79
83
  rack-test (2.1.0)
@@ -130,8 +134,11 @@ GEM
130
134
  rubocop-rails (~> 2.0)
131
135
  ruby-progressbar (1.13.0)
132
136
  rubyzip (2.3.2)
133
- selenium-webdriver (4.21.1)
137
+ selenium-devtools (0.129.0)
138
+ selenium-webdriver (~> 4.2)
139
+ selenium-webdriver (4.24.0)
134
140
  base64 (~> 0.2)
141
+ logger (~> 1.4)
135
142
  rexml (~> 3.2, >= 3.2.5)
136
143
  rubyzip (>= 1.2.2, < 3.0)
137
144
  websocket (~> 1.0)
@@ -141,13 +148,20 @@ GEM
141
148
  tzinfo (2.0.6)
142
149
  concurrent-ruby (~> 1.0)
143
150
  unicode-display_width (2.4.2)
144
- websocket (1.2.10)
151
+ websocket (1.2.11)
145
152
  websocket-driver (0.7.6)
146
153
  websocket-extensions (>= 0.1.0)
147
154
  websocket-extensions (0.1.5)
148
155
  xpath (3.2.0)
149
156
  nokogiri (~> 1.8)
150
157
  zeitwerk (2.6.8)
158
+ zorki (0.2.6)
159
+ apparition
160
+ capybara
161
+ oj
162
+ selenium-devtools
163
+ selenium-webdriver (~> 4.24.0)
164
+ typhoeus
151
165
 
152
166
  PLATFORMS
153
167
  arm64-darwin-21
data/forki.gemspec CHANGED
@@ -36,6 +36,7 @@ Gem::Specification.new do |spec|
36
36
  spec.add_dependency "typhoeus" # For making API requests
37
37
  spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
38
38
  spec.add_dependency "selenium-webdriver" # Webdriver selenium
39
+ spec.add_dependency "zorki" # For scraping instagram pages
39
40
 
40
41
  spec.add_development_dependency "thor" # For the generator
41
42
 
@@ -38,8 +38,6 @@ module Forki
38
38
  extract_image_post_data(graphql_objects)
39
39
  else
40
40
  extract_image_post_data(graphql_objects)
41
-
42
- #raise UnhandledContentError
43
41
  end
44
42
  end
45
43
 
@@ -324,9 +322,7 @@ module Forki
324
322
  elsif graphql_object["node"]["comet_sections"]["feedback"]["story"].dig("feedback_context")
325
323
  begin
326
324
  feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["feedback_context"]["feedback_target_with_context"]["ufi_renderer"]["feedback"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
327
- rescue NoMethodError
328
- debugger
329
- end
325
+ rescue NoMethodError; end
330
326
  elsif graphql_object["node"]["comet_sections"]["feedback"]["story"].has_key?("comet_feed_ufi_container")
331
327
  feedback_object = graphql_object["node"]["comet_sections"]["feedback"]["story"]["comet_feed_ufi_container"]["story"]["story_ufi_container"]["story"]["feedback_context"]["feedback_target_with_context"]["comet_ufi_summary_and_actions_renderer"]["feedback"]
332
328
  else
@@ -353,7 +349,9 @@ module Forki
353
349
  end
354
350
  end
355
351
 
356
- text = graphql_object["node"]["comet_sections"]["content"]["story"]["message"]["text"]
352
+ text = graphql_object["node"]["comet_sections"]["content"]["story"].dig(["message", "text"])
353
+ text = "" if text.nil?
354
+
357
355
  profile_link = graphql_object["node"]["comet_sections"]["content"]["story"]["actors"].first["url"]
358
356
 
359
357
  unless graphql_object["node"]["comet_sections"].dig("content", "story", "comet_sections", "context_layout", "story", "comet_sections", "metadata").nil?
@@ -506,7 +504,9 @@ module Forki
506
504
  end
507
505
 
508
506
  begin
507
+ # rubocop:disable Lint/Debugger
509
508
  save_screenshot("#{Forki.temp_storage_location}/facebook_screenshot_#{SecureRandom.uuid}.png")
509
+ # rubocop:enable Lint/Debugger
510
510
  rescue Selenium::WebDriver::Error::TimeoutError
511
511
  raise Net::ReadTimeout
512
512
  end
@@ -178,10 +178,10 @@ module Forki
178
178
  # If either of those two conditions are false, raises an exception
179
179
  def validate_and_load_page(url)
180
180
  Capybara.app_host = "https://www.facebook.com"
181
- facebook_hosts = ["facebook.com", "www.facebook.com", "web.facebook.com", "m.facebook.com"]
181
+ facebook_hosts = ["facebook.com", "www.facebook.com", "web.facebook.com", "m.facebook.com", "l.facebook.com"]
182
182
  parsed_url = URI.parse(url)
183
183
  host = parsed_url.host
184
- raise Forki::InvalidUrlError unless facebook_hosts.include?(host)
184
+ raise Forki::InvalidUrlError.new("Invalid Facebook host: #{host}") unless facebook_hosts.include?(host)
185
185
 
186
186
  # Replace the host with a default one to prevent redirect loops that can happen
187
187
  unless parsed_url.host == "www.facebook.com"
@@ -1,10 +1,12 @@
1
1
  require "typhoeus"
2
+ require "zorki"
2
3
 
4
+ # rubocop:disable Metrics/ClassLength
3
5
  module Forki
4
6
  class UserScraper < Scraper
5
7
  # Finds and returns the number of people who like the current page
6
8
  def find_number_of_likes(profile_details_string)
7
- likes_pattern = /[0-9,.KM ] likes/
9
+ # likes_pattern = /[0-9,.KM ] likes/
8
10
  likes_pattern = /(?<num_likes>[0-9,.KM ]+) (l|L)ikes/
9
11
  number_of_likes_match = likes_pattern.match(profile_details_string)
10
12
 
@@ -100,7 +102,14 @@ module Forki
100
102
 
101
103
  # Uses GraphQL data and DOM elements to collect information about the current user page
102
104
  def parse(url)
105
+ # So some reels may actually link to an instagram user?
106
+ if url.include?("instagram.com")
107
+ user = get_instagram_user(url)
108
+ return user
109
+ end
110
+
103
111
  validate_and_load_page(url)
112
+
104
113
  graphql_strings = find_graphql_data_strings(page.html)
105
114
  is_page = graphql_strings.map { |s| JSON.parse(s) }.any? { |o| o.key?("page") }
106
115
  user_details = is_page ? extract_page_details(graphql_strings) : extract_profile_details(graphql_strings)
@@ -110,5 +119,38 @@ module Forki
110
119
 
111
120
  user_details
112
121
  end
122
+
123
+ def get_instagram_user(url)
124
+ uri = URI(url)
125
+ query = uri.query
126
+ components = URI.decode_uri_component(query)
127
+ extracted_url = URI.extract(components).first
128
+ extracted_uri = URI(extracted_url)
129
+ username = extracted_uri.to_s.match(/(https:\/\/www.instagram.com\/_u\/[\w]+)/).to_s.split("/").last
130
+
131
+ page.quit # I think we need to do this so Zorki can run?
132
+ zorki_users = Zorki::User.lookup(username)
133
+ zorki_user = nil
134
+
135
+ if zorki_users.count.positive?
136
+ zorki_user = zorki_users.first
137
+ else
138
+ raise ContentUnavailableError
139
+ end
140
+
141
+
142
+ # Convert a zorki_user into a hash for Forki
143
+ {
144
+ name: zorki_user.name,
145
+ id: username,
146
+ number_of_followers: zorki_user.number_of_followers,
147
+ verified: zorki_user.verified,
148
+ profile: zorki_user.profile,
149
+ profile_link: zorki_user.profile_link,
150
+ profile_image_file: zorki_user.profile_image,
151
+ profile_image_url: zorki_user.profile_image_url,
152
+ number_of_likes: 0
153
+ }
154
+ end
113
155
  end
114
156
  end
data/lib/forki/user.rb CHANGED
@@ -37,7 +37,7 @@ module Forki
37
37
  def scrape(urls)
38
38
  urls.map do |url|
39
39
  user_hash = Forki::UserScraper.new.parse(url)
40
- User.new(user_hash)
40
+ User.new(user_hash) if user_hash.is_a?(Hash)
41
41
  end
42
42
  end
43
43
  end
data/lib/forki/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Forki
4
- VERSION = "0.2.11"
4
+ VERSION = "0.2.13"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: forki
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.11
4
+ version: 0.2.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - ''
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-10-08 00:00:00.000000000 Z
11
+ date: 2024-10-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: capybara
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0'
83
+ - !ruby/object:Gem::Dependency
84
+ name: zorki
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
83
97
  - !ruby/object:Gem::Dependency
84
98
  name: thor
85
99
  requirement: !ruby/object:Gem::Requirement