forki 0.2.11 → 0.2.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +21 -7
- data/forki.gemspec +1 -0
- data/lib/forki/scrapers/scraper.rb +2 -2
- data/lib/forki/scrapers/user_scraper.rb +43 -1
- data/lib/forki/user.rb +1 -1
- data/lib/forki/version.rb +1 -1
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5c0373d5017b66032feca78ff34014da595078bd2ea276aed8590199ba386397
|
4
|
+
data.tar.gz: 41f53e6c34b490d9cf7f24ad6ff43c82c8e596c789938c1a8026411a5111245b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c9cb10df9e69bb07c1b8f2d1da2e3369ddfaddf1900ad94e5d9568c50f4045df994939f81a9d7543371ee53b8376f93d2e2e62370a69ccffb598b83362e85ac2
|
7
|
+
data.tar.gz: f553f6ad7d90aece2501f1ef6d0cd8c787f1ce8bc051480bdc87dab8fa43721923670b76aa3a3b1b8a6fbc46dd834dde2025ffd79dc00987ba9bcd6899bd99f4
|
data/Gemfile.lock
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
forki (0.2.
|
4
|
+
forki (0.2.11)
|
5
5
|
apparition
|
6
6
|
capybara
|
7
7
|
oj
|
8
8
|
selenium-webdriver
|
9
9
|
typhoeus
|
10
|
+
zorki
|
10
11
|
|
11
12
|
GEM
|
12
13
|
remote: https://rubygems.org/
|
@@ -29,8 +30,8 @@ GEM
|
|
29
30
|
i18n (>= 1.6, < 2)
|
30
31
|
minitest (>= 5.1)
|
31
32
|
tzinfo (~> 2.0)
|
32
|
-
addressable (2.8.
|
33
|
-
public_suffix (>= 2.0.2, <
|
33
|
+
addressable (2.8.7)
|
34
|
+
public_suffix (>= 2.0.2, < 7.0)
|
34
35
|
apparition (0.6.0)
|
35
36
|
capybara (~> 3.13, < 4)
|
36
37
|
websocket-driver (>= 0.6.5)
|
@@ -59,6 +60,7 @@ GEM
|
|
59
60
|
i18n (1.13.0)
|
60
61
|
concurrent-ruby (~> 1.0)
|
61
62
|
json (2.6.3)
|
63
|
+
logger (1.6.1)
|
62
64
|
loofah (2.21.3)
|
63
65
|
crass (~> 1.0.2)
|
64
66
|
nokogiri (>= 1.12.0)
|
@@ -68,12 +70,14 @@ GEM
|
|
68
70
|
minitest (5.18.0)
|
69
71
|
nokogiri (1.15.1-arm64-darwin)
|
70
72
|
racc (~> 1.4)
|
71
|
-
oj (3.16.
|
73
|
+
oj (3.16.6)
|
72
74
|
bigdecimal (>= 3.0)
|
75
|
+
ostruct (>= 0.2)
|
76
|
+
ostruct (0.6.0)
|
73
77
|
parallel (1.23.0)
|
74
78
|
parser (3.2.2.1)
|
75
79
|
ast (~> 2.4.1)
|
76
|
-
public_suffix (
|
80
|
+
public_suffix (6.0.1)
|
77
81
|
racc (1.6.2)
|
78
82
|
rack (2.2.4)
|
79
83
|
rack-test (2.1.0)
|
@@ -130,8 +134,11 @@ GEM
|
|
130
134
|
rubocop-rails (~> 2.0)
|
131
135
|
ruby-progressbar (1.13.0)
|
132
136
|
rubyzip (2.3.2)
|
133
|
-
selenium-
|
137
|
+
selenium-devtools (0.129.0)
|
138
|
+
selenium-webdriver (~> 4.2)
|
139
|
+
selenium-webdriver (4.24.0)
|
134
140
|
base64 (~> 0.2)
|
141
|
+
logger (~> 1.4)
|
135
142
|
rexml (~> 3.2, >= 3.2.5)
|
136
143
|
rubyzip (>= 1.2.2, < 3.0)
|
137
144
|
websocket (~> 1.0)
|
@@ -141,13 +148,20 @@ GEM
|
|
141
148
|
tzinfo (2.0.6)
|
142
149
|
concurrent-ruby (~> 1.0)
|
143
150
|
unicode-display_width (2.4.2)
|
144
|
-
websocket (1.2.
|
151
|
+
websocket (1.2.11)
|
145
152
|
websocket-driver (0.7.6)
|
146
153
|
websocket-extensions (>= 0.1.0)
|
147
154
|
websocket-extensions (0.1.5)
|
148
155
|
xpath (3.2.0)
|
149
156
|
nokogiri (~> 1.8)
|
150
157
|
zeitwerk (2.6.8)
|
158
|
+
zorki (0.2.6)
|
159
|
+
apparition
|
160
|
+
capybara
|
161
|
+
oj
|
162
|
+
selenium-devtools
|
163
|
+
selenium-webdriver (~> 4.24.0)
|
164
|
+
typhoeus
|
151
165
|
|
152
166
|
PLATFORMS
|
153
167
|
arm64-darwin-21
|
data/forki.gemspec
CHANGED
@@ -36,6 +36,7 @@ Gem::Specification.new do |spec|
|
|
36
36
|
spec.add_dependency "typhoeus" # For making API requests
|
37
37
|
spec.add_dependency "oj" # A faster JSON parser/loader than stdlib
|
38
38
|
spec.add_dependency "selenium-webdriver" # Webdriver selenium
|
39
|
+
spec.add_dependency "zorki" # For scraping instagram pages
|
39
40
|
|
40
41
|
spec.add_development_dependency "thor" # For the generator
|
41
42
|
|
data/lib/forki/scrapers/scraper.rb
CHANGED
@@ -178,10 +178,10 @@ module Forki
|
|
178
178
|
# If either of those two conditions are false, raises an exception
|
179
179
|
def validate_and_load_page(url)
|
180
180
|
Capybara.app_host = "https://www.facebook.com"
|
181
|
-
facebook_hosts = ["facebook.com", "www.facebook.com", "web.facebook.com", "m.facebook.com"]
|
181
|
+
facebook_hosts = ["facebook.com", "www.facebook.com", "web.facebook.com", "m.facebook.com", "l.facebook.com"]
|
182
182
|
parsed_url = URI.parse(url)
|
183
183
|
host = parsed_url.host
|
184
|
-
raise Forki::InvalidUrlError unless facebook_hosts.include?(host)
|
184
|
+
raise Forki::InvalidUrlError.new("Invalid Facebook host: #{host}") unless facebook_hosts.include?(host)
|
185
185
|
|
186
186
|
# Replace the host with a default one to prevent redirect loops that can happen
|
187
187
|
unless parsed_url.host == "www.facebook.com"
|
data/lib/forki/scrapers/user_scraper.rb
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
require "typhoeus"
|
2
|
+
require "zorki"
|
2
3
|
|
4
|
+
# rubocop:disable Metrics/ClassLength
|
3
5
|
module Forki
|
4
6
|
class UserScraper < Scraper
|
5
7
|
# Finds and returns the number of people who like the current page
|
6
8
|
def find_number_of_likes(profile_details_string)
|
7
|
-
likes_pattern = /[0-9,.KM ] likes/
|
9
|
+
# likes_pattern = /[0-9,.KM ] likes/
|
8
10
|
likes_pattern = /(?<num_likes>[0-9,.KM ]+) (l|L)ikes/
|
9
11
|
number_of_likes_match = likes_pattern.match(profile_details_string)
|
10
12
|
|
@@ -100,7 +102,14 @@ module Forki
|
|
100
102
|
|
101
103
|
# Uses GraphQL data and DOM elements to collect information about the current user page
|
102
104
|
def parse(url)
|
105
|
+
# So some reels may actually link to an instagram user?
|
106
|
+
if url.include?("instagram.com")
|
107
|
+
user = get_instagram_user(url)
|
108
|
+
return user
|
109
|
+
end
|
110
|
+
|
103
111
|
validate_and_load_page(url)
|
112
|
+
|
104
113
|
graphql_strings = find_graphql_data_strings(page.html)
|
105
114
|
is_page = graphql_strings.map { |s| JSON.parse(s) }.any? { |o| o.key?("page") }
|
106
115
|
user_details = is_page ? extract_page_details(graphql_strings) : extract_profile_details(graphql_strings)
|
@@ -110,5 +119,38 @@ module Forki
|
|
110
119
|
|
111
120
|
user_details
|
112
121
|
end
|
122
|
+
|
123
|
+
def get_instagram_user(url)
|
124
|
+
uri = URI(url)
|
125
|
+
query = uri.query
|
126
|
+
components = URI.decode_uri_component(query)
|
127
|
+
extracted_url = URI.extract(components).first
|
128
|
+
extracted_uri = URI(extracted_url)
|
129
|
+
username = extracted_uri.to_s.match(/(https:\/\/www.instagram.com\/_u\/[\w]+)/).to_s.split("/").last
|
130
|
+
|
131
|
+
page.quit # I think we need to do this so Zorki can run?
|
132
|
+
zorki_users = Zorki::User.lookup(username)
|
133
|
+
zorki_user = nil
|
134
|
+
|
135
|
+
if zorki_users.count.positive?
|
136
|
+
zorki_user = zorki_users.first
|
137
|
+
else
|
138
|
+
raise ContentUnavailableError
|
139
|
+
end
|
140
|
+
|
141
|
+
|
142
|
+
# Convert a zorki_user into a hash for Forki
|
143
|
+
{
|
144
|
+
name: zorki_user.name,
|
145
|
+
id: username,
|
146
|
+
number_of_followers: zorki_user.number_of_followers,
|
147
|
+
verified: zorki_user.verified,
|
148
|
+
profile: zorki_user.profile,
|
149
|
+
profile_link: zorki_user.profile_link,
|
150
|
+
profile_image_file: zorki_user.profile_image,
|
151
|
+
profile_image_url: zorki_user.profile_image_url,
|
152
|
+
number_of_likes: 0
|
153
|
+
}
|
154
|
+
end
|
113
155
|
end
|
114
156
|
end
|
data/lib/forki/user.rb
CHANGED
data/lib/forki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: forki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.11
|
4
|
+
version: 0.2.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- ''
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-10-
|
11
|
+
date: 2024-10-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: zorki
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :runtime
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
83
97
|
- !ruby/object:Gem::Dependency
|
84
98
|
name: thor
|
85
99
|
requirement: !ruby/object:Gem::Requirement
|