zorki 0.1.3 → 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/zorki/scrapers/post_scraper.rb +4 -1
- data/lib/zorki/scrapers/scraper.rb +21 -6
- data/lib/zorki/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6dd3c28132011c0d9d42875803face311af01a50bbbfa9cf4f07ca89a63029d4
|
4
|
+
data.tar.gz: 91fa19abbd41551e4cb55ff34ce3977795903312c06506bf9bb450796cca7189
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: '087a14d77466f9b5f70014f5f09ae97542d2b8083499917d0397815dd4c298da07da9c4b28fb31e6f7c1949a356e9c4d32b8bb8e7551054ca1fe5c37c972515b'
|
7
|
+
data.tar.gz: 5b46e4e7edb229d89ff5f9bb740dac2959c314e09cf5b9a770c8b16779509266a8bd06d38b1a1208f547cfef22e10fd139b0f32dbea9c89dba5413874399743a
|
@@ -23,6 +23,8 @@ module Zorki
|
|
23
23
|
"data,xdt_api__v1__media__shortcode__web_info,items"
|
24
24
|
)
|
25
25
|
|
26
|
+
graphql_object = graphql_object.first if graphql_object.kind_of?(Array)
|
27
|
+
|
26
28
|
# For pages that have been marked misinfo the structure is very different than not
|
27
29
|
# If it is a clean post then it's just a schema.org thing, but if it's misinfo it's the old
|
28
30
|
# way of deeply nested stuff.
|
@@ -54,7 +56,8 @@ module Zorki
|
|
54
56
|
else
|
55
57
|
# We need to see if this is a single image post or a slideshow. We do that
|
56
58
|
# by looking for a single image, if it's not there, we assume the alternative.
|
57
|
-
|
59
|
+
# debugger
|
60
|
+
# graphql_object = graphql_object["data"]["xdt_api__v1__media__shortcode__web_info"]
|
58
61
|
|
59
62
|
unless graphql_object["items"][0].has_key?("video_versions") && !graphql_object["items"][0]["video_versions"].nil?
|
60
63
|
# Check if there is a slideshow or not
|
@@ -5,9 +5,9 @@ require "dotenv/load"
|
|
5
5
|
require "oj"
|
6
6
|
require "selenium-webdriver"
|
7
7
|
require "logger"
|
8
|
-
require "debug"
|
9
8
|
require "securerandom"
|
10
9
|
require "selenium/webdriver/remote/http/curb"
|
10
|
+
require "debug"
|
11
11
|
|
12
12
|
# 2022-06-07 14:15:23 WARN Selenium [DEPRECATION] [:browser_options] :options as a parameter for driver initialization is deprecated. Use :capabilities with an Array of value capabilities/options if necessary instead.
|
13
13
|
|
@@ -112,12 +112,27 @@ module Zorki
|
|
112
112
|
# TODO: put this before the whole load loop
|
113
113
|
if response_body.nil?
|
114
114
|
doc = Nokogiri::HTML(page.driver.browser.page_source)
|
115
|
-
elements = doc.search("script").find_all do |e|
|
116
|
-
|
117
|
-
end
|
115
|
+
# elements = doc.search("script").find_all do |e|
|
116
|
+
# e.attributes.has_key?("type") && e.attributes["type"].value == "application/ld+json"
|
117
|
+
# end
|
118
|
+
|
119
|
+
elements = doc.search("script").map do |element|
|
120
|
+
element_json = nil
|
121
|
+
begin
|
122
|
+
element_json = JSON.parse(element)
|
123
|
+
|
124
|
+
element_json = element_json["require"].first.last.first["__bbox"]["require"].first.last.last["__bbox"]["result"]["data"]["xdt_api__v1__media__shortcode__web_info"]
|
125
|
+
rescue StandardError => e
|
126
|
+
next
|
127
|
+
end
|
128
|
+
|
129
|
+
element_json
|
130
|
+
end.compact
|
118
131
|
|
119
|
-
|
120
|
-
|
132
|
+
if elements&.empty?
|
133
|
+
raise ContentUnavailableError
|
134
|
+
end
|
135
|
+
return elements
|
121
136
|
end
|
122
137
|
|
123
138
|
raise ContentUnavailableError if response_body.nil?
|
data/lib/zorki/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: zorki
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Christopher Guess
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-07-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: capybara
|