openai-scraper 1.2 → 1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/openai-scraper.rb +44 -8
- metadata +18 -18
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8f28769f3837288865877b5448381b9145c9f8383b21e411dfd741b32481d8d3
|
4
|
+
data.tar.gz: 3c9fb79c92cef93ae0fdf80f1c9b98394835e9172813309e363b812894cfe58f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 114f7c2122289c09d6b8e4426620b741208cce161dc2794d7720944d82e96339df0d4336c642d097fa03fa0871a7807f2778c79aa2921569df6b04baa8e44791
|
7
|
+
data.tar.gz: fa545d7dfd11513dab76eeaabd5567aa88d59c3911ced5152fcaff21566dab8239a538e4b558c99459eaad1797c297e70cfe2110379bf90945991a854b4fb7f4
|
data/lib/openai-scraper.rb
CHANGED
@@ -22,6 +22,7 @@ end
|
|
22
22
|
module BlackStack
|
23
23
|
module OpenAIScraper
|
24
24
|
@@openai_apikey = nil
|
25
|
+
@@model = nil
|
25
26
|
@@client = nil
|
26
27
|
@@browser = nil
|
27
28
|
@@history = []
|
@@ -37,17 +38,19 @@ module BlackStack
|
|
37
38
|
|
38
39
|
def self.set(h)
|
39
40
|
@@openai_apikey = h[:openai_apikey] if h[:openai_apikey]
|
41
|
+
@@model = h[:model] if h[:model]
|
40
42
|
end
|
41
43
|
|
42
44
|
def self.init
|
43
45
|
@@client = OpenAI::Client.new(access_token: @@openai_apikey)
|
44
|
-
|
46
|
+
## setup a mechanize client
|
47
|
+
@@browser = Mechanize.new
|
45
48
|
# load history array from the file ./history.json, only if the file exists
|
46
49
|
@@history = JSON.parse(File.read('./history.json')) if File.exist?('./history.json')
|
47
50
|
end
|
48
51
|
|
49
52
|
def self.finalize
|
50
|
-
|
53
|
+
#@@browser.quit
|
51
54
|
# overrite the file ./history.json with the current history array
|
52
55
|
File.write('./history.json', @@history.to_json)
|
53
56
|
end
|
@@ -92,7 +95,7 @@ List of Commands:\n
|
|
92
95
|
|
93
96
|
response = @@client.chat(
|
94
97
|
parameters: {
|
95
|
-
model:
|
98
|
+
model: @@model, # Required.
|
96
99
|
#max_tokens: 6000,
|
97
100
|
temperature: 0.5,
|
98
101
|
messages: [
|
@@ -138,15 +141,48 @@ List of Commands:\n
|
|
138
141
|
],
|
139
142
|
=end
|
140
143
|
})
|
144
|
+
#binding.pry
|
141
145
|
raise response.dig("error", "message") if response.dig("error", "message")
|
142
146
|
return response.dig("choices", 0, "message", "content")
|
143
147
|
end
|
144
148
|
|
149
|
+
# get all links of the webiste, at a certainly deep level
|
150
|
+
def self.get_links(url, deep = 1, l=nil)
|
151
|
+
l = BlackStack::DummyLogger.new(nil) if l.nil?
|
152
|
+
ret = []
|
153
|
+
return ret if deep<=0
|
154
|
+
# go to the URL
|
155
|
+
l.logs "level #{deep} - get_links: #{url}... "
|
156
|
+
page = @@browser.get(url)
|
157
|
+
# get all links
|
158
|
+
links = page.search('a')
|
159
|
+
# add each link to the array
|
160
|
+
links.each do |link|
|
161
|
+
txt = link.text.to_s.strip
|
162
|
+
ret << { 'href' => link['href'], 'text' => txt }
|
163
|
+
end
|
164
|
+
# make ret an array of unique hashes
|
165
|
+
ret.uniq!
|
166
|
+
# remove links that are not belonging the same domain
|
167
|
+
ret.reject! { |h| h['href'] !~ /#{URI.parse(url).host}/ }
|
168
|
+
# remove links that are not belonging the protocols http or https
|
169
|
+
ret.reject! { |h| h['href'] !~ /^https?:\/\// }
|
170
|
+
l.logf 'done'.green
|
171
|
+
|
172
|
+
# get the links inside each link
|
173
|
+
ret.each do |link|
|
174
|
+
ret += get_links(link['href'], deep-1, l)
|
175
|
+
end
|
176
|
+
|
177
|
+
# return
|
178
|
+
ret
|
179
|
+
end
|
180
|
+
|
145
181
|
# download the web page, and extract all links.
|
146
182
|
#
|
147
183
|
def self.wl(url)
|
148
184
|
# visit the url
|
149
|
-
@@browser.
|
185
|
+
page = @@browser.get(url)
|
150
186
|
|
151
187
|
# wait up to 30 seconds for the page to load
|
152
188
|
#wait = Selenium::WebDriver::Wait.new(:timeout => 30)
|
@@ -157,7 +193,7 @@ List of Commands:\n
|
|
157
193
|
#wait.until { @@browser.execute_script("return jQuery.active") == 0 }
|
158
194
|
|
159
195
|
# get all the links
|
160
|
-
links =
|
196
|
+
links = page.search('a')
|
161
197
|
|
162
198
|
# add the links to a json structure
|
163
199
|
h = []
|
@@ -176,7 +212,7 @@ List of Commands:\n
|
|
176
212
|
#
|
177
213
|
def self.wt(url)
|
178
214
|
# visit the url
|
179
|
-
@@browser.
|
215
|
+
page = @@browser.get(url)
|
180
216
|
|
181
217
|
# wait up to 30 seconds for the page to load
|
182
218
|
#wait = Selenium::WebDriver::Wait.new(:timeout => 30)
|
@@ -186,8 +222,8 @@ List of Commands:\n
|
|
186
222
|
#wait = Selenium::WebDriver::Wait.new(:timeout => 30)
|
187
223
|
#wait.until { @@browser.execute_script("return jQuery.active") == 0 }
|
188
224
|
|
189
|
-
# return the text of the
|
190
|
-
|
225
|
+
# return the text of the body
|
226
|
+
page.search('body').text
|
191
227
|
end # def wt
|
192
228
|
|
193
229
|
# show the promt and wait for the user input
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: openai-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '1.
|
4
|
+
version: '1.4'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leandro Daniel Sardi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-07-
|
11
|
+
date: 2023-07-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -31,65 +31,65 @@ dependencies:
|
|
31
31
|
- !ruby/object:Gem::Version
|
32
32
|
version: 1.13.10
|
33
33
|
- !ruby/object:Gem::Dependency
|
34
|
-
name:
|
34
|
+
name: mechanize
|
35
35
|
requirement: !ruby/object:Gem::Requirement
|
36
36
|
requirements:
|
37
37
|
- - "~>"
|
38
38
|
- !ruby/object:Gem::Version
|
39
|
-
version:
|
39
|
+
version: 2.8.5
|
40
40
|
- - ">="
|
41
41
|
- !ruby/object:Gem::Version
|
42
|
-
version:
|
42
|
+
version: 2.8.5
|
43
43
|
type: :runtime
|
44
44
|
prerelease: false
|
45
45
|
version_requirements: !ruby/object:Gem::Requirement
|
46
46
|
requirements:
|
47
47
|
- - "~>"
|
48
48
|
- !ruby/object:Gem::Version
|
49
|
-
version:
|
49
|
+
version: 2.8.5
|
50
50
|
- - ">="
|
51
51
|
- !ruby/object:Gem::Version
|
52
|
-
version:
|
52
|
+
version: 2.8.5
|
53
53
|
- !ruby/object:Gem::Dependency
|
54
|
-
name:
|
54
|
+
name: simple_cloud_logging
|
55
55
|
requirement: !ruby/object:Gem::Requirement
|
56
56
|
requirements:
|
57
57
|
- - "~>"
|
58
58
|
- !ruby/object:Gem::Version
|
59
|
-
version:
|
59
|
+
version: 1.2.2
|
60
60
|
- - ">="
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version:
|
62
|
+
version: 1.2.2
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version:
|
69
|
+
version: 1.2.2
|
70
70
|
- - ">="
|
71
71
|
- !ruby/object:Gem::Version
|
72
|
-
version:
|
72
|
+
version: 1.2.2
|
73
73
|
- !ruby/object:Gem::Dependency
|
74
|
-
name:
|
74
|
+
name: colorize
|
75
75
|
requirement: !ruby/object:Gem::Requirement
|
76
76
|
requirements:
|
77
77
|
- - "~>"
|
78
78
|
- !ruby/object:Gem::Version
|
79
|
-
version:
|
79
|
+
version: 0.8.1
|
80
80
|
- - ">="
|
81
81
|
- !ruby/object:Gem::Version
|
82
|
-
version:
|
82
|
+
version: 0.8.1
|
83
83
|
type: :runtime
|
84
84
|
prerelease: false
|
85
85
|
version_requirements: !ruby/object:Gem::Requirement
|
86
86
|
requirements:
|
87
87
|
- - "~>"
|
88
88
|
- !ruby/object:Gem::Version
|
89
|
-
version:
|
89
|
+
version: 0.8.1
|
90
90
|
- - ">="
|
91
91
|
- !ruby/object:Gem::Version
|
92
|
-
version:
|
92
|
+
version: 0.8.1
|
93
93
|
- !ruby/object:Gem::Dependency
|
94
94
|
name: ruby-openai
|
95
95
|
requirement: !ruby/object:Gem::Requirement
|
@@ -137,7 +137,7 @@ extensions: []
|
|
137
137
|
extra_rdoc_files: []
|
138
138
|
files:
|
139
139
|
- lib/openai-scraper.rb
|
140
|
-
homepage: https://
|
140
|
+
homepage: https://github.com/leandrosardi/openai-scraper
|
141
141
|
licenses:
|
142
142
|
- MIT
|
143
143
|
metadata: {}
|