openai-scraper 1.2 → 1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/openai-scraper.rb +44 -8
  3. metadata +17 -17
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3640e18019f73e33ea331aac0b8834e0de4cc49311b43b0d578b5d1428d55ce9
4
- data.tar.gz: 8a6f66459cd9597285c638ecc11751e4bd20722f28dddd1da864e65e81e9b954
3
+ metadata.gz: bd10d79aa9bad5ef74d0f5e11869c367a5bf46e5c2031728efa74f0fc74f418d
4
+ data.tar.gz: 6a428a3c5135ed7effe14ef69fdf25631b39ae923593cf10caa5d3770b8d8db5
5
5
  SHA512:
6
- metadata.gz: dc8c94459de87a753402bfd8ce76877116d1565b328519c5a8a64701bb0b89bf6f5b6fa90b33f9aae79e4ef165b85451b0c12d9b1b8550f875078e729d7f735e
7
- data.tar.gz: 4a20f94caff31aae40299caaa4e06859467719823ea233a8dda77d8226e64526cd25b9a513886519d98c13fcbc5cf4eae1dc6b6d878ec87e3bc71c7bf138fe96
6
+ metadata.gz: d30640cfd1bb040c6881274ce47ab0727855a285823ba0116349a8f7f367c90fc197412b712a6013fe36abff87ee3dccbf21071e728d2e26128e62df3249f622
7
+ data.tar.gz: e5cd6fc52e3bf5d0a59cee01e9d99974e97edc95e08be82f966718839ebdf25901263e58c9c10e1b136a2355a785e061c1f16d3c6b748edf5114d412ff2e06da
@@ -22,6 +22,7 @@ end
22
22
  module BlackStack
23
23
  module OpenAIScraper
24
24
  @@openai_apikey = nil
25
+ @@model = nil
25
26
  @@client = nil
26
27
  @@browser = nil
27
28
  @@history = []
@@ -37,17 +38,19 @@ module BlackStack
37
38
 
38
39
  def self.set(h)
39
40
  @@openai_apikey = h[:openai_apikey] if h[:openai_apikey]
41
+ @@model = h[:model] if h[:model]
40
42
  end
41
43
 
42
44
  def self.init
43
45
  @@client = OpenAI::Client.new(access_token: @@openai_apikey)
44
- @@browser = Selenium::WebDriver.for :chrome
46
+ ## set up a Mechanize client
47
+ @@browser = Mechanize.new
45
48
  # load history array from the file ./history.json, only if the file exists
46
49
  @@history = JSON.parse(File.read('./history.json')) if File.exist?('./history.json')
47
50
  end
48
51
 
49
52
  def self.finalize
50
- @@browser.quit
53
+ #@@browser.quit
51
54
  # overwrite the file ./history.json with the current history array
52
55
  File.write('./history.json', @@history.to_json)
53
56
  end
@@ -92,7 +95,7 @@ List of Commands:\n
92
95
 
93
96
  response = @@client.chat(
94
97
  parameters: {
95
- model: "gpt-3.5-turbo", # Required.
98
+ model: @@model, # Required.
96
99
  #max_tokens: 6000,
97
100
  temperature: 0.5,
98
101
  messages: [
@@ -138,15 +141,48 @@ List of Commands:\n
138
141
  ],
139
142
  =end
140
143
  })
144
+ #binding.pry
141
145
  raise response.dig("error", "message") if response.dig("error", "message")
142
146
  return response.dig("choices", 0, "message", "content")
143
147
  end
144
148
 
149
+ # get all links of the website, down to a given depth level
150
+ def self.get_links(url, deep = 1, l=nil)
151
+ l = BlackStack::DummyLogger.new(nil) if l.nil?
152
+ ret = []
153
+ return ret if deep<=0
154
+ # go to the URL
155
+ l.logs "level #{deep} - get_links: #{url}... "
156
+ page = @@browser.get(url)
157
+ # get all links
158
+ links = page.search('a')
159
+ # add each link to the array
160
+ links.each do |link|
161
+ txt = link.text.to_s.strip
162
+ ret << { 'href' => link['href'], 'text' => txt }
163
+ end
164
+ # make ret an array of unique hashes
165
+ ret.uniq!
166
+ # remove links that do not belong to the same domain
167
+ ret.reject! { |h| h['href'] !~ /#{URI.parse(url).host}/ }
168
+ # remove links that do not use the http or https protocols
169
+ ret.reject! { |h| h['href'] !~ /^https?:\/\// }
170
+ l.logf 'done'.green
171
+
172
+ # get the links inside each link
173
+ ret.each do |link|
174
+ ret += get_links(link['href'], deep-1, l)
175
+ end
176
+
177
+ # return
178
+ ret
179
+ end
180
+
145
181
  # download the web page, and extract all links.
146
182
  #
147
183
  def self.wl(url)
148
184
  # visit the url
149
- @@browser.navigate.to url
185
+ page = @@browser.get(url)
150
186
 
151
187
  # wait up to 30 seconds for the page to load
152
188
  #wait = Selenium::WebDriver::Wait.new(:timeout => 30)
@@ -157,7 +193,7 @@ List of Commands:\n
157
193
  #wait.until { @@browser.execute_script("return jQuery.active") == 0 }
158
194
 
159
195
  # get all the links
160
- links = @@browser.find_elements(:tag_name, 'a')
196
+ links = page.search('a')
161
197
 
162
198
  # add the links to a json structure
163
199
  h = []
@@ -176,7 +212,7 @@ List of Commands:\n
176
212
  #
177
213
  def self.wt(url)
178
214
  # visit the url
179
- @@browser.navigate.to url
215
+ page = @@browser.get(url)
180
216
 
181
217
  # wait up to 30 seconds for the page to load
182
218
  #wait = Selenium::WebDriver::Wait.new(:timeout => 30)
@@ -186,8 +222,8 @@ List of Commands:\n
186
222
  #wait = Selenium::WebDriver::Wait.new(:timeout => 30)
187
223
  #wait.until { @@browser.execute_script("return jQuery.active") == 0 }
188
224
 
189
- # return the text of the webpage
190
- @@browser.find_element(:tag_name, 'body').text
225
+ # return the text of the body
226
+ page.search('body').text
191
227
  end # def wt
192
228
 
193
229
  # show the prompt and wait for the user input
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: openai-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.2'
4
+ version: '1.3'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-07-16 00:00:00.000000000 Z
11
+ date: 2023-07-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -31,65 +31,65 @@ dependencies:
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.13.10
33
33
  - !ruby/object:Gem::Dependency
34
- name: simple_cloud_logging
34
+ name: mechanize
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
- version: 1.2.2
39
+ version: 2.8.5
40
40
  - - ">="
41
41
  - !ruby/object:Gem::Version
42
- version: 1.2.2
42
+ version: 2.8.5
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
47
  - - "~>"
48
48
  - !ruby/object:Gem::Version
49
- version: 1.2.2
49
+ version: 2.8.5
50
50
  - - ">="
51
51
  - !ruby/object:Gem::Version
52
- version: 1.2.2
52
+ version: 2.8.5
53
53
  - !ruby/object:Gem::Dependency
54
- name: colorize
54
+ name: simple_cloud_logging
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
57
  - - "~>"
58
58
  - !ruby/object:Gem::Version
59
- version: 0.8.1
59
+ version: 1.2.2
60
60
  - - ">="
61
61
  - !ruby/object:Gem::Version
62
- version: 0.8.1
62
+ version: 1.2.2
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: 0.8.1
69
+ version: 1.2.2
70
70
  - - ">="
71
71
  - !ruby/object:Gem::Version
72
- version: 0.8.1
72
+ version: 1.2.2
73
73
  - !ruby/object:Gem::Dependency
74
- name: selenium-webdriver
74
+ name: colorize
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
77
  - - "~>"
78
78
  - !ruby/object:Gem::Version
79
- version: 4.10.0
79
+ version: 0.8.1
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
- version: 4.10.0
82
+ version: 0.8.1
83
83
  type: :runtime
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: 4.10.0
89
+ version: 0.8.1
90
90
  - - ">="
91
91
  - !ruby/object:Gem::Version
92
- version: 4.10.0
92
+ version: 0.8.1
93
93
  - !ruby/object:Gem::Dependency
94
94
  name: ruby-openai
95
95
  requirement: !ruby/object:Gem::Requirement