openai-scraper 1.2 → 1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/openai-scraper.rb +44 -8
  3. metadata +18 -18
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3640e18019f73e33ea331aac0b8834e0de4cc49311b43b0d578b5d1428d55ce9
4
- data.tar.gz: 8a6f66459cd9597285c638ecc11751e4bd20722f28dddd1da864e65e81e9b954
3
+ metadata.gz: 8f28769f3837288865877b5448381b9145c9f8383b21e411dfd741b32481d8d3
4
+ data.tar.gz: 3c9fb79c92cef93ae0fdf80f1c9b98394835e9172813309e363b812894cfe58f
5
5
  SHA512:
6
- metadata.gz: dc8c94459de87a753402bfd8ce76877116d1565b328519c5a8a64701bb0b89bf6f5b6fa90b33f9aae79e4ef165b85451b0c12d9b1b8550f875078e729d7f735e
7
- data.tar.gz: 4a20f94caff31aae40299caaa4e06859467719823ea233a8dda77d8226e64526cd25b9a513886519d98c13fcbc5cf4eae1dc6b6d878ec87e3bc71c7bf138fe96
6
+ metadata.gz: 114f7c2122289c09d6b8e4426620b741208cce161dc2794d7720944d82e96339df0d4336c642d097fa03fa0871a7807f2778c79aa2921569df6b04baa8e44791
7
+ data.tar.gz: fa545d7dfd11513dab76eeaabd5567aa88d59c3911ced5152fcaff21566dab8239a538e4b558c99459eaad1797c297e70cfe2110379bf90945991a854b4fb7f4
@@ -22,6 +22,7 @@ end
22
22
  module BlackStack
23
23
  module OpenAIScraper
24
24
  @@openai_apikey = nil
25
+ @@model = nil
25
26
  @@client = nil
26
27
  @@browser = nil
27
28
  @@history = []
@@ -37,17 +38,19 @@ module BlackStack
37
38
 
38
39
  def self.set(h)
39
40
  @@openai_apikey = h[:openai_apikey] if h[:openai_apikey]
41
+ @@model = h[:model] if h[:model]
40
42
  end
41
43
 
42
44
  def self.init
43
45
  @@client = OpenAI::Client.new(access_token: @@openai_apikey)
44
- @@browser = Selenium::WebDriver.for :chrome
46
+ ## set up a mechanize client
47
+ @@browser = Mechanize.new
45
48
  # load history array from the file ./history.json, only if the file exists
46
49
  @@history = JSON.parse(File.read('./history.json')) if File.exist?('./history.json')
47
50
  end
48
51
 
49
52
  def self.finalize
50
- @@browser.quit
53
+ #@@browser.quit
51
54
  # overwrite the file ./history.json with the current history array
52
55
  File.write('./history.json', @@history.to_json)
53
56
  end
@@ -92,7 +95,7 @@ List of Commands:\n
92
95
 
93
96
  response = @@client.chat(
94
97
  parameters: {
95
- model: "gpt-3.5-turbo", # Required.
98
+ model: @@model, # Required.
96
99
  #max_tokens: 6000,
97
100
  temperature: 0.5,
98
101
  messages: [
@@ -138,15 +141,48 @@ List of Commands:\n
138
141
  ],
139
142
  =end
140
143
  })
144
+ #binding.pry
141
145
  raise response.dig("error", "message") if response.dig("error", "message")
142
146
  return response.dig("choices", 0, "message", "content")
143
147
  end
144
148
 
149
+ # get all links of the website, down to a given depth level
150
+ def self.get_links(url, deep = 1, l=nil)
151
+ l = BlackStack::DummyLogger.new(nil) if l.nil?
152
+ ret = []
153
+ return ret if deep<=0
154
+ # go to the URL
155
+ l.logs "level #{deep} - get_links: #{url}... "
156
+ page = @@browser.get(url)
157
+ # get all links
158
+ links = page.search('a')
159
+ # add each link to the array
160
+ links.each do |link|
161
+ txt = link.text.to_s.strip
162
+ ret << { 'href' => link['href'], 'text' => txt }
163
+ end
164
+ # make ret an array of unique hashes
165
+ ret.uniq!
166
+ # remove links that do not belong to the same domain
167
+ ret.reject! { |h| h['href'] !~ /#{URI.parse(url).host}/ }
168
+ # remove links that do not use the http or https protocols
169
+ ret.reject! { |h| h['href'] !~ /^https?:\/\// }
170
+ l.logf 'done'.green
171
+
172
+ # get the links inside each link
173
+ ret.each do |link|
174
+ ret += get_links(link['href'], deep-1, l)
175
+ end
176
+
177
+ # return
178
+ ret
179
+ end
180
+
145
181
  # download the web page, and extract all links.
146
182
  #
147
183
  def self.wl(url)
148
184
  # visit the url
149
- @@browser.navigate.to url
185
+ page = @@browser.get(url)
150
186
 
151
187
  # wait up to 30 seconds for the page to load
152
188
  #wait = Selenium::WebDriver::Wait.new(:timeout => 30)
@@ -157,7 +193,7 @@ List of Commands:\n
157
193
  #wait.until { @@browser.execute_script("return jQuery.active") == 0 }
158
194
 
159
195
  # get all the links
160
- links = @@browser.find_elements(:tag_name, 'a')
196
+ links = page.search('a')
161
197
 
162
198
  # add the links to a json structure
163
199
  h = []
@@ -176,7 +212,7 @@ List of Commands:\n
176
212
  #
177
213
  def self.wt(url)
178
214
  # visit the url
179
- @@browser.navigate.to url
215
+ page = @@browser.get(url)
180
216
 
181
217
  # wait up to 30 seconds for the page to load
182
218
  #wait = Selenium::WebDriver::Wait.new(:timeout => 30)
@@ -186,8 +222,8 @@ List of Commands:\n
186
222
  #wait = Selenium::WebDriver::Wait.new(:timeout => 30)
187
223
  #wait.until { @@browser.execute_script("return jQuery.active") == 0 }
188
224
 
189
- # return the text of the webpage
190
- @@browser.find_element(:tag_name, 'body').text
225
+ # return the text of the body
226
+ page.search('body').text
191
227
  end # def wt
192
228
 
193
229
  # show the prompt and wait for the user input
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: openai-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: '1.2'
4
+ version: '1.4'
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leandro Daniel Sardi
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-07-16 00:00:00.000000000 Z
11
+ date: 2023-07-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -31,65 +31,65 @@ dependencies:
31
31
  - !ruby/object:Gem::Version
32
32
  version: 1.13.10
33
33
  - !ruby/object:Gem::Dependency
34
- name: simple_cloud_logging
34
+ name: mechanize
35
35
  requirement: !ruby/object:Gem::Requirement
36
36
  requirements:
37
37
  - - "~>"
38
38
  - !ruby/object:Gem::Version
39
- version: 1.2.2
39
+ version: 2.8.5
40
40
  - - ">="
41
41
  - !ruby/object:Gem::Version
42
- version: 1.2.2
42
+ version: 2.8.5
43
43
  type: :runtime
44
44
  prerelease: false
45
45
  version_requirements: !ruby/object:Gem::Requirement
46
46
  requirements:
47
47
  - - "~>"
48
48
  - !ruby/object:Gem::Version
49
- version: 1.2.2
49
+ version: 2.8.5
50
50
  - - ">="
51
51
  - !ruby/object:Gem::Version
52
- version: 1.2.2
52
+ version: 2.8.5
53
53
  - !ruby/object:Gem::Dependency
54
- name: colorize
54
+ name: simple_cloud_logging
55
55
  requirement: !ruby/object:Gem::Requirement
56
56
  requirements:
57
57
  - - "~>"
58
58
  - !ruby/object:Gem::Version
59
- version: 0.8.1
59
+ version: 1.2.2
60
60
  - - ">="
61
61
  - !ruby/object:Gem::Version
62
- version: 0.8.1
62
+ version: 1.2.2
63
63
  type: :runtime
64
64
  prerelease: false
65
65
  version_requirements: !ruby/object:Gem::Requirement
66
66
  requirements:
67
67
  - - "~>"
68
68
  - !ruby/object:Gem::Version
69
- version: 0.8.1
69
+ version: 1.2.2
70
70
  - - ">="
71
71
  - !ruby/object:Gem::Version
72
- version: 0.8.1
72
+ version: 1.2.2
73
73
  - !ruby/object:Gem::Dependency
74
- name: selenium-webdriver
74
+ name: colorize
75
75
  requirement: !ruby/object:Gem::Requirement
76
76
  requirements:
77
77
  - - "~>"
78
78
  - !ruby/object:Gem::Version
79
- version: 4.10.0
79
+ version: 0.8.1
80
80
  - - ">="
81
81
  - !ruby/object:Gem::Version
82
- version: 4.10.0
82
+ version: 0.8.1
83
83
  type: :runtime
84
84
  prerelease: false
85
85
  version_requirements: !ruby/object:Gem::Requirement
86
86
  requirements:
87
87
  - - "~>"
88
88
  - !ruby/object:Gem::Version
89
- version: 4.10.0
89
+ version: 0.8.1
90
90
  - - ">="
91
91
  - !ruby/object:Gem::Version
92
- version: 4.10.0
92
+ version: 0.8.1
93
93
  - !ruby/object:Gem::Dependency
94
94
  name: ruby-openai
95
95
  requirement: !ruby/object:Gem::Requirement
@@ -137,7 +137,7 @@ extensions: []
137
137
  extra_rdoc_files: []
138
138
  files:
139
139
  - lib/openai-scraper.rb
140
- homepage: https://rubygems.org/gems/openai-scraper
140
+ homepage: https://github.com/leandrosardi/openai-scraper
141
141
  licenses:
142
142
  - MIT
143
143
  metadata: {}