bots 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6) hide show
  1. checksums.yaml +7 -0
  2. data/lib/base.rb +36 -0
  3. data/lib/bots.rb +9 -0
  4. data/lib/google.rb +80 -0
  5. data/lib/indeed.rb +52 -0
  6. metadata +146 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: be4c3337063af2d514fd1016c94d66385fa84dbde27dbfa5174d5b0908c9efb1
4
+ data.tar.gz: 13ef3dccd336269fdb1960f243c8b7dd08abf782ba89cfbf935c4e802a29ca45
5
+ SHA512:
6
+ metadata.gz: 354704a255a80def04f1993d3a73d5b358fedfeeda387278f47ab995ed10d2afced94fe175a15bc8dd7c55b1209798598b55a5f2d2071424f11613091cd87813
7
+ data.tar.gz: c1fae508d2b55269f0039524ee4e87714d7cc7244922faa83ac5b6c374abc29cdaab34355059ff92e5d6a0469be44900ac628fcfe55da80def3993549a2bcf27
data/lib/base.rb ADDED
@@ -0,0 +1,36 @@
module BlackStack
  module Bots
    # Base class shared by every bot. It stores the optional proxy settings
    # and the state used to rotate through the proxy's ports.
    class Bot
      attr_accessor :ip         # ip address of proxy (nil when no proxy is configured)
      attr_accessor :user       # user of proxy
      attr_accessor :password   # password of proxy
      attr_accessor :ports      # array of proxy ports (empty when no proxy is configured)
      attr_accessor :port_index # index in `ports` of the port used last (-1 before the first use)

      # Build a bot.
      #
      # @param h [Hash, nil] options. The only key read here is :proxy, a hash
      #   with the keys :ip, :user, :password, :port_from and :port_to.
      #   When :proxy is missing (or `h` is nil) the bot works without a proxy.
      def initialize(h = {})
        proxy = (h || {})[:proxy]
        # Start from an empty list, so code that rotates ports can always call
        # `ports.length`, even on a bot created without a proxy.
        self.ports = []
        unless proxy.nil?
          self.ip = proxy[:ip]
          self.user = proxy[:user]
          self.password = proxy[:password]
          # every port in the inclusive range port_from..port_to
          self.ports = (proxy[:port_from]..proxy[:port_to]).to_a
        end
        self.port_index = -1
      end # initialize

      # return true if the bot is using a proxy
      def proxy?
        !self.ip.nil?
      end
    end # Bot

    # Bot that keeps a Mechanize agent.
    class MechanizeBot < BlackStack::Bots::Bot
      attr_accessor :agent # mechanize agent
    end # MechanizeBot

    # Bot that keeps a Selenium driver.
    class SeleniumBot < BlackStack::Bots::Bot
      attr_accessor :driver # selenium driver
    end # SeleniumBot

  end # Bots
end # BlackStack
data/lib/bots.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'mechanize'
2
+ require 'selenium-webdriver'
3
+ require 'simple_cloud_logging'
4
+ require 'colorize'
5
+ require 'csv'
6
+
7
+ require_relative './base'
8
+ require_relative './google'
9
+ require_relative './indeed'
data/lib/google.rb ADDED
@@ -0,0 +1,80 @@
module BlackStack
  module Bots
    # Bot that runs a Google search through Mechanize and scrapes the results.
    class Google < BlackStack::Bots::MechanizeBot
      # Submit `query` to Google and collect the results of the page returned.
      #
      # A new Mechanize agent is created on every call and is always shut down
      # before returning, even when a request or the parsing raises.
      #
      # @param query [String] text to write in the search box
      # @return [Array<Hash>] one hash per result, with the keys :title, :url
      #   and :description. :url is nil when the link has no `q` parameter.
      def search(query)
        ret = []
        # initialize mechanize agent
        self.agent = Mechanize.new
        begin
          # Rotate to the next proxy port. This is done only when a proxy is
          # configured: a bot built without :proxy has no ports to rotate.
          if self.proxy?
            self.port_index += 1
            self.port_index = 0 if self.port_index >= self.ports.length
            self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password)
          end
          # grab the home page and submit the search form
          page = self.agent.get('http://www.google.com/')
          google_form = page.form('f')
          google_form.q = query
          page = self.agent.submit(google_form, google_form.buttons.first)
          # iterate the h3 headings: each result title is read from an h3
          page.search('h3').each do |h3|
            title = h3.text.strip
            # the link of the result is looked up 3 levels above the h3
            a = h3.parent.parent.parent
            href = a['href']
            # an h3 that is not wrapped by a link is not a result: skip it
            next if href.nil?
            # the description is the text of the last div found 2 levels above the link
            block = a.parent.parent.css('/div').last
            descr = block.nil? ? '' : block.text.strip
            # get the value of the parameter `q` from the querystring of the link
            uri = URI.parse(href)
            params = CGI.parse(uri.query.to_s) # to_s: the link may have no querystring
            url = params['q'].first
            # add to the array of results
            ret << { :title=>title, :url=>url, :description=>descr }
          end
        ensure
          # destroy mechanize agent
          self.agent.shutdown
        end
        # return
        ret
      end # search
    end # Google
=begin
    # NOTE: this class is commented out, so it is never loaded.
    class GoogleEnrichment < BlackStack::Bots::Google

      # get an array of domains that may be the domain of the company
      def possible_domains_for_company(company_name)
        search = "\"#{company_name}\" home page"
        self.search(search).map { |r|
          # get domain from url using URI, and removing www., and downcasing.
          URI.parse(r[:url]).host.gsub(/^www\./, '').downcase
        }
      end # possible_domains_for_company

      # find email from fname, lname and cname
      def find_email(fname, lname, cname)
        domains = self.possible_domains_for_company(cname)
        if domains.size > 0
          domains.each { |domain|
            # array of possible emails
            emails = []
            #emails << "#{fname}@#{domain}"
            #emails << "#{lname}@#{domain}"
            emails << "#{fname}#{lname}@#{domain}"
            emails << "#{fname}.#{lname}@#{domain}"
            emails << "#{fname}_#{lname}@#{domain}"
            emails << "#{fname[0]}#{lname}@#{domain}"
            # iterate array of possible emails
            emails.each { |email|
              # search for that email
              search = "\"#{email}\""
              results = self.search(search)
              # find results with the exact email in the description.
              # `any?` is required here: `select` returns an array, which is
              # always truthy, so the first candidate would always be returned.
              # The email is escaped because it contains regex metacharacters ('.').
              return email if results.any? { |result| result[:description].downcase =~ /\b#{Regexp.escape(email.downcase)}\b/ }
            }
          }
        end
        return nil
      end # find_email

    end # GoogleEnrichment
=end
  end # Bots
end # BlackStack

data/lib/indeed.rb ADDED
@@ -0,0 +1,52 @@
1
+ require_relative './base'
module BlackStack
  module Bots
    # Bot that scrapes the job postings listed in an Indeed results page,
    # using a Chrome browser driven by Selenium.
    class Indeed < BlackStack::Bots::SeleniumBot

      # Open `url` in a new Chrome browser and return the job postings found.
      #
      # The browser is always closed before returning, even when loading the
      # page or finding the list of results raises.
      #
      # @param url [String] address of the results page to open
      # @param page [Integer] not used by this method; kept so existing calls
      #   that pass it keep working
      # @return [Array<Hash>] one hash per posting, with the keys :title, :url,
      #   :company, :location, :salary, :posted and :snippets. A field that is
      #   not found in the posting is an empty string (:snippets is an array).
      def results(url, page=1)
        ret = []
        # launch a chrome browser with selenium
        self.driver = Selenium::WebDriver.for :chrome
        begin
          # TODO: set a proxy with user and password
          self.driver.get url
          # get the ul list with class .jobsearch-ResultsList
          ul = self.driver.find_element(:class=>'jobsearch-ResultsList')
          # scroll to the bottom
          self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
          # iterate li elements
          ul.find_elements('css', 'li').each { |li|
            # only the li elements with exactly one job title link are postings
            links = li.find_elements('css', 'a.jcs-JobTitle')
            next unless links.size == 1
            link = links.first
            ret << {
              :title => link.text,
              :url => link.attribute('href'),
              :company => first_text(li, 'span.companyName'),
              :location => first_text(li, 'div.companyLocation'),
              :salary => first_text(li, 'div.salary-snippet-container'),
              # remove the duplicated label from the text of the date
              :posted => first_text(li, 'span.date').gsub("Posted\nPosted", '').strip,
              :snippets => li.find_elements('css','div.job-snippet > ul > li').map { |snippet| snippet.text },
            }
          }
        ensure
          # destroy selenium browser
          self.driver.quit
        end
        # return
        ret
      end # results

      private

      # Return the text of the first element inside `element` that matches the
      # css `selector`, or an empty string when there is no match.
      def first_text(element, selector)
        o = element.find_elements('css', selector).first
        o ? o.text : ''
      end # first_text
    end # Indeed
  end # Bots
end # BlackStack
metadata ADDED
@@ -0,0 +1,146 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bots
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Leandro Daniel Sardi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-08-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: simple_cloud_logging
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.2.2
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.2.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 1.2.2
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.2.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: csv
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 3.2.7
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 3.2.7
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 3.2.7
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 3.2.7
53
+ - !ruby/object:Gem::Dependency
54
+ name: mechanize
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: 2.8.5
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 2.8.5
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: 2.8.5
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 2.8.5
73
+ - !ruby/object:Gem::Dependency
74
+ name: selenium-webdriver
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: 4.10.0
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 4.10.0
83
+ type: :runtime
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 4.10.0
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: 4.10.0
93
+ - !ruby/object:Gem::Dependency
94
+ name: colorize
95
+ requirement: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: 0.8.1
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: 0.8.1
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: 0.8.1
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: 0.8.1
113
+ description: Ruby gem for scraping information from the public web.
114
+ email: leandro@connectionsphere.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - lib/base.rb
120
+ - lib/bots.rb
121
+ - lib/google.rb
122
+ - lib/indeed.rb
123
+ homepage: https://rubygems.org/gems/bots
124
+ licenses:
125
+ - MIT
126
+ metadata: {}
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ required_rubygems_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubygems_version: 3.3.7
143
+ signing_key:
144
+ specification_version: 4
145
+ summary: Ruby gem for scraping information from the public web.
146
+ test_files: []