bots 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (6) hide show
  1. checksums.yaml +7 -0
  2. data/lib/base.rb +36 -0
  3. data/lib/bots.rb +9 -0
  4. data/lib/google.rb +80 -0
  5. data/lib/indeed.rb +52 -0
  6. metadata +146 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: be4c3337063af2d514fd1016c94d66385fa84dbde27dbfa5174d5b0908c9efb1
4
+ data.tar.gz: 13ef3dccd336269fdb1960f243c8b7dd08abf782ba89cfbf935c4e802a29ca45
5
+ SHA512:
6
+ metadata.gz: 354704a255a80def04f1993d3a73d5b358fedfeeda387278f47ab995ed10d2afced94fe175a15bc8dd7c55b1209798598b55a5f2d2071424f11613091cd87813
7
+ data.tar.gz: c1fae508d2b55269f0039524ee4e87714d7cc7244922faa83ac5b6c374abc29cdaab34355059ff92e5d6a0469be44900ac628fcfe55da80def3993549a2bcf27
data/lib/base.rb ADDED
@@ -0,0 +1,36 @@
1
+ module BlackStack
2
+ module Bots
3
+ class Bot # base class of every bot; holds the optional proxy settings read from the constructor hash
4
+ attr_accessor :ip # IP address of the proxy; stays nil when no proxy was given
5
+ attr_accessor :user # user name sent to the proxy
6
+ attr_accessor :password # password sent to the proxy
7
+ attr_accessor :ports # array of proxy ports; stays nil when no proxy was given
8
+ attr_accessor :port_index # index into ports of the port currently in use; -1 until a port is picked
9
+
10
+ def initialize(h) # h is indexed with :proxy (presumably a Hash); the :proxy entry is optional
11
+ # when a proxy is given, copy its settings and expand port_from..port_to (inclusive) into an array, e.g. 4000..4249
12
+ unless h[:proxy].nil?
13
+ self.ip = h[:proxy][:ip]
14
+ self.user = h[:proxy][:user]
15
+ self.password = h[:proxy][:password]
16
+ self.ports = (h[:proxy][:port_from]..h[:proxy][:port_to]).to_a
17
+ end
18
+ self.port_index = -1 # no port selected yet; Google#search increments this before its first use
19
+ end # initialize
20
+
21
+ # return true if a proxy IP was configured (ip is not nil)
22
+ def proxy?
23
+ !self.ip.nil?
24
+ end
25
+ end # Bot
26
+
27
+ class MechanizeBot < BlackStack::Bots::Bot # bot that scrapes through a mechanize agent
28
+ attr_accessor :agent # mechanize agent; Google#search assigns a fresh Mechanize.new on every call
29
+ end # MechanizeBot
30
+
31
+ class SeleniumBot < BlackStack::Bots::Bot # bot that scrapes through a selenium driver
32
+ attr_accessor :driver # selenium driver; NOTE(review): Indeed#results uses a local variable instead of this accessor
33
+ end # SeleniumBot
34
+
35
+ end # Bots
36
+ end # BlackStack
data/lib/bots.rb ADDED
@@ -0,0 +1,9 @@
1
+ require 'mechanize'
2
+ require 'selenium-webdriver'
3
+ require 'simple_cloud_logging'
4
+ require 'colorize'
5
+ require 'csv'
6
+
7
+ require_relative './base'
8
+ require_relative './google'
9
+ require_relative './indeed'
data/lib/google.rb ADDED
@@ -0,0 +1,80 @@
1
+ module BlackStack
2
+ module Bots
3
+ class Google < BlackStack::Bots::MechanizeBot # bot that runs a Google search through mechanize
4
+ def search(query) # submit query through the Google home-page form; returns an array of { :title, :url, :description } hashes
5
+ ret = []
6
+ # create a fresh mechanize agent for this search
7
+ self.agent = Mechanize.new
8
+ # move to the next proxy port, wrapping back to the first one; the proxy itself is applied only when proxy? is true
9
+ self.port_index += 1
10
+ self.port_index = 0 if self.port_index >= self.ports.length # NOTE(review): ports stays nil when the bot was built without h[:proxy], so .length would raise NoMethodError - confirm and guard with proxy?
11
+ self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
12
+ # load the home page, fill the q field of the form named 'f' and submit it with the form's first button
13
+ page = agent.get('http://www.google.com/')
14
+ google_form = page.form('f')
15
+ google_form.q = query
16
+ page = agent.submit(google_form, google_form.buttons.first)
17
+ # iterate every h3 element of the results page; each one is treated as the title of a result
18
+ page.search('h3').each do |h3|
19
+ # the stripped text of the h3 is the result title
20
+ title = h3.text.strip
21
+ # the third ancestor of the h3 is taken as the result link - NOTE(review): this depends on the markup Google serves; verify
22
+ a = h3.parent.parent.parent
23
+ href = a['href']
24
+ descr = a.parent.parent.css('/div').last.text.strip
25
+ # read the value of the q parameter from the query string of href; NOTE(review): uri.query is nil when href has no query string, and CGI is not required by the files shown here - confirm
26
+ uri = URI.parse(href)
27
+ params = CGI.parse(uri.query)
28
+ url = params['q'].first
29
+ # append this result to the array of results
30
+ ret << { :title=>title, :url=>url, :description=>descr }
31
+ end
32
+ # shut the mechanize agent down; NOTE(review): this is skipped if any step above raises
33
+ self.agent.shutdown
34
+ # return the array of results
35
+ ret
36
+ end # search
37
+ end # Google
38
+ =begin
39
+ class GoogleEnrichment < BlackStack::Bots::Google
40
+
41
+ # search Google for "<company_name>" home page and return the host of every result url, without a leading www. and downcased
42
+ def possible_domains_for_company(company_name)
43
+ search = "\"#{company_name}\" home page"
44
+ self.search(search).map { |r|
45
+ # get domain from url using URI, removing the leading www. and downcasing
46
+ URI.parse(r[:url]).host.gsub(/^www\./, '').downcase
47
+ }
48
+ end # possible_domains_for_company
49
+
50
+ # guess the email of a person from first name (fname), last name (lname) and company name (cname); returns an email string or nil
51
+ def find_email(fname, lname, cname)
52
+ domains = self.possible_domains_for_company(cname)
53
+ if domains.size > 0
54
+ domains.each { |domain|
55
+ # array of candidate emails built from common naming patterns
56
+ emails = []
57
+ #emails << "#{fname}@#{domain}"
58
+ #emails << "#{lname}@#{domain}"
59
+ emails << "#{fname}#{lname}@#{domain}"
60
+ emails << "#{fname}.#{lname}@#{domain}"
61
+ emails << "#{fname}_#{lname}@#{domain}"
62
+ emails << "#{fname[0]}#{lname}@#{domain}"
63
+ # iterate the candidate emails
64
+ emails.each { |email|
65
+ # search Google for the quoted candidate email
66
+ search = "\"#{email}\""
67
+ results = self.search(search)
68
+ # keep results whose description contains the email; NOTE(review): select returns an Array, which is always truthy, so the first candidate is always returned - any? was probably intended; the email is also interpolated into the regex unescaped
69
+ return email if results.select { |result| result[:description].downcase =~ /\b#{email.downcase}\b/ }
70
+ }
71
+ }
72
+ end
73
+ return nil
74
+ end # find_email
75
+
76
+ end # GoogleEnrichment
77
+ =end
78
+ end # Bots
79
+ end # BlackStack
80
+
data/lib/indeed.rb ADDED
@@ -0,0 +1,52 @@
1
+ require_relative './base'
2
+ module BlackStack
3
+ module Bots
4
+ class Indeed < BlackStack::Bots::SeleniumBot # bot that scrapes job listings from an Indeed results page
5
+
6
+ def results(url, page=1) # load url in Chrome and return an array with one hash per job card; NOTE(review): page is never used
7
+ ret = []
8
+ # launch a chrome browser with selenium
9
+ driver = Selenium::WebDriver.for :chrome
10
+ browser = driver.browser # NOTE(review): browser is never read afterwards
11
+ # TODO: set a proxy with user and password
12
+ driver.get url
13
+ # get the element with class jobsearch-ResultsList, which holds the job cards
14
+ ul = driver.find_element(:class=>'jobsearch-ResultsList')
15
+ # scroll to the bottom of the page
16
+ driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
17
+ # iterate the li elements found under the results list
18
+ i = 0 # counter incremented for every li but never read
19
+ ul.find_elements('css', 'li').each { |li|
20
+ h = {}
21
+ i += 1
22
+ links = li.find_elements('css', 'a.jcs-JobTitle')
23
+ if links.size == 1 # only an li holding exactly one job-title link is recorded as a job
24
+ link = li.find_element('css', 'a.jcs-JobTitle')
25
+ h[:title] = link.text
26
+ h[:url] = link.attribute('href')
27
+
28
+ o = li.find_elements('css','span.companyName').first
29
+ h[:company] = o ? o.text : '' # empty string when the element is missing; same pattern for the fields below
30
+
31
+ o = li.find_elements('css','div.companyLocation').first
32
+ h[:location] = o ? o.text : ''
33
+
34
+ o = li.find_elements('css','span.date').first
35
+ h[:salary] = o ? o.text : ''
36
+
37
+ o = li.find_elements('css','span.date').first
38
+ h[:posted] = o ? o.text.gsub("Posted\nPosted", '').strip : '' # drop the "Posted\nPosted" text from the date label
39
+
40
+ h[:snippets] = li.find_elements('css','div.job-snippet > ul > li').map { |li| li.text } # array of snippet texts; the block parameter shadows the outer li
41
+
42
+ ret << h
43
+ end
44
+ }
45
+ # close the selenium browser; NOTE(review): this is skipped if any step above raises
46
+ driver.quit
47
+ # return the array of job hashes
48
+ ret
49
+ end # results
50
+ end # Indeed
51
+ end # Bots
52
+ end # BlackStack
metadata ADDED
@@ -0,0 +1,146 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bots
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Leandro Daniel Sardi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2023-08-08 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: simple_cloud_logging
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 1.2.2
20
+ - - ">="
21
+ - !ruby/object:Gem::Version
22
+ version: 1.2.2
23
+ type: :runtime
24
+ prerelease: false
25
+ version_requirements: !ruby/object:Gem::Requirement
26
+ requirements:
27
+ - - "~>"
28
+ - !ruby/object:Gem::Version
29
+ version: 1.2.2
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: 1.2.2
33
+ - !ruby/object:Gem::Dependency
34
+ name: csv
35
+ requirement: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - "~>"
38
+ - !ruby/object:Gem::Version
39
+ version: 3.2.7
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: 3.2.7
43
+ type: :runtime
44
+ prerelease: false
45
+ version_requirements: !ruby/object:Gem::Requirement
46
+ requirements:
47
+ - - "~>"
48
+ - !ruby/object:Gem::Version
49
+ version: 3.2.7
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: 3.2.7
53
+ - !ruby/object:Gem::Dependency
54
+ name: mechanize
55
+ requirement: !ruby/object:Gem::Requirement
56
+ requirements:
57
+ - - "~>"
58
+ - !ruby/object:Gem::Version
59
+ version: 2.8.5
60
+ - - ">="
61
+ - !ruby/object:Gem::Version
62
+ version: 2.8.5
63
+ type: :runtime
64
+ prerelease: false
65
+ version_requirements: !ruby/object:Gem::Requirement
66
+ requirements:
67
+ - - "~>"
68
+ - !ruby/object:Gem::Version
69
+ version: 2.8.5
70
+ - - ">="
71
+ - !ruby/object:Gem::Version
72
+ version: 2.8.5
73
+ - !ruby/object:Gem::Dependency
74
+ name: selenium-webdriver
75
+ requirement: !ruby/object:Gem::Requirement
76
+ requirements:
77
+ - - "~>"
78
+ - !ruby/object:Gem::Version
79
+ version: 4.10.0
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: 4.10.0
83
+ type: :runtime
84
+ prerelease: false
85
+ version_requirements: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: 4.10.0
90
+ - - ">="
91
+ - !ruby/object:Gem::Version
92
+ version: 4.10.0
93
+ - !ruby/object:Gem::Dependency
94
+ name: colorize
95
+ requirement: !ruby/object:Gem::Requirement
96
+ requirements:
97
+ - - "~>"
98
+ - !ruby/object:Gem::Version
99
+ version: 0.8.1
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: 0.8.1
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: 0.8.1
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ version: 0.8.1
113
+ description: Ruby gem for scraping information from the public web.
114
+ email: leandro@connectionsphere.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - lib/base.rb
120
+ - lib/bots.rb
121
+ - lib/google.rb
122
+ - lib/indeed.rb
123
+ homepage: https://rubygems.org/gems/bots
124
+ licenses:
125
+ - MIT
126
+ metadata: {}
127
+ post_install_message:
128
+ rdoc_options: []
129
+ require_paths:
130
+ - lib
131
+ required_ruby_version: !ruby/object:Gem::Requirement
132
+ requirements:
133
+ - - ">="
134
+ - !ruby/object:Gem::Version
135
+ version: '0'
136
+ required_rubygems_version: !ruby/object:Gem::Requirement
137
+ requirements:
138
+ - - ">="
139
+ - !ruby/object:Gem::Version
140
+ version: '0'
141
+ requirements: []
142
+ rubygems_version: 3.3.7
143
+ signing_key:
144
+ specification_version: 4
145
+ summary: Ruby gem for scraping information from the public web.
146
+ test_files: []