bots 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/base.rb +36 -0
- data/lib/bots.rb +9 -0
- data/lib/google.rb +80 -0
- data/lib/indeed.rb +52 -0
- metadata +146 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: be4c3337063af2d514fd1016c94d66385fa84dbde27dbfa5174d5b0908c9efb1
|
4
|
+
data.tar.gz: 13ef3dccd336269fdb1960f243c8b7dd08abf782ba89cfbf935c4e802a29ca45
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 354704a255a80def04f1993d3a73d5b358fedfeeda387278f47ab995ed10d2afced94fe175a15bc8dd7c55b1209798598b55a5f2d2071424f11613091cd87813
|
7
|
+
data.tar.gz: c1fae508d2b55269f0039524ee4e87714d7cc7244922faa83ac5b6c374abc29cdaab34355059ff92e5d6a0469be44900ac628fcfe55da80def3993549a2bcf27
|
data/lib/base.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
module BlackStack
  module Bots
    # Base class for all bots. Holds the (optional) proxy configuration
    # shared by the concrete bot implementations.
    class Bot
      attr_accessor :ip # ip address of proxy
      attr_accessor :user # user of proxy
      attr_accessor :password # password of proxy
      attr_accessor :ports # array of ports
      attr_accessor :port_index # index of the port

      # Build a bot from a hash descriptor.
      #
      # h - Hash. When it has a :proxy key, that value must be a hash with
      #     the keys :ip, :user, :password, :port_from and :port_to.
      #     Omitting the argument (or passing nil) builds a bot with no proxy.
      def initialize(h = {})
        h ||= {}
        # default to an empty list so callers can always ask for
        # `ports.length`, even when no proxy is configured
        self.ports = []
        unless h[:proxy].nil?
          self.ip = h[:proxy][:ip]
          self.user = h[:proxy][:user]
          self.password = h[:proxy][:password]
          # every port number from :port_from to :port_to, both inclusive
          self.ports = (h[:proxy][:port_from]..h[:proxy][:port_to]).to_a
        end
        # -1 means no port has been picked yet
        self.port_index = -1
      end # initialize

      # return true if the bot is using a proxy
      def proxy?
        !self.ip.nil?
      end
    end # Bot

    # Bot that keeps a reference to a mechanize agent.
    class MechanizeBot < BlackStack::Bots::Bot
      attr_accessor :agent # mechanize agent
    end # MechanizeBot

    # Bot that keeps a reference to a selenium driver.
    class SeleniumBot < BlackStack::Bots::Bot
      attr_accessor :driver # selenium driver
    end # SeleniumBot

  end # Bots
end # BlackStack
|
data/lib/bots.rb
ADDED
data/lib/google.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
module BlackStack
  module Bots
    # Bot that submits a query through the Google home page form using
    # Mechanize and scrapes the results page.
    class Google < BlackStack::Bots::MechanizeBot
      # Run a search and return the scraped results.
      #
      # query - String typed into the search form.
      #
      # Returns an Array of Hashes with the keys :title, :url and :description.
      # Headings on the results page that are not wrapped by a result link are skipped.
      def search(query)
        ret = []
        # initialize mechanize agent
        self.agent = Mechanize.new
        begin
          # rotate to the next proxy port; only done when a proxy is
          # configured, because there is no port list to rotate otherwise
          if self.proxy?
            self.port_index += 1
            self.port_index = 0 if self.port_index >= self.ports.length
            self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password)
          end
          # grab the home page and submit the search form
          page = self.agent.get('http://www.google.com/')
          google_form = page.form('f')
          google_form.q = query
          page = self.agent.submit(google_form, google_form.buttons.first)
          # iterate every h3 heading of the results page
          page.search('h3').each do |h3|
            title = h3.text.strip
            # the third ancestor of the heading is expected to be the result link
            a = h3.parent.parent.parent
            href = a['href']
            # a heading without a link around it is not a search result
            next if href.nil?
            # the description is taken from the last div found two levels above the link
            last_div = a.parent.parent.css('/div').last
            descr = last_div.nil? ? '' : last_div.text.strip
            # get the value of the parameter `q` from the querystring of the link
            begin
              uri = URI.parse(href)
            rescue URI::InvalidURIError
              next
            end
            next if uri.query.nil?
            params = CGI.parse(uri.query)
            url = params['q'].first
            # add to the list array of results
            ret << { :title=>title, :url=>url, :description=>descr }
          end
        ensure
          # destroy mechanize agent, even if the scraping failed
          self.agent.shutdown
        end
        # return
        ret
      end # search
    end # Google

# NOTE(review): the class below is disabled and kept verbatim. Before
# re-enabling it, check `find_email`: `results.select { ... }` returns an
# Array, which is always truthy, so the first candidate email is always
# returned; `any?` looks like what was intended. The email is also
# interpolated into the regex without escaping.
=begin
    class GoogleEnrichment < BlackStack::Bots::Google

      # get an array of domains that may be the domain of the company
      def possible_domains_for_company(company_name)
        search = "\"#{company_name}\" home page"
        self.search(search).map { |r|
          # get domain from url using URI, and removing www., and downcasing.
          URI.parse(r[:url]).host.gsub(/^www\./, '').downcase
        }
      end # possible_domains_for_company

      # find email from fname, lname and cname
      def find_email(fname, lname, cname)
        domains = self.possible_domains_for_company(cname)
        if domains.size > 0
          domains.each { |domain|
            # array of possible emails
            emails = []
            #emails << "#{fname}@#{domain}"
            #emails << "#{lname}@#{domain}"
            emails << "#{fname}#{lname}@#{domain}"
            emails << "#{fname}.#{lname}@#{domain}"
            emails << "#{fname}_#{lname}@#{domain}"
            emails << "#{fname[0]}#{lname}@#{domain}"
            # iterate array of possible emails
            emails.each { |email|
              # search for that email
              search = "\"#{email}\""
              results = self.search(search)
              # find results with the exact email in the description
              return email if results.select { |result| result[:description].downcase =~ /\b#{email.downcase}\b/ }
            }
          }
        end
        return nil
      end # find_email

    end # GoogleEnrichment
=end
  end # Bots
end # BlackStack
|
80
|
+
|
data/lib/indeed.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative './base'
|
2
|
+
module BlackStack
  module Bots
    # Bot that scrapes the job cards of an Indeed results page using a
    # Chrome browser driven by Selenium.
    class Indeed < BlackStack::Bots::SeleniumBot

      # Scrape the job cards shown at `url`.
      #
      # url  - String address of the results page to load.
      # page - currently unused; kept so existing callers keep working.
      #
      # Returns an Array of Hashes with the keys :title, :url, :company,
      # :location, :salary, :posted and :snippets.
      def results(url, page=1)
        ret = []
        # launch a chrome browser with selenium
        driver = Selenium::WebDriver.for :chrome
        begin
          # TODO: set a proxy with user and password
          driver.get url
          # get the ul list with class .jobsearch-ResultsList
          ul = driver.find_element(:class=>'jobsearch-ResultsList')
          # scroll to the bottom
          driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
          # iterate li elements
          ul.find_elements('css', 'li').each { |li|
            # only an li holding exactly one job-title link is treated as a job card
            links = li.find_elements('css', 'a.jcs-JobTitle')
            next unless links.size == 1

            h = {}
            link = links.first
            h[:title] = link.text
            h[:url] = link.attribute('href')

            # the fields below are optional; a missing one is stored as ''
            o = li.find_elements('css','span.companyName').first
            h[:company] = o ? o.text : ''

            o = li.find_elements('css','div.companyLocation').first
            h[:location] = o ? o.text : ''

            o = li.find_elements('css','div.salary-snippet-container').first
            h[:salary] = o ? o.text : ''

            o = li.find_elements('css','span.date').first
            h[:posted] = o ? o.text.gsub("Posted\nPosted", '').strip : ''

            h[:snippets] = li.find_elements('css','div.job-snippet > ul > li').map { |snippet| snippet.text }

            ret << h
          }
        ensure
          # destroy selenium browser, even if the scraping failed
          driver.quit
        end
        # return
        ret
      end # results
    end # Indeed
  end # Bots
end # BlackStack
|
metadata
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bots
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Leandro Daniel Sardi
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-08-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: simple_cloud_logging
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.2.2
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.2.2
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.2.2
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.2.2
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: csv
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 3.2.7
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 3.2.7
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 3.2.7
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 3.2.7
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: mechanize
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 2.8.5
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 2.8.5
|
63
|
+
type: :runtime
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 2.8.5
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 2.8.5
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: selenium-webdriver
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - "~>"
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 4.10.0
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 4.10.0
|
83
|
+
type: :runtime
|
84
|
+
prerelease: false
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 4.10.0
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: 4.10.0
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: colorize
|
95
|
+
requirement: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - "~>"
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.8.1
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: 0.8.1
|
103
|
+
type: :runtime
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 0.8.1
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: 0.8.1
|
113
|
+
description: Ruby gem for scraping information from the public web.
|
114
|
+
email: leandro@connectionsphere.com
|
115
|
+
executables: []
|
116
|
+
extensions: []
|
117
|
+
extra_rdoc_files: []
|
118
|
+
files:
|
119
|
+
- lib/base.rb
|
120
|
+
- lib/bots.rb
|
121
|
+
- lib/google.rb
|
122
|
+
- lib/indeed.rb
|
123
|
+
homepage: https://rubygems.org/gems/bots
|
124
|
+
licenses:
|
125
|
+
- MIT
|
126
|
+
metadata: {}
|
127
|
+
post_install_message:
|
128
|
+
rdoc_options: []
|
129
|
+
require_paths:
|
130
|
+
- lib
|
131
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
137
|
+
requirements:
|
138
|
+
- - ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
requirements: []
|
142
|
+
rubygems_version: 3.3.7
|
143
|
+
signing_key:
|
144
|
+
specification_version: 4
|
145
|
+
summary: Ruby gem for scraping information from the public web.
|
146
|
+
test_files: []
|