bots 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/base.rb +36 -0
- data/lib/bots.rb +9 -0
- data/lib/google.rb +80 -0
- data/lib/indeed.rb +52 -0
- metadata +146 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: be4c3337063af2d514fd1016c94d66385fa84dbde27dbfa5174d5b0908c9efb1
|
4
|
+
data.tar.gz: 13ef3dccd336269fdb1960f243c8b7dd08abf782ba89cfbf935c4e802a29ca45
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 354704a255a80def04f1993d3a73d5b358fedfeeda387278f47ab995ed10d2afced94fe175a15bc8dd7c55b1209798598b55a5f2d2071424f11613091cd87813
|
7
|
+
data.tar.gz: c1fae508d2b55269f0039524ee4e87714d7cc7244922faa83ac5b6c374abc29cdaab34355059ff92e5d6a0469be44900ac628fcfe55da80def3993549a2bcf27
|
data/lib/base.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
module BlackStack
  module Bots
    # Base class of the bots. It only stores the optional proxy settings
    # (address, credentials and list of ports) and a cursor into that list.
    class Bot
      attr_accessor :ip # ip address of proxy; nil when no proxy is configured
      attr_accessor :user # user of proxy
      attr_accessor :password # password of proxy
      attr_accessor :ports # array of ports; empty when no proxy is configured
      attr_accessor :port_index # index of the port; starts at -1

      # Build a bot from a descriptor hash.
      #
      # @param h [Hash] bot descriptor. The optional `:proxy` key holds a hash
      #   with the keys `:ip`, `:user`, `:password`, `:port_from`, `:port_to`.
      def initialize(h = {})
        proxy = h[:proxy]
        # default to an empty array instead of nil, so code that reads
        # `ports.length` does not crash on a bot without proxy
        self.ports = []
        unless proxy.nil?
          self.ip = proxy[:ip]
          self.user = proxy[:user]
          self.password = proxy[:password]
          # every port from port_from to port_to, both included
          self.ports = (proxy[:port_from]..proxy[:port_to]).to_a
        end
        self.port_index = -1
      end # initialize

      # return true if the bot is using a proxy
      def proxy?
        !ip.nil?
      end
    end # Bot

    # Bot that keeps a reference to a mechanize agent.
    class MechanizeBot < BlackStack::Bots::Bot
      attr_accessor :agent # mechanize agent
    end # MechanizeBot

    # Bot that keeps a reference to a selenium driver.
    class SeleniumBot < BlackStack::Bots::Bot
      attr_accessor :driver # selenium driver
    end # SeleniumBot

  end # Bots
end # BlackStack
|
data/lib/bots.rb
ADDED
data/lib/google.rb
ADDED
@@ -0,0 +1,80 @@
|
|
1
|
+
module BlackStack
  module Bots
    # Bot that submits a query in the Google home page through a mechanize
    # agent and scrapes the page it gets back.
    class Google < BlackStack::Bots::MechanizeBot
      # Run a search and return the scraped results.
      #
      # A new mechanize agent is created on every call and is always shut down
      # before returning, even when the request or the scraping raises.
      #
      # @param query [String] text to submit in the search form
      # @return [Array<Hash>] one hash per result, with the keys `:title`,
      #   `:url` and `:description`
      def search(query)
        ret = []
        # initialize mechanize agent
        self.agent = Mechanize.new
        begin
          # set a proxy with user and password, moving to the next port on
          # every search. Bots without proxy skip the rotation, because they
          # may have no list of ports at all.
          if self.proxy?
            self.port_index += 1
            self.port_index = 0 if self.port_index >= self.ports.length
            self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password)
          end
          # grab the page
          page = agent.get('http://www.google.com/')
          google_form = page.form('f')
          google_form.q = query
          page = agent.submit(google_form, google_form.buttons.first)
          # iterate the h3 headings; each one is handled as the title of a result
          page.search('h3').each do |h3|
            result = parse_result(h3)
            # add to the array of results
            ret << result unless result.nil?
          end
        ensure
          # destroy mechanize agent
          self.agent.shutdown
        end
        # return
        ret
      end # search

      private

      # Build the result hash for one h3 heading.
      # Return nil when the node 3 levels above the heading has no `href`.
      def parse_result(h3)
        # the link is looked up 3 levels above the heading
        a = h3.parent.parent.parent
        href = a['href']
        return nil if href.nil?
        # the description is the text of the last node matched by '/div',
        # 2 levels above the link; empty when nothing matches
        div = a.parent.parent.css('/div').last
        descr = div.nil? ? '' : div.text.strip
        # get the value of the parameter `q` from the querystring of the link.
        # A link without querystring gives a nil url.
        uri = URI.parse(href)
        params = CGI.parse(uri.query.to_s)
        { :title=>h3.text.strip, :url=>params['q'].first, :description=>descr }
      end # parse_result
    end # Google
=begin
    # NOTE(review): this class is disabled. Before enabling it, fix the
    # condition in `find_email`: `results.select { ... }` returns an array,
    # which is always truthy, so the first candidate email is always returned.
    # The email should also be escaped before interpolating it in the regex.
    class GoogleEnrichment < BlackStack::Bots::Google

      # get an array of domains that may be the domain of the company
      def possible_domains_for_company(company_name)
        search = "\"#{company_name}\" home page"
        self.search(search).map { |r|
          # get domain from url using URI, and removing www., and downcasing.
          URI.parse(r[:url]).host.gsub(/^www\./, '').downcase
        }
      end # possible_domains_for_company

      # find email from fname, lname and cname
      def find_email(fname, lname, cname)
        domains = self.possible_domains_for_company(cname)
        if domains.size > 0
          domains.each { |domain|
            # array of possible emails
            emails = []
            #emails << "#{fname}@#{domain}"
            #emails << "#{lname}@#{domain}"
            emails << "#{fname}#{lname}@#{domain}"
            emails << "#{fname}.#{lname}@#{domain}"
            emails << "#{fname}_#{lname}@#{domain}"
            emails << "#{fname[0]}#{lname}@#{domain}"
            # iterate array of possible emails
            emails.each { |email|
              # search for that email
              search = "\"#{email}\""
              results = self.search(search)
              # find results with the exact email in the description
              return email if results.select { |result| result[:description].downcase =~ /\b#{email.downcase}\b/ }
            }
          }
        end
        return nil
      end # find_email

    end # GoogleEnrichment
=end
  end # Bots
end # BlackStack
|
80
|
+
|
data/lib/indeed.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative './base'
|
2
|
+
module BlackStack
  module Bots
    # Bot that opens a page of Indeed results in Chrome, through selenium, and
    # scrapes the jobs listed in it.
    class Indeed < BlackStack::Bots::SeleniumBot

      # Scrape the jobs listed in a page of results.
      #
      # A new browser is launched on every call and is always closed before
      # returning, even when a selenium call raises.
      #
      # @param url [String] address of the page of results to open
      # @param page [Integer] not used by this method; kept so existing
      #   callers keep working
      # @return [Array<Hash>] one hash per job, with the keys `:title`, `:url`,
      #   `:company`, `:location`, `:salary`, `:posted` and `:snippets`
      def results(url, page=1)
        ret = []
        # launch a chrome browser with selenium, and keep it in the `driver`
        # attribute of the bot
        self.driver = Selenium::WebDriver.for :chrome
        begin
          # TODO: set a proxy with user and password
          self.driver.get url
          # get the ul list with class .jobsearch-ResultsList
          ul = self.driver.find_element(:class=>'jobsearch-ResultsList')
          # scroll to the bottom
          self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
          # iterate li elements
          ul.find_elements('css', 'li').each { |li|
            links = li.find_elements('css', 'a.jcs-JobTitle')
            # only the items with exactly one title link are scraped
            next unless links.size == 1
            link = links.first
            ret << {
              :title => link.text,
              :url => link.attribute('href'),
              :company => text_of(li, 'span.companyName'),
              :location => text_of(li, 'div.companyLocation'),
              :salary => text_of(li, 'div.salary-snippet-container'),
              :posted => text_of(li, 'span.date').gsub("Posted\nPosted", '').strip,
              :snippets => li.find_elements('css','div.job-snippet > ul > li').map { |snippet| snippet.text },
            }
          }
        ensure
          # destroy selenium browser
          self.driver.quit
        end
        # return
        ret
      end # results

      private

      # Text of the first element matching the css `selector` under `node`,
      # or an empty string when nothing matches.
      def text_of(node, selector)
        o = node.find_elements('css', selector).first
        o ? o.text : ''
      end # text_of
    end # Indeed
  end # Bots
end # BlackStack
|
metadata
ADDED
@@ -0,0 +1,146 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: bots
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Leandro Daniel Sardi
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2023-08-08 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: simple_cloud_logging
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: 1.2.2
|
20
|
+
- - ">="
|
21
|
+
- !ruby/object:Gem::Version
|
22
|
+
version: 1.2.2
|
23
|
+
type: :runtime
|
24
|
+
prerelease: false
|
25
|
+
version_requirements: !ruby/object:Gem::Requirement
|
26
|
+
requirements:
|
27
|
+
- - "~>"
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: 1.2.2
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 1.2.2
|
33
|
+
- !ruby/object:Gem::Dependency
|
34
|
+
name: csv
|
35
|
+
requirement: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - "~>"
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: 3.2.7
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: 3.2.7
|
43
|
+
type: :runtime
|
44
|
+
prerelease: false
|
45
|
+
version_requirements: !ruby/object:Gem::Requirement
|
46
|
+
requirements:
|
47
|
+
- - "~>"
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 3.2.7
|
50
|
+
- - ">="
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: 3.2.7
|
53
|
+
- !ruby/object:Gem::Dependency
|
54
|
+
name: mechanize
|
55
|
+
requirement: !ruby/object:Gem::Requirement
|
56
|
+
requirements:
|
57
|
+
- - "~>"
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
version: 2.8.5
|
60
|
+
- - ">="
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: 2.8.5
|
63
|
+
type: :runtime
|
64
|
+
prerelease: false
|
65
|
+
version_requirements: !ruby/object:Gem::Requirement
|
66
|
+
requirements:
|
67
|
+
- - "~>"
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: 2.8.5
|
70
|
+
- - ">="
|
71
|
+
- !ruby/object:Gem::Version
|
72
|
+
version: 2.8.5
|
73
|
+
- !ruby/object:Gem::Dependency
|
74
|
+
name: selenium-webdriver
|
75
|
+
requirement: !ruby/object:Gem::Requirement
|
76
|
+
requirements:
|
77
|
+
- - "~>"
|
78
|
+
- !ruby/object:Gem::Version
|
79
|
+
version: 4.10.0
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: 4.10.0
|
83
|
+
type: :runtime
|
84
|
+
prerelease: false
|
85
|
+
version_requirements: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 4.10.0
|
90
|
+
- - ">="
|
91
|
+
- !ruby/object:Gem::Version
|
92
|
+
version: 4.10.0
|
93
|
+
- !ruby/object:Gem::Dependency
|
94
|
+
name: colorize
|
95
|
+
requirement: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - "~>"
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: 0.8.1
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: 0.8.1
|
103
|
+
type: :runtime
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - "~>"
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: 0.8.1
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
version: 0.8.1
|
113
|
+
description: Ruby gem for scraping information from the public web.
|
114
|
+
email: leandro@connectionsphere.com
|
115
|
+
executables: []
|
116
|
+
extensions: []
|
117
|
+
extra_rdoc_files: []
|
118
|
+
files:
|
119
|
+
- lib/base.rb
|
120
|
+
- lib/bots.rb
|
121
|
+
- lib/google.rb
|
122
|
+
- lib/indeed.rb
|
123
|
+
homepage: https://rubygems.org/gems/bots
|
124
|
+
licenses:
|
125
|
+
- MIT
|
126
|
+
metadata: {}
|
127
|
+
post_install_message:
|
128
|
+
rdoc_options: []
|
129
|
+
require_paths:
|
130
|
+
- lib
|
131
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
132
|
+
requirements:
|
133
|
+
- - ">="
|
134
|
+
- !ruby/object:Gem::Version
|
135
|
+
version: '0'
|
136
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
137
|
+
requirements:
|
138
|
+
- - ">="
|
139
|
+
- !ruby/object:Gem::Version
|
140
|
+
version: '0'
|
141
|
+
requirements: []
|
142
|
+
rubygems_version: 3.3.7
|
143
|
+
signing_key:
|
144
|
+
specification_version: 4
|
145
|
+
summary: Ruby gem for scraping information from the public web.
|
146
|
+
test_files: []
|