bots 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/base.rb +13 -5
- data/lib/bots.rb +5 -0
- data/lib/scraper.rb +145 -0
- metadata +83 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3a8018e0d8575a415699c41dcba236e3c4f400e8132111093e421ac02e792548
+  data.tar.gz: 4b876044081e94743d1b719c53424331d44bb200a8bdcfddd7d78562209eeed3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 326e82a582132f2d267e906df73aad0d812f9fc4fe00c8af2ba9ef6cd93a174ae2004719ec76f5cc6b018da0eda1b2cd891dd51f7df7adaad6044da773f207ec
+  data.tar.gz: 2a4944c21854faee39f81b81004fb63baf3f270b2374f081b98ca25b4f7695af80184b1449dff38207d4c21b7f34b524d59c997931f2db2743df42c56a714875
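These checksums can be re-checked locally after downloading the release. A minimal sketch in Ruby, assuming bots-1.0.3.gem has already been fetched into the current directory (for example with `gem fetch bots -v 1.0.3`); Gem::Package#verify validates the packaged metadata.gz and data.tar.gz against the checksums recorded in the package:

require 'rubygems/package'

# Sketch: re-verify the downloaded package against its embedded checksums.
# Assumes bots-1.0.3.gem sits in the current working directory.
package = Gem::Package.new('bots-1.0.3.gem')
package.verify # raises an error if the archive is corrupt or a digest does not match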
data/lib/base.rb
CHANGED
@@ -9,11 +9,13 @@ module BlackStack
 
       def initialize(h)
         # array of numbers from 4000 to 4249
-
-        self.ip = h[:
-        self.user = h[:
-        self.password = h[:
-        self.ports = (h[:
+        if h
+          self.ip = h[:ip]
+          self.user = h[:user]
+          self.password = h[:password]
+          self.ports = (h[:port_from]..h[:port_to]).to_a
+        else
+          self.ports = []
         end
         self.port_index = -1
       end # initialize
@@ -26,10 +28,16 @@ module BlackStack
 
     class MechanizeBot < BlackStack::Bots::Bot
       attr_accessor :agent # mechanize agent
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
     class SeleniumBot < BlackStack::Bots::Bot
       attr_accessor :driver # selenium driver
+      def initialize(h)
+        super(h)
+      end
    end # MechanizeBot
 
  end # Bots
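The practical effect of this change is that a bot no longer requires proxy settings: when the configuration hash is nil, the initializer skips the hash lookups and falls back to an empty port list instead of failing on nil. A small sketch based on the keys shown in the diff above (the proxy values are placeholders):

require 'bots'

# proxied bot: the port range is expanded into an array of ports
proxied = BlackStack::Bots::MechanizeBot.new({
  ip: '127.0.0.1', user: 'proxyuser', password: 'secret',
  port_from: 4000, port_to: 4249
})

# proxy-less bot: valid as of 1.0.3, since `if h` guards the hash access
plain = BlackStack::Bots::MechanizeBot.new(nil) # plain.ports == []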
data/lib/bots.rb
CHANGED
@@ -1,9 +1,14 @@
+require 'open-uri'
 require 'mechanize'
 require 'selenium-webdriver'
 require 'simple_cloud_logging'
 require 'colorize'
 require 'csv'
+require 'pry'
+require 'sitemap-parser'
+require 'timeout'
 
 require_relative './base'
 require_relative './google'
+require_relative './scraper'
 require_relative './indeed'
data/lib/scraper.rb
ADDED
@@ -0,0 +1,145 @@
+module BlackStack
+  module Bots
+    class Scraper < BlackStack::Bots::MechanizeBot
+      attr_accessor :domain, :links
+      # auxiliar array of links that I have extracted links from
+      attr_accessor :links_processed
+
+      def initialize(init_domain, h)
+        super(h)
+        self.domain = init_domain
+        #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        self.links = []
+        self.links_processed = []
+      end # def initialize
+
+      def get(url)
+        # initialize mechanize agent
+        self.agent = Mechanize.new
+        # set a proxy with user and password
+        self.port_index += 1
+        self.port_index = 0 if self.port_index >= self.ports.length
+        self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
+        self.agent.open_timeout = 5
+        self.agent.read_timeout = 5
+        # return
+        return Timeout::timeout(5) { self.agent.get(url) }
+      end
+
+      def get_links_from_sitemap(l=nil)
+        i = 0
+        l.logs "Scrape sitemaps... "
+        begin
+          # download the robots.txt
+          url = "http://#{domain}/robots.txt"
+          # get the content of robots.txt from url
+          s = Timeout::timeout(5) { URI.open(url).read }
+          # get the sitemap
+          sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
+          sitemaps.each { |b|
+            parser = Timeout::timeout(5) { SitemapParser.new b }
+            self.links += Timeout::timeout(5) { parser.to_a }
+            self.links.uniq!
+          }
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end
+
+      # internal use only
+      def get_links_from_url(url, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        l.logs "get_links (#{url})... "
+        begin
+          aux = []
+          # trim url
+          url = url.strip
+          # get domain of the url using open-uri
+          domain = URI.parse(url).host
+          # visit the main page of the website
+          page = self.get(url)
+          # get the self.links to the pages of the website
+          aux = page.links.map(&:href)
+          # remove non-string elements
+          aux = aux.select { |link| link.is_a?(String) }
+          # remove # from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('#').first }
+          # remove querystring from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('?').first }
+          # remove the self.links that are not http:// or https://
+          aux = aux.select { |link| !link.nil? && link =~ /^https?:\/\// }
+          # remove the self.links that are not from the same domain
+          aux = aux.select { |link| !link.nil? && link =~ /#{domain}/ }
+          # remove nil values
+          aux = aux.compact
+          # remove duplications
+          aux = aux.uniq
+          # filter links who already are in the list
+          a = aux.size
+          aux = aux.select { |link| !self.links.include?(link) }
+          b = aux.size
+          # add new links to self.links
+          self.links += aux
+          l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end # def get_links_from_url
+
+      def get_links(stop_at=10, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # working with root url
+        url = "http://#{self.domain}/"
+        self.links << url if self.links.select { |link| link == url }.empty?
+        # iterate until I have discovered all the links
+        while self.links.size != self.links_processed.size && stop_at >= self.links.size
+          # iterate the links who are not in links_processed
+          self.links.select { |link| !self.links_processed.include?(link) }.each { |link|
+            # get the links from the url
+            self.get_links_from_url(link, l)
+            # add the link to the list of processed links
+            self.links_processed << link
+          }
+        end # while
+        # get links from the sitemap
+        self.get_links_from_sitemap(l)
+      end # def get_links
+
+      def find_keywords(a, stop_at=50, l=nil)
+        ret = []
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # iterate the links
+        j = 0
+        self.links.reject { |link| link =~ /\.pdf$/i || link =~ /\.jpg$/i || link =~ /\.jpeg$/i || link =~ /\.gif$/i }.each { |link|
+          j += 1
+          break if j > stop_at
+          l.logs "#{j.to_s}. find_keywords (#{link})... "
+          begin
+            # get the page
+            page = self.get(link)
+            # get page body content in plain text
+            s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
+            # iterate the keywords
+            i = 0
+            a.each { |k|
+              # find the keyword
+              if s =~ /#{Regexp.escape(k)}/i
+                i += 1
+                ret << link if ret.select { |link| link == link }.empty?
+                break
+              end # if
+            } # each
+            break if ret.size > 0
+            l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+          rescue => e
+            l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+          end # begin
+        } # each
+        # return
+        ret
+      end
+
+    end # class Scraper
+  end # module Bots
+end # module BlackStack
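Taken together, the new class crawls a single domain (the root page, then internal links, then any sitemaps listed in robots.txt) and can scan the collected pages for keywords; note that find_keywords stops after the first matching page because of the `break if ret.size > 0`. A hedged usage sketch based only on the code above; the domain and keywords are placeholders, and the nil proxy hash relies on the base.rb change:

require 'bots'

scraper = BlackStack::Bots::Scraper.new('example.com', nil) # no proxy settings
scraper.get_links(25)                                       # crawl up to 25 internal links, then parse sitemaps
hits = scraper.find_keywords(['ruby', 'scraping'])          # first page mentioning any keyword, or []
puts "#{scraper.links.size} links collected, match: #{hits.inspect}"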
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.3
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-
+date: 2023-08-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -110,6 +110,86 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 0.8.1
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+- !ruby/object:Gem::Dependency
+  name: open-uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+- !ruby/object:Gem::Dependency
+  name: sitemap-parser
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+- !ruby/object:Gem::Dependency
+  name: timeout
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
@@ -120,6 +200,7 @@ files:
 - lib/bots.rb
 - lib/google.rb
 - lib/indeed.rb
+- lib/scraper.rb
 homepage: https://rubygems.org/gems/bots
 licenses:
 - MIT
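For consumers, pinning the new release in a Gemfile is enough to pick up the four added runtime dependencies; a minimal sketch:

# Gemfile sketch: Bundler resolves pry, open-uri, sitemap-parser and timeout
# as runtime dependencies of bots 1.0.3.
source 'https://rubygems.org'
gem 'bots', '1.0.3'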