bots 1.0.1 → 1.0.3
- checksums.yaml +4 -4
- data/lib/base.rb +13 -5
- data/lib/bots.rb +5 -0
- data/lib/scraper.rb +145 -0
- metadata +83 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3a8018e0d8575a415699c41dcba236e3c4f400e8132111093e421ac02e792548
+  data.tar.gz: 4b876044081e94743d1b719c53424331d44bb200a8bdcfddd7d78562209eeed3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 326e82a582132f2d267e906df73aad0d812f9fc4fe00c8af2ba9ef6cd93a174ae2004719ec76f5cc6b018da0eda1b2cd891dd51f7df7adaad6044da773f207ec
+  data.tar.gz: 2a4944c21854faee39f81b81004fb63baf3f270b2374f081b98ca25b4f7695af80184b1449dff38207d4c21b7f34b524d59c997931f2db2743df42c56a714875
data/lib/base.rb
CHANGED
@@ -9,11 +9,13 @@ module BlackStack
 
       def initialize(h)
         # array of numbers from 4000 to 4249
-
-        self.ip = h[:
-        self.user = h[:
-        self.password = h[:
-        self.ports = (h[:
+        if h
+          self.ip = h[:ip]
+          self.user = h[:user]
+          self.password = h[:password]
+          self.ports = (h[:port_from]..h[:port_to]).to_a
+        else
+          self.ports = []
         end
         self.port_index = -1
       end # initialize
@@ -26,10 +28,16 @@ module BlackStack
 
     class MechanizeBot < BlackStack::Bots::Bot
       attr_accessor :agent # mechanize agent
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
     class SeleniumBot < BlackStack::Bots::Bot
      attr_accessor :driver # selenium driver
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
   end # Bots
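
With this change a bot can be constructed without proxy settings: when the hash is nil, the proxy fields stay unset and ports defaults to an empty array. A minimal sketch of both call styles follows; the proxy values are made up for illustration, and the ip/user/password/ports accessors and the proxy? helper are assumed to be defined in the unchanged part of base.rb:

    require 'bots'

    # 1.0.1-style call: full proxy settings (hypothetical values)
    bot = BlackStack::Bots::MechanizeBot.new({
      ip: '10.0.0.1',
      user: 'proxy_user',
      password: 'secret',
      port_from: 4000,
      port_to: 4249
    })

    # new in 1.0.3: a nil hash no longer raises; ports simply defaults to []
    bot = BlackStack::Bots::MechanizeBot.new(nil)
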
data/lib/bots.rb
CHANGED
@@ -1,9 +1,14 @@
+require 'open-uri'
 require 'mechanize'
 require 'selenium-webdriver'
 require 'simple_cloud_logging'
 require 'colorize'
 require 'csv'
+require 'pry'
+require 'sitemap-parser'
+require 'timeout'
 
 require_relative './base'
 require_relative './google'
+require_relative './scraper'
 require_relative './indeed'
data/lib/scraper.rb
ADDED
@@ -0,0 +1,145 @@
+module BlackStack
+  module Bots
+    class Scraper < BlackStack::Bots::MechanizeBot
+      attr_accessor :domain, :links
+      # auxiliar array of links that I have extracted links from
+      attr_accessor :links_processed
+
+      def initialize(init_domain, h)
+        super(h)
+        self.domain = init_domain
+        #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        self.links = []
+        self.links_processed = []
+      end # def initialize
+
+      def get(url)
+        # initialize mechanize agent
+        self.agent = Mechanize.new
+        # set a proxy with user and password
+        self.port_index += 1
+        self.port_index = 0 if self.port_index >= self.ports.length
+        self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
+        self.agent.open_timeout = 5
+        self.agent.read_timeout = 5
+        # return
+        return Timeout::timeout(5) { self.agent.get(url) }
+      end
+
+      def get_links_from_sitemap(l=nil)
+        i = 0
+        l.logs "Scrape sitemaps... "
+        begin
+          # download the robots.txt
+          url = "http://#{domain}/robots.txt"
+          # get the content of robots.txt from url
+          s = Timeout::timeout(5) { URI.open(url).read }
+          # get the sitemap
+          sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
+          sitemaps.each { |b|
+            parser = Timeout::timeout(5) { SitemapParser.new b }
+            self.links += Timeout::timeout(5) { parser.to_a }
+            self.links.uniq!
+          }
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end
+
+      # internal use only
+      def get_links_from_url(url, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        l.logs "get_links (#{url})... "
+        begin
+          aux = []
+          # trim url
+          url = url.strip
+          # get domain of the url using open-uri
+          domain = URI.parse(url).host
+          # visit the main page of the website
+          page = self.get(url)
+          # get the self.links to the pages of the website
+          aux = page.links.map(&:href)
+          # remove non-string elements
+          aux = aux.select { |link| link.is_a?(String) }
+          # remove # from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('#').first }
+          # remove querystring from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('?').first }
+          # remove the self.links that are not http:// or https://
+          aux = aux.select { |link| !link.nil? && link =~ /^https?:\/\// }
+          # remove the self.links that are not from the same domain
+          aux = aux.select { |link| !link.nil? && link =~ /#{domain}/ }
+          # remove nil values
+          aux = aux.compact
+          # remove duplications
+          aux = aux.uniq
+          # filter links who already are in the list
+          a = aux.size
+          aux = aux.select { |link| !self.links.include?(link) }
+          b = aux.size
+          # add new links to self.links
+          self.links += aux
+          l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end # def get_links_from_url
+
+      def get_links(stop_at=10, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # working with root url
+        url = "http://#{self.domain}/"
+        self.links << url if self.links.select { |link| link == url }.empty?
+        # iterate until I have discovered all the links
+        while self.links.size != self.links_processed.size && stop_at >= self.links.size
+          # iterate the links who are not in links_processed
+          self.links.select { |link| !self.links_processed.include?(link) }.each { |link|
+            # get the links from the url
+            self.get_links_from_url(link, l)
+            # add the link to the list of processed links
+            self.links_processed << link
+          }
+        end # while
+        # get links from the sitemap
+        self.get_links_from_sitemap(l)
+      end # def get_links
+
+      def find_keywords(a, stop_at=50, l=nil)
+        ret = []
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # iterate the links
+        j = 0
+        self.links.reject { |link| link =~ /\.pdf$/i || link =~ /\.jpg$/i || link =~ /\.jpeg$/i || link =~ /\.gif$/i }.each { |link|
+          j += 1
+          break if j > stop_at
+          l.logs "#{j.to_s}. find_keywords (#{link})... "
+          begin
+            # get the page
+            page = self.get(link)
+            # get page body content in plain text
+            s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
+            # iterate the keywords
+            i = 0
+            a.each { |k|
+              # find the keyword
+              if s =~ /#{Regexp.escape(k)}/i
+                i += 1
+                ret << link if ret.select { |link| link == link }.empty?
+                break
+              end # if
+            } # each
+            break if ret.size > 0
+            l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+          rescue => e
+            l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+          end # begin
+        } # each
+        # return
+        ret
+      end
+
+    end # class Scraper
+  end # module Bots
+end # module BlackStack
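
For context, a usage sketch of the new Scraper class, based only on the code added above: the domain and keyword list are illustrative assumptions, nil is passed as the proxy hash (allowed by the base.rb change in this release), and the optional l argument would be a simple_cloud_logging-style logger responding to logs/logf.

    require 'bots'

    # hypothetical domain; nil proxy hash means no proxy is configured
    scraper = BlackStack::Bots::Scraper.new('example.com', nil)

    # follow internal links (stop discovering after 25 links), then read the site's sitemaps
    scraper.get_links(25)
    puts scraper.links.size

    # return the links whose page body mentions any keyword (stops at the first matching page)
    hits = scraper.find_keywords(['ruby', 'scraping'])
    puts hits
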
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.3
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-
+date: 2023-08-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -110,6 +110,86 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 0.8.1
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+- !ruby/object:Gem::Dependency
+  name: open-uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+- !ruby/object:Gem::Dependency
+  name: sitemap-parser
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+- !ruby/object:Gem::Dependency
+  name: timeout
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
@@ -120,6 +200,7 @@ files:
 - lib/bots.rb
 - lib/google.rb
 - lib/indeed.rb
+- lib/scraper.rb
 homepage: https://rubygems.org/gems/bots
 licenses:
 - MIT