bots 1.0.5 → 1.0.7
- checksums.yaml +4 -4
- data/lib/bots.rb +3 -1
- data/lib/scraper.rb +37 -32
- metadata +22 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 46e3e095564f34c1f9a0375dd4fefb669d4920b23a8c8d925b5a598250cc9ee2
+  data.tar.gz: 65075cef5bebe85cfbbd5098363ece05a19673144c3ba487e6c844523dc5195f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d319b1aee567eb3bad89a2611965409af49a3e7d00997c0c3c6b0060ea68fe60c82c4d4b4c4e51bcb4efb7b5393c0fe5438ec0b02bd1ee25b42d1354265dddf3
+  data.tar.gz: abe09849e8951f87a7012096be40bade937d658a1ff64852ea4db96941a90ae0cf5b82e14967dd6010403fc40ecd5bfdaa9ebf70e7e93f2d084d0a3c71f38948
data/lib/bots.rb
CHANGED
@@ -7,8 +7,10 @@ require 'csv'
 require 'pry'
 require 'sitemap-parser'
 require 'timeout'
+require 'watir'
 
 require_relative './base'
 require_relative './google'
 require_relative './scraper'
-require_relative './indeed'
+require_relative './indeed'
+require_relative './browser'
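
Note: 1.0.7 starts requiring watir and a new data/lib/browser.rb, but browser.rb itself is not part of this diff. Based only on how scraper.rb below calls it (goto, links, title, body, close), a minimal hypothetical sketch of such a wrapper could look like the following; the class name comes from the diff, everything else is an assumption:

    # Hypothetical sketch of data/lib/browser.rb -- not included in this diff.
    require 'watir'

    module BlackStack
      module Bots
        class Browser
          attr_accessor :driver

          def initialize(headless: true)
            # assumption: a headless Chrome session driven through Watir
            args = headless ? ['--headless', '--no-sandbox'] : []
            self.driver = Watir::Browser.new(:chrome, options: { args: args })
          end

          # delegate the calls scraper.rb relies on straight to Watir
          def goto(url); driver.goto(url); end
          def links;     driver.links;     end
          def title;     driver.title;     end
          def body;      driver.body;      end
          def close;     driver.close;     end
        end
      end
    end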
data/lib/scraper.rb
CHANGED
@@ -1,31 +1,21 @@
 module BlackStack
   module Bots
-    class Scraper
-      attr_accessor :domain, :links
+    class Scraper
+      attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
       # auxiliar array of links that I have extracted links from
       attr_accessor :links_processed
 
-      def initialize(init_domain, h)
-        super(h)
+      def initialize(init_domain, timeout, h)
         self.domain = init_domain
+        self.timeout = timeout || 10
+        self.load_wait_time = 3
+        self.stop_scraping_at_page_number = 25
+        self.stop_scraping_at_match_number = 1
         #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
         self.links = []
         self.links_processed = []
       end # def initialize
 
-      def get(url)
-        # initialize mechanize agent
-        self.agent = Mechanize.new
-        # set a proxy with user and password
-        self.port_index += 1
-        self.port_index = 0 if self.port_index >= self.ports.length
-        self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
-        self.agent.open_timeout = 5
-        self.agent.read_timeout = 5
-        # return
-        return Timeout::timeout(5) { self.agent.get(url) }
-      end
-
       def get_links_from_sitemap(l=nil)
         i = 0
         l.logs "Scrape sitemaps... "
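
For reference, the new constructor and throttling attributes above would be exercised like this -- a minimal usage sketch, assuming the trailing `h` options hash is the same one accepted by 1.0.5 (its keys are not shown in this diff):

    scraper = BlackStack::Bots::Scraper.new('example.com', 30, {})  # 30-second timeout
    scraper.load_wait_time = 5                   # override the 3-second page-load wait
    scraper.stop_scraping_at_page_number = 50    # presumably raises the default 25-page cap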
@@ -33,17 +23,17 @@ module BlackStack
           # download the robots.txt
           url = "http://#{domain}/robots.txt"
           # get the content of robots.txt from url
-          s = Timeout::timeout(
+          s = Timeout::timeout(self.timeout) { URI.open(url).read }
           # get the sitemap
           sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
           sitemaps.each { |b|
-            parser = Timeout::timeout(
-            self.links += Timeout::timeout(
+            parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
+            self.links += Timeout::timeout(self.timeout) { parser.to_a }
             self.links.uniq!
           }
           l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
         rescue => e
-          l.logf "Error: #{e.message.split("\n").first[0..100]}
+          l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
         end
       end
 
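
The sitemap discovery above keeps only the robots.txt lines that start with "Sitemap:" (case-insensitive) and extracts the URL. A worked example with a made-up robots.txt body:

    s = "User-agent: *\nDisallow: /admin\nSitemap: https://example.com/sitemap.xml\n"
    sitemaps = s.split("\n")
                .select { |line| line =~ /^sitemap:/i }               # keep "Sitemap:" lines
                .map { |a| a.downcase.split('sitemap:').last.strip }  # keep the (downcased) URL
                .uniq
    # => ["https://example.com/sitemap.xml"]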
@@ -51,16 +41,19 @@ module BlackStack
       def get_links_from_url(url, l=nil)
         l = BlackStack::DummyLogger.new(nil) if l.nil?
         l.logs "get_links (#{url})... "
+        aux = []
+        browser = nil
         begin
-          aux = []
           # trim url
           url = url.strip
           # get domain of the url using open-uri
           domain = URI.parse(url).host
           # visit the main page of the website
-
+          browser = BlackStack::Bots::Browser.new()
+          browser.goto url
+          sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
           # get the self.links to the pages of the website
-          aux =
+          aux = browser.links.map(&:href)
           # remove non-string elements
           aux = aux.select { |link| link.is_a?(String) }
           # remove # from the self.links
@@ -80,11 +73,15 @@ module BlackStack
           aux = aux.select { |link| !self.links.include?(link) }
           b = aux.size
           # add new links to self.links
-          self.links += aux
           l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+        rescue Net::ReadTimeout => e
+          l.logf "Timeout Error: #{e.message}".red
         rescue => e
-          l.logf "Error: #{e.message.split("\n").first[0..100]}
+          l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+        ensure
+          browser.close if browser
         end
+        self.links += aux
       end # def get_links_from_url
 
       def get_links(stop_at=10, l=nil)
@@ -106,8 +103,9 @@ module BlackStack
         self.get_links_from_sitemap(l)
       end # def get_links
 
-      def find_keywords(a, stop_at=
+      def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
         pages = []
+        browser = nil
         l = BlackStack::DummyLogger.new(nil) if l.nil?
         # iterate the links
         j = 0
@@ -117,12 +115,14 @@ module BlackStack
           l.logs "#{j.to_s}. find_keywords (#{link})... "
           begin
             # get the page
-
+            browser = BlackStack::Bots::Browser.new()
+            browser.goto link
+            sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
             # get page body content in plain text
-            title =
-            s =
+            title = browser.title
+            s = browser.body.text
             # add the link to the results of no-keyword
-            hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' =>
+            hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => browser.body.html, 'keywords' => [] }
             pages << hpage
             # iterate the keywords
             i = 0
@@ -140,9 +140,14 @@ module BlackStack
             } # each
             break if match && stop_on_first_link_found
             l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+
+          rescue Net::ReadTimeout => e
+            l.logf "Timeout Error: #{e.message}".red
           rescue => e
             l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-
+          ensure
+            browser.close if browser
+          end
         } # each
         # return
         pages
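
With the changes above, a typical end-to-end call against the 1.0.7 API might look like the following sketch; the keyword list is illustrative, while the method signatures and the returned hash keys come from the diff:

    scraper = BlackStack::Bots::Scraper.new('example.com', 10, {})
    scraper.get_links(10)                                          # sitemap + crawl, up to 10 pages
    pages = scraper.find_keywords(['ruby', 'scraping'], 25, true)  # stop on first matching page
    pages.each do |page|
      puts "#{page['page_url']} (#{page['page_title']}): #{page['keywords'].join(', ')}"
    end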
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.
+  version: 1.0.7
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-
+date: 2023-08-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -190,6 +190,26 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 0.4.0
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
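
The watir entry added to the dependency metadata above is what RubyGems generates from a single gemspec declaration; in the gemspec it would typically read as below (the `spec` variable name is an assumption, since the gemspec is not part of this diff):

    spec.add_runtime_dependency 'watir', '~> 7.3.0', '>= 7.3.0'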