bots 1.0.5 → 1.0.7
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +4 -4
- data/lib/bots.rb +3 -1
- data/lib/scraper.rb +37 -32
- metadata +22 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 46e3e095564f34c1f9a0375dd4fefb669d4920b23a8c8d925b5a598250cc9ee2
+  data.tar.gz: 65075cef5bebe85cfbbd5098363ece05a19673144c3ba487e6c844523dc5195f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d319b1aee567eb3bad89a2611965409af49a3e7d00997c0c3c6b0060ea68fe60c82c4d4b4c4e51bcb4efb7b5393c0fe5438ec0b02bd1ee25b42d1354265dddf3
+  data.tar.gz: abe09849e8951f87a7012096be40bade937d658a1ff64852ea4db96941a90ae0cf5b82e14967dd6010403fc40ecd5bfdaa9ebf70e7e93f2d084d0a3c71f38948
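These checksums cover the metadata.gz and data.tar.gz archives packed inside bots-1.0.7.gem. A minimal Ruby sketch for checking one of them locally (the file path is an assumption; a .gem file is a plain tar archive, so extracting it yields data.tar.gz):

require 'digest'

# Assumed local path: data.tar.gz extracted from bots-1.0.7.gem.
path     = 'data.tar.gz'
expected = '65075cef5bebe85cfbbd5098363ece05a19673144c3ba487e6c844523dc5195f' # SHA256 value published above

actual = Digest::SHA256.file(path).hexdigest
puts actual == expected ? 'checksum OK' : "checksum mismatch: #{actual}"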
data/lib/bots.rb
CHANGED
@@ -7,8 +7,10 @@ require 'csv'
 require 'pry'
 require 'sitemap-parser'
 require 'timeout'
+require 'watir'
 
 require_relative './base'
 require_relative './google'
 require_relative './scraper'
-require_relative './indeed'
+require_relative './indeed'
+require_relative './browser'
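Version 1.0.7 adds a dependency on watir and a new local file, ./browser, whose contents are not part of this diff. Judging only from how scraper.rb calls it below (goto, links, title, body, close), a thin Watir-backed wrapper along these lines would fit; this is an illustrative sketch, not the gem's actual browser.rb:

require 'watir'

module BlackStack
    module Bots
        # Illustrative sketch only: the real data/lib/browser.rb is not shown in this diff.
        class Browser
            attr_accessor :driver

            def initialize
                # Watir drives a real browser (Chrome here) through Selenium WebDriver,
                # so JavaScript-rendered content loads before scraping.
                self.driver = Watir::Browser.new(:chrome)
            end

            def goto(url); driver.goto(url); end   # navigate to a page
            def links;     driver.links;     end   # collection of <a> elements (each responds to #href)
            def title;     driver.title;     end   # page title
            def body;      driver.body;      end   # <body> element (responds to #text and #html)
            def close;     driver.close;     end   # quit the underlying browser
        end
    end
end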
data/lib/scraper.rb
CHANGED
@@ -1,31 +1,21 @@
 module BlackStack
     module Bots
-        class Scraper
-            attr_accessor :domain, :links
+        class Scraper
+            attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
             # auxiliar array of links that I have extracted links from
             attr_accessor :links_processed
 
-            def initialize(init_domain, h)
-                super(h)
+            def initialize(init_domain, timeout, h)
                 self.domain = init_domain
+                self.timeout = timeout || 10
+                self.load_wait_time = 3
+                self.stop_scraping_at_page_number = 25
+                self.stop_scraping_at_match_number = 1
                 #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
                 self.links = []
                 self.links_processed = []
             end # def initialize
 
-            def get(url)
-                # initialize mechanize agent
-                self.agent = Mechanize.new
-                # set a proxy with user and password
-                self.port_index += 1
-                self.port_index = 0 if self.port_index >= self.ports.length
-                self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
-                self.agent.open_timeout = 5
-                self.agent.read_timeout = 5
-                # return
-                return Timeout::timeout(5) { self.agent.get(url) }
-            end
-
             def get_links_from_sitemap(l=nil)
                 i = 0
                 l.logs "Scrape sitemaps... "
@@ -33,17 +23,17 @@ module BlackStack
                     # download the robots.txt
                     url = "http://#{domain}/robots.txt"
                     # get the content of robots.txt from url
-                    s = Timeout::timeout(
+                    s = Timeout::timeout(self.timeout) { URI.open(url).read }
                     # get the sitemap
                     sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
                     sitemaps.each { |b|
-                        parser = Timeout::timeout(
-                        self.links += Timeout::timeout(
+                        parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
+                        self.links += Timeout::timeout(self.timeout) { parser.to_a }
                         self.links.uniq!
                     }
                     l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
                 rescue => e
-                    l.logf "Error: #{e.message.split("\n").first[0..100]}
+                    l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
                 end
             end
 
@@ -51,16 +41,19 @@ module BlackStack
             def get_links_from_url(url, l=nil)
                 l = BlackStack::DummyLogger.new(nil) if l.nil?
                 l.logs "get_links (#{url})... "
+                aux = []
+                browser = nil
                 begin
-                    aux = []
                     # trim url
                     url = url.strip
                     # get domain of the url using open-uri
                     domain = URI.parse(url).host
                     # visit the main page of the website
-
+                    browser = BlackStack::Bots::Browser.new()
+                    browser.goto url
+                    sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
                     # get the self.links to the pages of the website
-                    aux =
+                    aux = browser.links.map(&:href)
                     # remove non-string elements
                     aux = aux.select { |link| link.is_a?(String) }
                     # remove # from the self.links
@@ -80,11 +73,15 @@ module BlackStack
                     aux = aux.select { |link| !self.links.include?(link) }
                     b = aux.size
                     # add new links to self.links
-                    self.links += aux
                     l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+                rescue Net::ReadTimeout => e
+                    l.logf "Timeout Error: #{e.message}".red
                 rescue => e
-                    l.logf "Error: #{e.message.split("\n").first[0..100]}
+                    l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+                ensure
+                    browser.close if browser
                 end
+                self.links += aux
             end # def get_links_from_url
 
             def get_links(stop_at=10, l=nil)
@@ -106,8 +103,9 @@ module BlackStack
                 self.get_links_from_sitemap(l)
             end # def get_links
 
-            def find_keywords(a, stop_at=
+            def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
                 pages = []
+                browser = nil
                 l = BlackStack::DummyLogger.new(nil) if l.nil?
                 # iterate the links
                 j = 0
@@ -117,12 +115,14 @@ module BlackStack
                     l.logs "#{j.to_s}. find_keywords (#{link})... "
                     begin
                         # get the page
-
+                        browser = BlackStack::Bots::Browser.new()
+                        browser.goto link
+                        sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
                         # get page body content in plain text
-                        title =
-                        s =
+                        title = browser.title
+                        s = browser.body.text
                         # add the link to the results of no-keyword
-                        hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' =>
+                        hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => browser.body.html, 'keywords' => [] }
                         pages << hpage
                         # iterate the keywords
                         i = 0
@@ -140,9 +140,14 @@ module BlackStack
                         } # each
                         break if match && stop_on_first_link_found
                         l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+
+                    rescue Net::ReadTimeout => e
+                        l.logf "Timeout Error: #{e.message}".red
                     rescue => e
                         l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
-
+                    ensure
+                        browser.close if browser
+                    end
                 } # each
                 # return
                 pages
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.5
+  version: 1.0.7
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-
+date: 2023-08-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -190,6 +190,26 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: 0.4.0
+- !ruby/object:Gem::Dependency
+  name: watir
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 7.3.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 7.3.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
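The only dependency change is the new runtime requirement on watir. A sketch of the .gemspec declaration that would produce the requirement shown above (the gem's actual .gemspec is not included in this diff, and fields such as summary and files are omitted here):

Gem::Specification.new do |s|
    s.name        = 'bots'
    s.version     = '1.0.7'
    s.authors     = ['Leandro Daniel Sardi']
    s.email       = 'leandro@connectionsphere.com'
    s.description = 'Ruby gem for scraping information from the public web.'
    # new in 1.0.7: Watir for driving a real browser
    s.add_runtime_dependency 'watir', '~> 7.3.0', '>= 7.3.0'
end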