bots 1.0.4 → 1.0.6

Files changed (4)
  1. checksums.yaml +4 -4
  2. data/lib/bots.rb +3 -1
  3. data/lib/scraper.rb +38 -33
  4. metadata +22 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: ef6ff9fa026dd9e1f1c750b5963c957a4deacf5413f92a3dc8fd1f0584305898
- data.tar.gz: e652c088971c2fd9c97c12902d09ce7ed4933da36da6e010f2f3ffae1abdb292
+ metadata.gz: c75697e4698d3a14ac4feeced1a6011febf800b798806ff624d24ee9604e468f
+ data.tar.gz: 842f91d7870719ed98d6cb445de1bb33e4069e1dc3b86969c9a05c7c5cfc0814
  SHA512:
- metadata.gz: 1c8fce3accf4fc84701706355880bd59cc1cd5a16dfbc4a05bbdb30faa492c7f212f4a22ef917b81e7b8ba5f0c6caaa911f23bef1df4f4f23868ab225673adff
- data.tar.gz: 3942240b8cff2bdbfdb003912af8da5c26f6b117a3744becbff6a781f77f9eb32d2c5e0aef1439edc6b288ec0ff359f5e45fec68e21db8d6cca78c8d36cf04dc
+ metadata.gz: c64ba194a4f6bc68662eef99324a56b5036796d2e43576f2d44c308809cc4c567c72df4c5ae4474250cf98d3b366cb6d82c5c0df6b5cb13985a3cf96d63f85bd
+ data.tar.gz: 10625bee7634f62a3c45741d92789e6f599ce13de15d951d772900179abec0392c8a190706853b0077ed9bb9de0fbf522e658eecfae2b8777f0288d98fd637fb
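These digests can be reproduced locally if you want to verify a downloaded package. A minimal sketch in Ruby, assuming the gem has already been fetched (for example with gem fetch bots -v 1.0.6); a .gem file is a tar archive whose metadata.gz and data.tar.gz entries are exactly what checksums.yaml describes:

  # Hypothetical verification script; not part of the gem itself.
  require 'digest'
  require 'rubygems/package'

  File.open('bots-1.0.6.gem', 'rb') do |io|
    Gem::Package::TarReader.new(io).each do |entry|
      next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
      # Compare these hex digests against the SHA256 section above.
      puts "#{entry.full_name}: #{Digest::SHA256.hexdigest(entry.read)}"
    end
  end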
data/lib/bots.rb CHANGED
@@ -7,8 +7,10 @@ require 'csv'
  require 'pry'
  require 'sitemap-parser'
  require 'timeout'
+ require 'watir'
 
  require_relative './base'
  require_relative './google'
  require_relative './scraper'
- require_relative './indeed'
+ require_relative './indeed'
+ require_relative './browser'
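The new require_relative './browser' points at a Watir-backed wrapper that is not included in this diff. A minimal sketch of what BlackStack::Bots::Browser could look like, assuming it simply delegates the handful of calls the scraper uses (goto, links, title, body, close) to Watir::Browser; the class body and driver options here are assumptions, not the gem's actual file:

  # Hypothetical sketch of data/lib/browser.rb; the real file is not shown in this diff.
  require 'watir'

  module BlackStack
    module Bots
      class Browser
        attr_accessor :driver

        def initialize
          # Headless Chrome is an assumption; any Watir-supported driver would do.
          self.driver = Watir::Browser.new(:chrome, options: { args: ['--headless', '--disable-gpu'] })
        end

        def goto(url); driver.goto(url); end
        def links;     driver.links;     end
        def title;     driver.title;     end
        def body;      driver.body;      end
        def close;     driver.close;     end
      end
    end
  end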
data/lib/scraper.rb CHANGED
@@ -1,31 +1,21 @@
  module BlackStack
  module Bots
- class Scraper < BlackStack::Bots::MechanizeBot
- attr_accessor :domain, :links
+ class Scraper
+ attr_accessor :domain, :links, :timeout, :load_wait_time, :stop_scraping_at_page_number, :stop_scraping_at_match_number
  # auxiliar array of links that I have extracted links from
  attr_accessor :links_processed
 
- def initialize(init_domain, h)
- super(h)
+ def initialize(init_domain, timeout, h)
  self.domain = init_domain
+ self.timeout = timeout || 10
+ self.load_wait_time = 3
+ self.stop_scraping_at_page_number = 25
+ self.stop_scraping_at_match_number = 1
  #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  self.links = []
  self.links_processed = []
  end # def initialize
 
- def get(url)
- # initialize mechanize agent
- self.agent = Mechanize.new
- # set a proxy with user and password
- self.port_index += 1
- self.port_index = 0 if self.port_index >= self.ports.length
- self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
- self.agent.open_timeout = 5
- self.agent.read_timeout = 5
- # return
- return Timeout::timeout(5) { self.agent.get(url) }
- end
-
  def get_links_from_sitemap(l=nil)
  i = 0
  l.logs "Scrape sitemaps... "
@@ -33,17 +23,17 @@ module BlackStack
  # download the robots.txt
  url = "http://#{domain}/robots.txt"
  # get the content of robots.txt from url
- s = Timeout::timeout(5) { URI.open(url).read }
+ s = Timeout::timeout(self.timeout) { URI.open(url).read }
  # get the sitemap
  sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
  sitemaps.each { |b|
- parser = Timeout::timeout(5) { SitemapParser.new b }
- self.links += Timeout::timeout(5) { parser.to_a }
+ parser = Timeout::timeout(self.timeout) { SitemapParser.new b }
+ self.links += Timeout::timeout(self.timeout) { parser.to_a }
  self.links.uniq!
  }
  l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
  rescue => e
- l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+ l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
  end
  end
 
@@ -51,16 +41,19 @@ module BlackStack
  def get_links_from_url(url, l=nil)
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  l.logs "get_links (#{url})... "
+ aux = []
+ browser = nil
  begin
- aux = []
  # trim url
  url = url.strip
  # get domain of the url using open-uri
  domain = URI.parse(url).host
  # visit the main page of the website
- page = self.get(url)
+ browser = BlackStack::Bots::Browser.new()
+ browser.goto url
+ sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
  # get the self.links to the pages of the website
- aux = page.links.map(&:href)
+ aux = browser.links.map(&:href)
  # remove non-string elements
  aux = aux.select { |link| link.is_a?(String) }
  # remove # from the self.links
@@ -80,11 +73,15 @@ module BlackStack
  aux = aux.select { |link| !self.links.include?(link) }
  b = aux.size
  # add new links to self.links
- self.links += aux
  l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+ rescue Net::ReadTimeout => e
+ l.logf "Timeout Error: #{e.message}".red
  rescue => e
- l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+ l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+ ensure
+ browser.close if browser
  end
+ self.links += aux
  end # def get_links_from_url
 
  def get_links(stop_at=10, l=nil)
@@ -106,8 +103,9 @@ module BlackStack
  self.get_links_from_sitemap(l)
  end # def get_links
 
- def find_keywords(a, stop_at=50, stop_on_first_link_found=false, l=nil)
+ def find_keywords(a, stop_at=25, stop_on_first_link_found=false, l=nil)
  pages = []
+ browser = nil
  l = BlackStack::DummyLogger.new(nil) if l.nil?
  # iterate the links
  j = 0
@@ -117,12 +115,14 @@ module BlackStack
  l.logs "#{j.to_s}. find_keywords (#{link})... "
  begin
  # get the page
- page = self.get(link)
+ browser = BlackStack::Bots::Browser.new()
+ browser.goto link
+ sleep(self.load_wait_time) # wait 10 seconds for javascript content to load
  # get page body content in plain text
- title = page.title
- s = Timeout::timeout(5) { page.search('body').text }
+ title = browser.title
+ s = browser.body.text
  # add the link to the results of no-keyword
- hpage = { :url => link, :title => title, :html => page.body, :keywords => [] }
+ hpage = { 'page_url' => link.downcase, 'page_title' => title, 'page_html' => browser.body.html, 'keywords' => [] }
  pages << hpage
  # iterate the keywords
  i = 0
@@ -130,7 +130,7 @@ module BlackStack
  a.each { |k|
  # find the keyword
  match = ( s =~ /#{Regexp.escape(k)}/i )
- hpage[:keywords] << k if match
+ hpage['keywords'] << k if match
  # count the number of links with match
  # break if only 1 link is needed
  if match
@@ -140,9 +140,14 @@ module BlackStack
  } # each
  break if match && stop_on_first_link_found
  l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+
+ rescue Net::ReadTimeout => e
+ l.logf "Timeout Error: #{e.message}".red
  rescue => e
  l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
- end # begin
+ ensure
+ browser.close if browser
+ end
  } # each
  # return
  pages
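Taken together, these scraper.rb changes drop the Mechanize-based get helper in favor of a BlackStack::Bots::Browser that is opened and closed around every page visit, turn the request timeout and crawl limits into instance attributes, and move self.links += aux after the begin/rescue/ensure block so links collected before a failure are still kept. A rough usage sketch against the new constructor signature (the domain, timeout, and keyword values are illustrative, not documented defaults):

  # Illustrative only; assumes the gem is loaded with require 'bots'.
  require 'bots'

  scraper = BlackStack::Bots::Scraper.new('example.com', 10, {})  # domain, timeout in seconds, options hash
  scraper.get_links(10)                                           # crawl pages and sitemaps, collecting links
  pages = scraper.find_keywords(['hiring', 'careers'])            # scan the collected links for keywords
  pages.each { |p| puts "#{p['page_url']} -> #{p['keywords'].join(', ')}" }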
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: bots
  version: !ruby/object:Gem::Version
- version: 1.0.4
+ version: 1.0.6
  platform: ruby
  authors:
  - Leandro Daniel Sardi
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2023-08-18 00:00:00.000000000 Z
+ date: 2023-08-24 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: simple_cloud_logging
@@ -190,6 +190,26 @@ dependencies:
  - - ">="
  - !ruby/object:Gem::Version
  version: 0.4.0
+ - !ruby/object:Gem::Dependency
+ name: watir
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 7.3.0
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 7.3.0
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: 7.3.0
+ - - ">="
+ - !ruby/object:Gem::Version
+ version: 7.3.0
  description: Ruby gem for scraping information from the public web.
  email: leandro@connectionsphere.com
  executables: []
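The dependency block added above is what RubyGems generates for a gemspec declaration along these lines; the gemspec itself is not part of this diff, so this is a reconstruction rather than the author's file:

  # Hypothetical excerpt of bots.gemspec, reconstructed from the metadata diff.
  Gem::Specification.new do |s|
    s.name    = 'bots'
    s.version = '1.0.6'
    s.add_runtime_dependency 'watir', '~> 7.3.0', '>= 7.3.0'
  end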