bots 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5)
  1. checksums.yaml +4 -4
  2. data/lib/base.rb +13 -5
  3. data/lib/bots.rb +5 -0
  4. data/lib/scraper.rb +145 -0
  5. metadata +83 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: be4c3337063af2d514fd1016c94d66385fa84dbde27dbfa5174d5b0908c9efb1
-  data.tar.gz: 13ef3dccd336269fdb1960f243c8b7dd08abf782ba89cfbf935c4e802a29ca45
+  metadata.gz: 3a8018e0d8575a415699c41dcba236e3c4f400e8132111093e421ac02e792548
+  data.tar.gz: 4b876044081e94743d1b719c53424331d44bb200a8bdcfddd7d78562209eeed3
 SHA512:
-  metadata.gz: 354704a255a80def04f1993d3a73d5b358fedfeeda387278f47ab995ed10d2afced94fe175a15bc8dd7c55b1209798598b55a5f2d2071424f11613091cd87813
-  data.tar.gz: c1fae508d2b55269f0039524ee4e87714d7cc7244922faa83ac5b6c374abc29cdaab34355059ff92e5d6a0469be44900ac628fcfe55da80def3993549a2bcf27
+  metadata.gz: 326e82a582132f2d267e906df73aad0d812f9fc4fe00c8af2ba9ef6cd93a174ae2004719ec76f5cc6b018da0eda1b2cd891dd51f7df7adaad6044da773f207ec
+  data.tar.gz: 2a4944c21854faee39f81b81004fb63baf3f270b2374f081b98ca25b4f7695af80184b1449dff38207d4c21b7f34b524d59c997931f2db2743df42c56a714875
data/lib/base.rb CHANGED
@@ -9,11 +9,13 @@ module BlackStack
 
       def initialize(h)
         # array of numbers from 4000 to 4249
-        unless h[:proxy].nil?
-          self.ip = h[:proxy][:ip]
-          self.user = h[:proxy][:user]
-          self.password = h[:proxy][:password]
-          self.ports = (h[:proxy][:port_from]..h[:proxy][:port_to]).to_a
+        if h
+          self.ip = h[:ip]
+          self.user = h[:user]
+          self.password = h[:password]
+          self.ports = (h[:port_from]..h[:port_to]).to_a
+        else
+          self.ports = []
         end
         self.port_index = -1
       end # initialize
@@ -26,10 +28,16 @@ module BlackStack
 
     class MechanizeBot < BlackStack::Bots::Bot
       attr_accessor :agent # mechanize agent
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
     class SeleniumBot < BlackStack::Bots::Bot
       attr_accessor :driver # selenium driver
+      def initialize(h)
+        super(h)
+      end
     end # MechanizeBot
 
   end # Bots
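
For reference, a minimal sketch of how this constructor change affects callers (the proxy values below are hypothetical): 1.0.1 expected the settings nested under :proxy, while 1.0.3 reads the same keys from the top-level hash and accepts nil to run without a proxy.

# 1.0.1: proxy settings nested under :proxy
bot = BlackStack::Bots::MechanizeBot.new(
  proxy: { ip: '127.0.0.1', user: 'user', password: 'pass', port_from: 4000, port_to: 4249 }
)

# 1.0.3: same keys at the top level of the hash ...
bot = BlackStack::Bots::MechanizeBot.new(
  ip: '127.0.0.1', user: 'user', password: 'pass', port_from: 4000, port_to: 4249
)

# ... or nil, in which case ports is initialized to []
bot = BlackStack::Bots::MechanizeBot.new(nil)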
data/lib/bots.rb CHANGED
@@ -1,9 +1,14 @@
+require 'open-uri'
 require 'mechanize'
 require 'selenium-webdriver'
 require 'simple_cloud_logging'
 require 'colorize'
 require 'csv'
+require 'pry'
+require 'sitemap-parser'
+require 'timeout'
 
 require_relative './base'
 require_relative './google'
+require_relative './scraper'
 require_relative './indeed'
data/lib/scraper.rb ADDED
@@ -0,0 +1,145 @@
+module BlackStack
+  module Bots
+    class Scraper < BlackStack::Bots::MechanizeBot
+      attr_accessor :domain, :links
+      # auxiliar array of links that I have extracted links from
+      attr_accessor :links_processed
+
+      def initialize(init_domain, h)
+        super(h)
+        self.domain = init_domain
+        #self.agent.agent.http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        self.links = []
+        self.links_processed = []
+      end # def initialize
+
+      def get(url)
+        # initialize mechanize agent
+        self.agent = Mechanize.new
+        # set a proxy with user and password
+        self.port_index += 1
+        self.port_index = 0 if self.port_index >= self.ports.length
+        self.agent.set_proxy(self.ip, self.ports[self.port_index], self.user, self.password) if self.proxy?
+        self.agent.open_timeout = 5
+        self.agent.read_timeout = 5
+        # return
+        return Timeout::timeout(5) { self.agent.get(url) }
+      end
+
+      def get_links_from_sitemap(l=nil)
+        i = 0
+        l.logs "Scrape sitemaps... "
+        begin
+          # download the robots.txt
+          url = "http://#{domain}/robots.txt"
+          # get the content of robots.txt from url
+          s = Timeout::timeout(5) { URI.open(url).read }
+          # get the sitemap
+          sitemaps = s.split("\n").select { |line| line =~ /^sitemap:/i }.map { |a| a.downcase.split('sitemap:').last.strip }.uniq
+          sitemaps.each { |b|
+            parser = Timeout::timeout(5) { SitemapParser.new b }
+            self.links += Timeout::timeout(5) { parser.to_a }
+            self.links.uniq!
+          }
+          l.logf sitemaps.size == 0 ? 'no sitemap found'.yellow : "#{sitemaps.size} sitemaps found".green # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end
+
+      # internal use only
+      def get_links_from_url(url, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        l.logs "get_links (#{url})... "
+        begin
+          aux = []
+          # trim url
+          url = url.strip
+          # get domain of the url using open-uri
+          domain = URI.parse(url).host
+          # visit the main page of the website
+          page = self.get(url)
+          # get the self.links to the pages of the website
+          aux = page.links.map(&:href)
+          # remove non-string elements
+          aux = aux.select { |link| link.is_a?(String) }
+          # remove # from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('#').first }
+          # remove querystring from the self.links
+          aux = aux.map { |link| !link.nil? && link.split('?').first }
+          # remove the self.links that are not http:// or https://
+          aux = aux.select { |link| !link.nil? && link =~ /^https?:\/\// }
+          # remove the self.links that are not from the same domain
+          aux = aux.select { |link| !link.nil? && link =~ /#{domain}/ }
+          # remove nil values
+          aux = aux.compact
+          # remove duplications
+          aux = aux.uniq
+          # filter links who already are in the list
+          a = aux.size
+          aux = aux.select { |link| !self.links.include?(link) }
+          b = aux.size
+          # add new links to self.links
+          self.links += aux
+          l.logf "done".green + " (#{a} links found, #{b} new, #{self.links.size} total)" # get_links
+        rescue => e
+          l.logf "Error: #{e.message.split("\n").first[0..100]})".red # get_links
+        end
+      end # def get_links_from_url
+
+      def get_links(stop_at=10, l=nil)
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # working with root url
+        url = "http://#{self.domain}/"
+        self.links << url if self.links.select { |link| link == url }.empty?
+        # iterate until I have discovered all the links
+        while self.links.size != self.links_processed.size && stop_at >= self.links.size
+          # iterate the links who are not in links_processed
+          self.links.select { |link| !self.links_processed.include?(link) }.each { |link|
+            # get the links from the url
+            self.get_links_from_url(link, l)
+            # add the link to the list of processed links
+            self.links_processed << link
+          }
+        end # while
+        # get links from the sitemap
+        self.get_links_from_sitemap(l)
+      end # def get_links
+
+      def find_keywords(a, stop_at=50, l=nil)
+        ret = []
+        l = BlackStack::DummyLogger.new(nil) if l.nil?
+        # iterate the links
+        j = 0
+        self.links.reject { |link| link =~ /\.pdf$/i || link =~ /\.jpg$/i || link =~ /\.jpeg$/i || link =~ /\.gif$/i }.each { |link|
+          j += 1
+          break if j > stop_at
+          l.logs "#{j.to_s}. find_keywords (#{link})... "
+          begin
+            # get the page
+            page = self.get(link)
+            # get page body content in plain text
+            s = Timeout::timeout(5) { Nokogiri::HTML(page.body).text }
+            # iterate the keywords
+            i = 0
+            a.each { |k|
+              # find the keyword
+              if s =~ /#{Regexp.escape(k)}/i
+                i += 1
+                ret << link if ret.select { |link| link == link }.empty?
+                break
+              end # if
+            } # each
+            break if ret.size > 0
+            l.logf i == 0 ? 'no keywords found'.yellow : "#{i} keywords found".green # find_keywords
+          rescue => e
+            l.logf "Error: #{e.message.split("\n").first[0..100]}".red # get_links
+          end # begin
+        } # each
+        # return
+        ret
+      end
+
+    end # class Scraper
+  end # module Bots
+end # module BlackStack
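
A minimal usage sketch of the new Scraper class, inferred from the code above (the domain, proxy settings, and keyword list are hypothetical; passing nil as the logger makes get_links and find_keywords fall back to BlackStack::DummyLogger):

require 'bots'

# crawl a site through a rotating proxy and search its pages for keywords
scraper = BlackStack::Bots::Scraper.new(
  'example.com',
  { ip: '127.0.0.1', user: 'user', password: 'pass', port_from: 4000, port_to: 4249 }
)
scraper.get_links(10)   # discover internal links (capped at ~10) plus sitemap entries
hits = scraper.find_keywords(['ruby', 'scraping'], 50)
puts hits               # at most one link, since the loop stops at the first match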
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: bots
 version: !ruby/object:Gem::Version
-  version: 1.0.1
+  version: 1.0.3
 platform: ruby
 authors:
 - Leandro Daniel Sardi
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-08-08 00:00:00.000000000 Z
+date: 2023-08-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: simple_cloud_logging
@@ -110,6 +110,86 @@ dependencies:
   - - ">="
     - !ruby/object:Gem::Version
       version: 0.8.1
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.14.2
+- !ruby/object:Gem::Dependency
+  name: open-uri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.2.0
+- !ruby/object:Gem::Dependency
+  name: sitemap-parser
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.5.6
+- !ruby/object:Gem::Dependency
+  name: timeout
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: 0.4.0
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.4.0
 description: Ruby gem for scraping information from the public web.
 email: leandro@connectionsphere.com
 executables: []
@@ -120,6 +200,7 @@ files:
 - lib/bots.rb
 - lib/google.rb
 - lib/indeed.rb
+- lib/scraper.rb
 homepage: https://rubygems.org/gems/bots
 licenses:
 - MIT
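
The four new runtime dependencies above correspond to gemspec declarations along these lines (the bots.gemspec file itself is not part of this diff, so this is a sketch reconstructed from the metadata):

spec.add_runtime_dependency 'pry', '~> 0.14.2', '>= 0.14.2'
spec.add_runtime_dependency 'open-uri', '~> 0.2.0', '>= 0.2.0'
spec.add_runtime_dependency 'sitemap-parser', '~> 0.5.6', '>= 0.5.6'
spec.add_runtime_dependency 'timeout', '~> 0.4.0', '>= 0.4.0'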